In [41]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Load the grading dataset from the CSV file in the working directory,
# then preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="Students_Grading_Dataset.csv")
data.head(n=5)

Unnamed: 0,Student_ID,First_Name,Last_Name,Email,Gender,Age,Department,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Total_Score,Grade,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Parent_Education_Level,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night
0,S1000,Omar,Williams,student0@university.com,Female,22,Engineering,52.29,55.03,57.82,84.22,74.06,3.99,85.9,56.09,F,6.2,No,Yes,High School,Medium,5,4.7
1,S1001,Maria,Brown,student1@university.com,Male,18,Engineering,97.27,97.23,45.8,,94.24,8.32,55.65,50.64,A,19.0,No,Yes,,Medium,4,9.0
2,S1002,Ahmed,Jones,student2@university.com,Male,24,Business,57.19,67.05,93.68,67.7,85.7,5.05,73.79,70.3,D,20.7,No,Yes,Master's,Low,6,6.2
3,S1003,Omar,Williams,student3@university.com,Female,24,Mathematics,95.15,47.79,80.63,66.06,93.51,6.54,92.12,61.63,A,24.8,Yes,Yes,High School,High,3,6.7
4,S1004,John,Smith,student4@university.com,Female,23,CS,54.18,46.59,78.89,96.85,83.7,5.97,68.42,66.13,F,15.4,Yes,Yes,High School,High,2,7.1


In [42]:
# Drop unnecessary columns: the student ID, name and email fields are removed
# from the frame before any further processing.
data = data.drop(
    ["Student_ID", "First_Name", "Last_Name", "Email"],
    axis="columns",
)

In [43]:
# Check for missing values (counts are taken BEFORE any imputation below)
print(data.isnull().sum())


def _fill_with_group_mean(frame, column, by):
    """Return ``frame[column]`` with NaNs replaced by the mean of the row's ``by`` group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding both ``column`` and ``by``.
    column : str
        Numeric column whose missing values are filled.
    by : str
        Column whose values define the groups used for the mean.

    Returns
    -------
    pandas.Series
        A new Series aligned to ``frame.index``; observed values are untouched.
        Rows whose whole group is missing (group mean is NaN) fall back to the
        overall column mean so that no gap is silently left behind.
    """
    group_mean = frame.groupby(by)[column].transform("mean")
    return frame[column].fillna(group_mean).fillna(frame[column].mean())


# Fill missing values with the mean of the column for same stress level value
for column in ["Attendance (%)", "Assignments_Avg"]:
    data[column] = _fill_with_group_mean(data, column, "Stress_Level (1-10)")

# Fill missing Parent_Education_Level with a new category "Unknown"
data["Parent_Education_Level"] = data["Parent_Education_Level"].fillna("Unknown")


Gender                           0
Age                              0
Department                       0
Attendance (%)                 516
Midterm_Score                    0
Final_Score                      0
Assignments_Avg                517
Quizzes_Avg                      0
Participation_Score              0
Projects_Score                   0
Total_Score                      0
Grade                            0
Study_Hours_per_Week             0
Extracurricular_Activities       0
Internet_Access_at_Home          0
Parent_Education_Level        1794
Family_Income_Level              0
Stress_Level (1-10)              0
Sleep_Hours_per_Night            0
dtype: int64


In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Define the columns to be encoded
# Each ordinal column maps to its categories listed in the order they should be
# encoded (first entry -> 0, second -> 1, ...).
ordinalEncodingColumns = {
    "Grade": ['F', 'D', 'C', 'B', 'A'],
    # "Unknown" is the placeholder filled in for missing values earlier in the
    # notebook. NOTE(review): ranking it between 'High School' and "Bachelor's"
    # is a modelling choice - confirm it is intended.
    "Parent_Education_Level": ['None', 'High School', 'Unknown', "Bachelor's", "Master's", 'PhD'],
    "Family_Income_Level": ['Low', 'Medium', 'High']
}
booleanColumns = ["Extracurricular_Activities", "Internet_Access_at_Home"]

# Create the column transformer
# - Gender: one-hot with the first category dropped (single indicator column)
# - Department: one-hot keeping every category
# - ordinal columns: integer-coded using the explicit category orders above
# - every other column is passed through unchanged
ct = ColumnTransformer([
    ("onehot_gender", OneHotEncoder(drop="first"), ["Gender"]),
    ("onehot_department", OneHotEncoder(drop=None), ["Department"]),
    ("ordinal", OrdinalEncoder(categories=list(ordinalEncodingColumns.values())), list(ordinalEncodingColumns.keys())),
], remainder='passthrough')

# Convert Extracurricular_Activities and Internet_Access_at_Home to boolean
# ("Yes" -> True, anything else -> False). Columns that are already boolean are
# skipped so that re-running this cell does not compare True/False against
# "Yes" and silently reset every flag to False.
for column in booleanColumns:
    if not pd.api.types.is_bool_dtype(data[column]):
        data[column] = data[column] == "Yes"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,22,52.29,55.03,57.82,84.22,74.06,3.99,85.9,56.09,6.2,False,True,5,4.7
1,1.0,0.0,0.0,1.0,0.0,4.0,2.0,1.0,18,97.27,97.23,45.8,74.121996,94.24,8.32,55.65,50.64,19.0,False,True,4,9.0
2,1.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0,24,57.19,67.05,93.68,67.7,85.7,5.05,73.79,70.3,20.7,False,True,6,6.2
3,0.0,0.0,0.0,0.0,1.0,4.0,1.0,2.0,24,95.15,47.79,80.63,66.06,93.51,6.54,92.12,61.63,24.8,True,True,3,6.7
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,23,54.18,46.59,78.89,96.85,83.7,5.97,68.42,66.13,15.4,True,True,2,7.1
