In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


In [None]:
# Load the raw student-performance table from the working directory.
df = pd.read_csv('student_performance.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the cell output.
df.info()  # Check data types and missing values


In [None]:
# Per-column count of missing entries (isna is the same check as isnull).
print(df.isna().sum())


In [None]:
# Numeric feature columns; missing entries are replaced by the column mean.
# NOTE(review): the means are learned from the whole frame, i.e. before the
# train/test split performed later in this notebook, so rows that end up in
# the test set contribute to the imputation statistics — confirm this is
# acceptable for the downstream evaluation.
num_cols = ['StudyHours', 'Attendance', 'PreviousGrades']
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(df[num_cols])
df[num_cols] = imputer_num.transform(df[num_cols])


In [None]:
# Categorical columns; missing entries are replaced by the column's most
# frequent value.
# NOTE(review): as with the numeric imputer, the modes are learned from the
# full frame before the later train/test split — confirm this is intended.
cat_cols = ['ParentalEducation', 'ExtracurricularActivities']
imputer_cat = SimpleImputer(strategy='most_frequent').fit(df[cat_cols])
df[cat_cols] = imputer_cat.transform(df[cat_cols])


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the indicator columns are not redundant.
# Note: this rebinds `df`, so re-running the cell without re-running the
# cells above fails because the original categorical columns are gone.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)


In [None]:
# Separate the target column from the remaining feature columns.
y = df['FinalGrade']
X = df.drop(columns='FinalGrade')


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')


In [None]:
# Standardize the numeric columns. The scaler is fitted on the training rows
# only, and those same statistics are then applied to both splits so no
# test-set information enters the scaling parameters.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [None]:
# Preview the first rows of the processed training features and target.
print(X_train.head(), y_train.head(), sep='\n')


In [None]:
# Re-attach the target to a copy of the training features (assign returns a
# new frame and aligns y_train on the shared row index), then write it out
# without the index column.
# NOTE(review): only the training split is persisted here; X_test / y_test
# are not written anywhere in the visible cells — confirm that is intended.
train_data = X_train.assign(FinalGrade=y_train)
train_data.to_csv('student_performance_preprocessed_train.csv', index=False)
