In [None]:
# Students Grading Dataset Preprocessing Notebook
# Steps: Cleaning, Integration, Reduction, Transformation, Discretization

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# 0) Load dataset
# The CSV is read from a path relative to the notebook's working directory;
# download 'StudentsPerformance.csv' from Kaggle and save it alongside this notebook.
DATA_PATH = 'StudentsPerformance.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# 1) Data Cleaning
# Removes exact duplicate rows and mean-imputes missing values in the three
# numeric score columns. Other columns are not imputed here, so the
# "after" report can still show missing values outside the score columns.
score_cols = ['math score', 'reading score', 'writing score']

print('Missing values before:', df.isnull().sum(), '\n')

# drop_duplicates() keeps the surviving rows' original index labels, leaving gaps.
# Resetting to a contiguous 0..n-1 index keeps later steps that combine frames
# by index (e.g. column-wise concatenation) correctly aligned.
df = df.drop_duplicates().reset_index(drop=True)

imputer = SimpleImputer(strategy='mean')
# fit_transform returns a plain array; it is written back positionally into the same columns.
df[score_cols] = imputer.fit_transform(df[score_cols])

print('Missing values after:', df.isnull().sum())

In [None]:
# 2) Data Integration
# Joins a small lookup table onto the main frame to add a numeric
# 'prep_binary' flag (0 = 'none', 1 = 'completed') for the
# 'test preparation course' column.
mapping_df = pd.DataFrame({
    'test preparation course': ['none', 'completed'],
    'prep_binary': [0, 1]
})

# Drop any 'prep_binary' left over from a previous run of this cell; otherwise
# the merge would produce 'prep_binary_x' / 'prep_binary_y' instead of overwriting.
# validate='many_to_one' makes pandas raise if the lookup keys are ever not unique,
# which would silently duplicate rows of df.
df = (
    df.drop(columns='prep_binary', errors='ignore')
      .merge(mapping_df, on='test preparation course', how='left', validate='many_to_one')
)

# A left merge leaves NaN for any category missing from the lookup table.
# Surface that here rather than letting it propagate silently downstream.
unmapped_count = int(df['prep_binary'].isna().sum())
if unmapped_count:
    print(f"Warning: {unmapped_count} row(s) have a 'test preparation course' value "
          "with no entry in mapping_df; 'prep_binary' is NaN for them.")

df.head()

In [None]:
# 3) Data Reduction (PCA)
# Projects the three score columns plus 'prep_binary' onto two principal
# components and stores them in df as 'PC1' and 'PC2'.
# NOTE(review): PCA is fit on the raw, unscaled features. If the scores are on a
# much larger scale than the 0/1 'prep_binary' flag, that flag will contribute
# almost nothing to the components — consider scaling first; confirm intent.
features = ['math score','reading score','writing score','prep_binary']
pca = PCA(n_components=2)
pc = pca.fit_transform(df[features])

# Build the component frame on df's own index so the two stay row-aligned
# even when df's index is not a clean 0..n-1 range.
pc_df = pd.DataFrame(pc, columns=['PC1','PC2'], index=df.index)

# Assign the array positionally instead of concatenating: this cannot misalign
# rows, and re-running the cell overwrites PC1/PC2 rather than appending
# duplicate columns.
df[['PC1','PC2']] = pc

print('Explained variance ratio:', pca.explained_variance_ratio_)

In [None]:
# 4) Data Transformation (Normalization)
# Min-max scale each raw score column and store the result in a parallel
# '*_norm' column; the raw columns are left untouched.
raw_to_norm = {
    'math score': 'math_score_norm',
    'reading score': 'reading_score_norm',
    'writing score': 'writing_score_norm',
}
scaler = MinMaxScaler()
scaled_scores = scaler.fit_transform(df[list(raw_to_norm.keys())])
# The scaled array's columns follow the order of the mapping's keys, so the
# target names (the mapping's values) line up one-to-one.
df[list(raw_to_norm.values())] = scaled_scores
df.head()

In [None]:
# 5) Data Discretization
# Sums the three scores into 'total_score' and bins it into a letter grade.
df['total_score'] = df['math score'] + df['reading score'] + df['writing score']

# Grade boundaries on the total score:
#   C: [0, 180]   B: (180, 240]   A: (240, 300]
grade_bins = [0, 180, 240, 300]
grade_labels = ['C', 'B', 'A']

# include_lowest=True closes the first interval on the left; without it a
# total_score of exactly 0 falls outside every bin and gets a NaN grade.
# Totals outside 0..300 (or NaN totals) still yield NaN.
df['grade'] = pd.cut(
    df['total_score'],
    bins=grade_bins,
    labels=grade_labels,
    include_lowest=True,
)
df.head()