In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [3]:
# Read the source table; fields in this CSV are separated by semicolons.
raw_csv_path = "student-mat.csv"
df = pd.read_csv(raw_csv_path, sep=';')


In [4]:
# Label is 1 where the G3 grade is at least 15, and 0 otherwise.
meets_threshold = df['G3'].ge(15)
df['G3_binary'] = meets_threshold.astype(int)

# Remove the raw grade columns: G3 is now encoded in the label, and G1/G2
# are dropped alongside it so no raw grade remains among the features.
raw_grade_columns = ['G1', 'G2', 'G3']
df = df.drop(columns=raw_grade_columns)


In [5]:
# Name of the label column.
target = 'G3_binary'

# Columns that are routed through the categorical (impute + one-hot) branch.
categorical_features = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic',
]

# Every remaining column, in its existing order, is treated as numeric.
# NOTE(review): membership is decided by exclusion only; no dtype check is made here.
non_numeric_columns = categorical_features + [target]
numeric_features = df.columns.drop(non_numeric_columns).tolist()


In [6]:
# Step 1: hold out 30% of the rows, stratified on the label; the rest is the training set.
train_df, holdout_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df[target],
    random_state=42,
)

# Step 2: split the holdout in half (again stratified) into validation and test sets.
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    stratify=holdout_df[target],
    random_state=42,
)


In [7]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode, omitting the first category of every feature.
# NOTE(review): per the scikit-learn OneHotEncoder docs, when `drop` is set and
# handle_unknown='ignore', a category unseen during fit is encoded as all
# zeros, i.e. the same encoding as the dropped category. Confirm this is intended.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column list to its branch. The branch names ('num', 'cat') and the
# step name 'onehot' are looked up again later to recover the output column names.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)


In [8]:
def _features_and_label(frame):
    """Return `(features, label)` for `frame`: every column except `target`, and `target` itself."""
    return frame.drop(columns=target), frame[target]


X_train, y_train = _features_and_label(train_df)
X_val, y_val = _features_and_label(val_df)
X_test, y_test = _features_and_label(test_df)

# Fit the preprocessing on the training features only; the validation and
# test features are transformed with those already-fitted parameters.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc, X_test_proc = (preprocessor.transform(part) for part in (X_val, X_test))


In [9]:
def _as_dense_frame(matrix, columns):
    """Wrap a transformed matrix in a DataFrame, densifying it first if it exposes `toarray`."""
    dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    return pd.DataFrame(dense, columns=columns)


# Names of the one-hot columns, read from the fitted encoder inside the 'cat' branch.
fitted_encoder = preprocessor.named_transformers_['cat']['onehot']
cat_cols = fitted_encoder.get_feature_names_out(categorical_features)

# Numeric names first, then one-hot names: the same order in which the
# 'num' and 'cat' branches are listed in the ColumnTransformer.
processed_cols = np.concatenate([numeric_features, cat_cols])

X_train_df = _as_dense_frame(X_train_proc, processed_cols)
X_val_df = _as_dense_frame(X_val_proc, processed_cols)
X_test_df = _as_dense_frame(X_test_proc, processed_cols)


In [10]:
# Put the label back as the last column of each processed frame. The labels'
# index is reset to 0..n-1 first so that the assignment lines up row-by-row
# with the freshly built frames instead of aligning on the original row labels.
for frame, labels in (
    (X_train_df, y_train),
    (X_val_df, y_val),
    (X_test_df, y_test),
):
    frame[target] = labels.reset_index(drop=True)


In [11]:
# Create the output directory if it does not exist yet.
os.makedirs("processed_data", exist_ok=True)

# Write each split to its own CSV file, without the index column.
outputs = (
    ("processed_data/train.csv", X_train_df),
    ("processed_data/val.csv", X_val_df),
    ("processed_data/test.csv", X_test_df),
)
for csv_path, split_frame in outputs:
    split_frame.to_csv(csv_path, index=False)

print("✅ Preprocessing complete. Files saved to 'processed_data/'")


✅ Preprocessing complete. Files saved to 'processed_data/'
