In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input data file, given relative to the notebook's working directory.
CSV_PATH = Path("../data/student_data.csv")

# The file uses ';' as its field delimiter rather than the default ','.
df = pd.read_csv(CSV_PATH, delimiter=";")

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [2]:
def to_snake(s: str) -> str:
    """Convert a raw column label into a lowercase, underscore-joined name.

    Steps, applied in this order:
      1. Trim leading and trailing whitespace.
      2. Turn each of ``/ - . ( )`` into a single space.
      3. Drop apostrophes (both the typographic and the ASCII one).
      4. Lowercase the text.
      5. Replace every space with ``_``.

    Consecutive separators are not collapsed, so ``"a (b)"`` becomes
    ``"a__b_"``.

    Args:
        s: The original label.

    Returns:
        The normalized label.
    """
    # One translation pass: five separator characters become spaces and
    # the two apostrophe variants are deleted.
    separators = "/-.()"
    table = str.maketrans(separators, " " * len(separators), "’'")

    trimmed = s.strip()
    spaced = trimmed.translate(table)
    lowered = spaced.lower()
    return lowered.replace(" ", "_")

# Normalize every column label with to_snake, replacing the labels on df.
df.columns = df.columns.map(to_snake)

# Show the resulting labels.
df.columns


Index(['marital_status', 'application_mode', 'application_order', 'course',
       'daytime_evening_attendance', 'previous_qualification',
       'previous_qualification__grade_', 'nacionality',
       'mothers_qualification', 'fathers_qualification', 'mothers_occupation',
       'fathers_occupation', 'admission_grade', 'displaced',
       'educational_special_needs', 'debtor', 'tuition_fees_up_to_date',
       'gender', 'scholarship_holder', 'age_at_enrollment', 'international',
       'curricular_units_1st_sem__credited_',
       'curricular_units_1st_sem__enrolled_',
       'curricular_units_1st_sem__evaluations_',
       'curricular_units_1st_sem__approved_',
       'curricular_units_1st_sem__grade_',
       'curricular_units_1st_sem__without_evaluations_',
       'curricular_units_2nd_sem__credited_',
       'curricular_units_2nd_sem__enrolled_',
       'curricular_units_2nd_sem__evaluations_',
       'curricular_units_2nd_sem__approved_',
       'curricular_units_2nd_sem__grade_'

In [3]:
# Number of missing entries per column, listed from most to fewest.
df.isnull().sum(axis="index").sort_values(ascending=False)


marital_status                                    0
application_mode                                  0
application_order                                 0
course                                            0
daytime_evening_attendance                        0
previous_qualification                            0
previous_qualification__grade_                    0
nacionality                                       0
mothers_qualification                             0
fathers_qualification                             0
mothers_occupation                                0
fathers_occupation                                0
admission_grade                                   0
displaced                                         0
educational_special_needs                         0
debtor                                            0
tuition_fees_up_to_date                           0
gender                                            0
scholarship_holder                                0
age_at_enrol

In [4]:
# Partition the columns of the full frame by dtype: numeric vs. everything else.
# NOTE: this runs on `df` as a whole, so every column of the frame (not just
# the model features) lands in one of the two lists.
num_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(exclude="number").columns)

# Report the size of each group followed by up to ten of its column names.
for _label, _cols in (("Numerik:", num_cols), ("\nKategorikal:", cat_cols)):
    print(_label, len(_cols))
    print(_cols[:10])


Numerik: 36
['marital_status', 'application_mode', 'application_order', 'course', 'daytime_evening_attendance', 'previous_qualification', 'previous_qualification__grade_', 'nacionality', 'mothers_qualification', 'fathers_qualification']

Kategorikal: 1
['target']


In [5]:
# Name of the label column.
target_col = "target"

# Distinct label values, in order of first appearance.
df.loc[:, target_col].unique()


array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [6]:
# Separate the label (y) from the remaining columns (X).
y = df[target_col]
X = df.drop(target_col, axis=1)


In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Preprocessing transformer for the feature matrix X.
#
# num_cols / cat_cols were derived from the full `df`, so they may still name
# the label column (target_col). X no longer contains that column, and
# ColumnTransformer raises a ValueError at fit time when asked to select a
# column that is missing from its input. Restrict both lists to columns that
# are actually present in X before wiring them into the transformer.
feature_num_cols = [c for c in num_cols if c in X.columns]
feature_cat_cols = [c for c in cat_cols if c in X.columns]

# NOTE(review): every numeric-dtype column is standardized, including ones
# whose names suggest integer category codes (e.g. marital_status, course).
# Confirm that treating them as continuous is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize numeric features.
        ('num', StandardScaler(), feature_num_cols),
        # One-hot encode non-numeric features; categories not seen during
        # fit are ignored at transform time instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), feature_cat_cols),
    ]
)


In [8]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so that re-running the cell gives the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Shapes of the training and test feature matrices.
(X_train.shape, X_test.shape)


((3539, 36), (885, 36))