In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import pickle

# =========================
# 1. Load and clean dataset
# =========================

# The CSV is semicolon-delimited; change the path here if the file lives elsewhere.
df = pd.read_csv("data.csv", sep=";")

# Normalise the headers to snake_case in two passes:
#   1) trim surrounding whitespace and lowercase everything,
#   2) collapse every run of non-alphanumeric characters into a single "_"
#      and drop any "_" left dangling at either end.
clean_names = df.columns.str.strip().str.lower()
clean_names = clean_names.str.replace(r"[^0-9a-zA-Z]+", "_", regex=True).str.strip("_")
df.columns = clean_names

# Quick sanity check of the resulting schema and the first few rows.
print("Columns:", df.columns.tolist())
print(df.head())

Columns: ['marital_status', 'application_mode', 'application_order', 'course', 'daytime_evening_attendance', 'previous_qualification', 'previous_qualification_grade', 'nacionality', 'mother_s_qualification', 'father_s_qualification', 'mother_s_occupation', 'father_s_occupation', 'admission_grade', 'displaced', 'educational_special_needs', 'debtor', 'tuition_fees_up_to_date', 'gender', 'scholarship_holder', 'age_at_enrollment', 'international', 'curricular_units_1st_sem_credited', 'curricular_units_1st_sem_enrolled', 'curricular_units_1st_sem_evaluations', 'curricular_units_1st_sem_approved', 'curricular_units_1st_sem_grade', 'curricular_units_1st_sem_without_evaluations', 'curricular_units_2nd_sem_credited', 'curricular_units_2nd_sem_enrolled', 'curricular_units_2nd_sem_evaluations', 'curricular_units_2nd_sem_approved', 'curricular_units_2nd_sem_grade', 'curricular_units_2nd_sem_without_evaluations', 'unemployment_rate', 'inflation_rate', 'gdp', 'target']
   marital_status  application

In [12]:
# =========================
# 2. Separate features/target
# =========================

# The label lives in a single column; every other column is used as a feature.
target_col = "target"
y = df[target_col]
X = df.drop(columns=target_col)

# Show how many rows fall into each class before splitting.
print("\nTarget class distribution:", y.value_counts(), sep="\n")


Target class distribution:
target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64


In [13]:
# =====================================
# 3. Train / Test split (stratified)
# =====================================

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain size:", X_train.shape, "Test size:", X_test.shape)


Train size: (3539, 36) Test size: (885, 36)


In [14]:
# ==========================================
# 4. Define models (all inside Pipelines)
# ==========================================

# Candidate classifiers keyed by a short name. Each one is wrapped in a
# Pipeline with its own StandardScaler, so the scaling statistics are learned
# only from the data the pipeline is fitted on.
models = {
    "logistic_regression": Pipeline([
        ("scaler", StandardScaler()),
        # `multi_class` is deprecated in recent scikit-learn releases (it emits a
        # FutureWarning and is scheduled for removal). With the default lbfgs
        # solver a multi-class target is already fitted as a multinomial model,
        # so omitting it keeps the same estimator. `n_jobs` is dropped as well:
        # it only parallelises one-vs-rest fits and has no effect here.
        ("clf", LogisticRegression(
            max_iter=1000
        ))
    ]),
    "random_forest": Pipeline([
        ("scaler", StandardScaler()),  # scaling is not necessary for RF but ok
        ("clf", RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            n_jobs=-1
        ))
    ]),
    "gradient_boosting": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", GradientBoostingClassifier(
            random_state=42
        ))
    ]),
    "svc_rbf": Pipeline([
        ("scaler", StandardScaler()),
        # probability=True enables predict_proba on the fitted SVC.
        ("clf", SVC(
            kernel="rbf",
            probability=True,
            random_state=42
        ))
    ])
}


In [15]:
# ==========================================
# 5. Train & evaluate each model
# ==========================================

# One dict of test-set metrics per model; turned into a summary table below.
results = []

# Running winner, chosen by macro-averaged F1 on the test split.
best_model_name, best_model_obj = None, None
best_f1_macro = -1.0

# Options shared by the three macro-averaged metrics.
macro_options = {"average": "macro", "zero_division": 0}

for model_key, pipeline in models.items():
    print("\n======================")
    print(f"Training model: {model_key}")
    print("======================")

    # Fit on the training split, then predict the held-out test split.
    pipeline.fit(X_train, y_train)
    test_pred = pipeline.predict(X_test)

    scores = {
        "accuracy": accuracy_score(y_test, test_pred),
        "precision_macro": precision_score(y_test, test_pred, **macro_options),
        "recall_macro": recall_score(y_test, test_pred, **macro_options),
        "f1_macro": f1_score(y_test, test_pred, **macro_options),
    }

    print(f"Accuracy      : {scores['accuracy']:.4f}")
    print(f"Precision (macro): {scores['precision_macro']:.4f}")
    print(f"Recall (macro)   : {scores['recall_macro']:.4f}")
    print(f"F1-score (macro) : {scores['f1_macro']:.4f}")

    print("\nClassification report:")
    print(classification_report(y_test, test_pred, zero_division=0))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, test_pred))

    results.append({"model": model_key, **scores})

    # Strict ">" means the earliest model wins if two scores tie.
    if scores["f1_macro"] > best_f1_macro:
        best_model_name, best_model_obj, best_f1_macro = (
            model_key,
            pipeline,
            scores["f1_macro"],
        )

# Summary table, best macro F1 first.
results_df = pd.DataFrame(results).sort_values(by="f1_macro", ascending=False)
print("\n======= Summary of models (sorted by F1 macro) =======")
print(results_df)

print(f"\nBest model based on F1 macro: {best_model_name} (F1_macro = {best_f1_macro:.4f})")


Training model: logistic_regression




Accuracy      : 0.7684
Precision (macro): 0.7070
Recall (macro)   : 0.6754
F1-score (macro) : 0.6826

Classification report:
              precision    recall  f1-score   support

     Dropout       0.79      0.77      0.78       284
    Enrolled       0.52      0.33      0.41       159
    Graduate       0.80      0.93      0.86       442

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.68       885
weighted avg       0.75      0.77      0.75       885

Confusion matrix:
[[218  29  37]
 [ 43  53  63]
 [ 14  19 409]]

Training model: random_forest
Accuracy      : 0.7672
Precision (macro): 0.7153
Recall (macro)   : 0.6748
F1-score (macro) : 0.6842

Classification report:
              precision    recall  f1-score   support

     Dropout       0.80      0.76      0.78       284
    Enrolled       0.56      0.34      0.42       159
    Graduate       0.79      0.93      0.85       442

    accuracy                           0.77       885
  

In [16]:
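# The best model is selected by macro-averaged F1, which weights every class
# equally and is therefore a better fit than plain accuracy for a multi-class,
# possibly imbalanced target.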

In [17]:
# =====================================================
# 6. Retrain the best model on the FULL dataset (X, y)
# =====================================================

# The winning pipeline is refitted in place on every row (train + test), so
# `best_model_obj` afterwards holds the model trained on the full dataset.
retrain_message = f"\nRetraining the best model ({best_model_name}) on the full dataset..."
print(retrain_message)
best_model_obj.fit(X, y)


Retraining the best model (gradient_boosting) on the full dataset...


In [18]:
# =================================
# 7. Save the best model using pickle
# =================================

# Serialise the whole fitted pipeline (scaler + classifier) into one file in
# the current working directory.
model_filename = "best_student_dropout_model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(best_model_obj, model_file)

print(f"\nBest model saved to: {model_filename}")


Best model saved to: best_student_dropout_model.pkl


In [19]:
# ================================
# 8. (Optional) Example: load and use
# ================================

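# Example of how you would load this later. Only unpickle files you trust:
# pickle.load can execute arbitrary code while deserialising.
# with open("best_student_dropout_model.pkl", "rb") as f:
#     loaded_model = pickle.load(f)
# new_data_df must contain the same feature columns (with the cleaned names)
# that X had when the model was trained.
# new_pred = loaded_model.predict(new_data_df)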
