In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
import joblib


In [2]:
# ================================
# Load Dataset
# ================================
df = pd.read_csv("student_dropout.csv")

target = "Dropped_Out"
X = df.drop(columns=[target, "School"])
y = df[target]

categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

In [3]:
# ================================
# Preprocessing Pipeline
# ================================
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numerical_cols)
    ]
)

# Temporary model to find important features
rf_tmp = Pipeline([
    ("prep", preprocess),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42))
])

rf_tmp.fit(X, y)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [4]:
# ================================
# Feature Importance Extraction
# ================================
ohe = rf_tmp.named_steps["prep"].named_transformers_["cat"]
feature_names = list(ohe.get_feature_names_out(categorical_cols)) + list(numerical_cols)

importances = rf_tmp.named_steps["model"].feature_importances_

feat_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

top30 = feat_df.head(30)["Feature"].tolist()

# Transform full dataset
X_transformed = rf_tmp.named_steps["prep"].transform(X)
X_transformed = pd.DataFrame(X_transformed, columns=feature_names)

X_selected = X_transformed[top30]

In [5]:
# ================================
# Balanced Training Data
# ================================
sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X_selected, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)

In [6]:
# ================================
# Models to Evaluate
# ================================
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, max_depth=6,
        min_samples_split=10, min_samples_leaf=4, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=5, min_samples_split=10, min_samples_leaf=4, random_state=42
    ),
    "SVM": SVC(probability=True, C=1, kernel="rbf"),
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        learning_rate=0.05, max_depth=4,
        n_estimators=300, subsample=0.8,
        colsample_bytree=0.8, random_state=42
    )
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_model = None
best_acc = 0

print("\n===== Cross Validation Scores =====")
for name, model in models.items():
    scores = cross_val_score(model, X_bal, y_bal, cv=skf, scoring="accuracy")
    print(f"{name}: Mean={scores.mean():.4f}, Std={scores.std():.4f}")


===== Cross Validation Scores =====
LogisticRegression: Mean=0.9891, Std=0.0080
RandomForest: Mean=1.0000, Std=0.0000
DecisionTree: Mean=1.0000, Std=0.0000
SVM: Mean=0.9854, Std=0.0061
XGBoost: Mean=1.0000, Std=0.0000


In [7]:
# ================================
# Test and Compare Models
# ================================
print("\n===== Model Testing =====")
for name, model in models.items():
    print(f"\n>>> Testing {name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))

    if acc > best_acc:
        best_acc = acc
        best_model = model

print("\nBest model:", best_model)


===== Model Testing =====

>>> Testing LogisticRegression
Accuracy: 0.9864
              precision    recall  f1-score   support

       False       1.00      0.97      0.99       110
        True       0.97      1.00      0.99       110

    accuracy                           0.99       220
   macro avg       0.99      0.99      0.99       220
weighted avg       0.99      0.99      0.99       220


>>> Testing RandomForest
Accuracy: 1.0000
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       110
        True       1.00      1.00      1.00       110

    accuracy                           1.00       220
   macro avg       1.00      1.00      1.00       220
weighted avg       1.00      1.00      1.00       220


>>> Testing DecisionTree
Accuracy: 1.0000
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       110
        True       1.00      1.00      1.00       110

    accuracy        

In [8]:
# ================================
# Final Pipeline for Deployment
# ================================
final_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("feature_select", SelectFromModel(
        RandomForestClassifier(n_estimators=300, random_state=42),
        threshold=-np.inf, max_features=30
    )),
    ("classifier", best_model)
])

final_pipeline.fit(X, y)

joblib.dump(final_pipeline, "dropout_predictor_pipeline.pkl")
print("\nModel saved as dropout_predictor_pipeline.pkl")


Model saved as dropout_predictor_pipeline.pkl
