## student-dropout-and-success-prediction

### Train and save model

In [1]:
# import required libraries
import numpy as np
import pandas as pd

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer

In [2]:
# load data
df = pd.read_csv("../data/dataset.csv")

df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('/', '_')
df.rename(columns={'nacionality':'nationality'}, inplace=True)

df['target'] = df['target'].map({
    'Dropout':0,
    'Enrolled':1,
    'Graduate':2
})

In [3]:
features = ['curricular_units_2nd_sem_(approved)',
       'curricular_units_2nd_sem_(grade)',
       'curricular_units_1st_sem_(approved)',
       'curricular_units_1st_sem_(grade)', 'tuition_fees_up_to_date',
       'scholarship_holder', 'age_at_enrollment', 'debtor', 'gender',
       'application_mode', 'curricular_units_2nd_sem_(enrolled)',
       'curricular_units_1st_sem_(enrolled)', 'displaced', 'target']

df = df[features]

In [4]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=13)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=13)

In [5]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [6]:
y_full_train = df_full_train.target.values
y_train = df_train.target.values
y_val = df_val.target.values
y_test = df_test.target.values

In [7]:
# Drop column 'target'
df_full_train.drop('target', axis=1, inplace=True)
df_train.drop('target', axis=1, inplace=True)
df_val.drop('target', axis=1, inplace=True)
df_test.drop('target', axis=1, inplace=True)

In [8]:
# scaling features
scaler = StandardScaler()

df_full_train_scaled = scaler.fit_transform(df_full_train)
df_train_scaled = scaler.fit_transform(df_train)
df_val_scaled = scaler.transform(df_val)  # Only transform, don't fit!
df_test_scaled = scaler.transform(df_test)  # Only transform, don't fit!

In [9]:
#  handling class imbalance - Use class weights
class_weights = compute_class_weight('balanced', 
                                   classes=np.unique(y_train), 
                                   y=y_train)
weight_dict = dict(zip(np.unique(y_train), class_weights))

### Train without cross-validation

In [10]:
# random forest - Train model without cross-validation
model_rf = RandomForestClassifier(n_estimators=200,
                                          max_depth=10,
                                          min_samples_leaf=3,
                                          class_weight=weight_dict,
                                          n_jobs=-1, 
                                          random_state=13)
# Fit the model
model_rf.fit(df_train_scaled, y_train)

# Get predictions and calculate ROC-AUC
y_pred_proba = model_rf.predict_proba(df_val_scaled)

# Handle both binary and multi-class cases
if len(np.unique(y_train)) == 2:
    # Binary classification
    y_pred_proba = y_pred_proba[:, 1]  # Only need probability of positive class
    roc_auc = roc_auc_score(y_val, y_pred_proba)
else:
    # Multi-class classification
    roc_auc = roc_auc_score(y_val, y_pred_proba, multi_class='ovr', average='macro')

print(f"\nROC-AUC score: {roc_auc:.4f}")


ROC-AUC score: 0.8661


### Train with cross-validation

In [11]:
# Train with cross-validation
def custom_roc_auc(y_true, y_pred_proba):
    """Custom ROC-AUC scorer that handles both binary and multi-class cases"""
    if y_pred_proba.ndim == 1:
        return roc_auc_score(y_true, y_pred_proba)
    elif y_pred_proba.shape[1] == 2:
        return roc_auc_score(y_true, y_pred_proba[:, 1])
    else:
        return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro')

# Create a custom scorer
roc_auc_scorer = make_scorer(custom_roc_auc, response_method='predict_proba')

# Perform cross-validation
try:
    scores = cross_val_score(
        model_rf, 
        df_train_scaled, 
        y_train, 
        cv=5, 
        scoring=roc_auc_scorer,
        n_jobs=-1
    )
    
    # Print results with confidence interval
    print("\nCross-validation results:")
    print(f"Mean ROC-AUC: {scores.mean():.4f} ± {scores.std() * 1.96:.4f}")
    print(f"Individual fold scores: {', '.join([f'{score:.4f}' for score in scores])}")
    
except Exception as e:
    print(f"Error during cross-validation: {str(e)}")


Cross-validation results:
Mean ROC-AUC: 0.8763 ± 0.0193
Individual fold scores: 0.8813, 0.8819, 0.8567, 0.8825, 0.8792


# Train final model using train + val

In [12]:
# random forest - Train final model using train + val
model_rf = RandomForestClassifier(n_estimators=200,
                                          max_depth=10,
                                          min_samples_leaf=3,
                                          class_weight=weight_dict,
                                          n_jobs=-1, 
                                          random_state=13)
# Fit the model
model_rf.fit(df_full_train_scaled, y_full_train)

# Get predictions and calculate ROC-AUC
y_pred_proba = model_rf.predict_proba(df_test_scaled)

# Handle both binary and multi-class cases
if len(np.unique(y_train)) == 2:
    # Binary classification
    y_pred_proba = y_pred_proba[:, 1]  # Only need probability of positive class
    roc_auc = roc_auc_score(y_test, y_pred_proba)
else:
    # Multi-class classification
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

print(f"\nROC-AUC score: {roc_auc:.4f}")


ROC-AUC score: 0.8843


### Save model + scaler

In [13]:
output_file = f"model_rf.bin"

In [14]:
output_file

'model_rf.bin'

In [15]:
with open(output_file, 'wb') as f_out: 
    pickle.dump((scaler, model_rf), f_out)

### Load and use model 

In [16]:
input_file = 'model_rf.bin'

In [17]:
with open(input_file, 'rb') as f_in: 
    scaler, model_rf = pickle.load(f_in)

In [18]:
model_rf

In [19]:
# Test with 1 student
student_data = {
    'curricular_units_2nd_sem_(approved)': [8],
    'curricular_units_2nd_sem_(grade)': [14.07125],
    'curricular_units_1st_sem_(approved)': [8],
    'curricular_units_1st_sem_(grade)': [14.07125],
    'tuition_fees_up_to_date': [1],
    'scholarship_holder': [1],
    'age_at_enrollment': [19],
    'debtor': [0],
    'gender': [0],
    'application_mode': [1],
    'curricular_units_2nd_sem_(enrolled)': [8],
    'curricular_units_1st_sem_(enrolled)': [8],
    'displaced': [1]
}

# Create DataFrame
student = pd.DataFrame(student_data)

In [20]:
student.T

Unnamed: 0,0
curricular_units_2nd_sem_(approved),8.0
curricular_units_2nd_sem_(grade),14.07125
curricular_units_1st_sem_(approved),8.0
curricular_units_1st_sem_(grade),14.07125
tuition_fees_up_to_date,1.0
scholarship_holder,1.0
age_at_enrollment,19.0
debtor,0.0
gender,0.0
application_mode,1.0


In [21]:
X = scaler.transform(student)

In [22]:
X

array([[ 1.18032738,  0.73762571,  1.07331942,  0.70378688,  0.37421166,
         1.76125219, -0.57179102, -0.35766157, -0.75034346, -1.09714697,
         0.80134413,  0.70410786,  0.91192544]])

In [23]:
# Get predictions
y_pred = model_rf.predict(X)
y_pred_proba = model_rf.predict_proba(X)

In [24]:
y_pred

array([2])

In [25]:
y_pred_proba

array([[0.02719054, 0.09465016, 0.8781593 ]])

In [26]:
# Define the mapping
mapping = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}

In [27]:
# Map predictions to labels
y_pred_labels = [mapping[pred] for pred in y_pred]

In [28]:
# Print predictions and their probabilities
print("input:", student_data)
print("-" * 50)

for i, (label, proba) in enumerate(zip(y_pred_labels, y_pred_proba)):
    print(f"Student {i+1}: Prediction = {label}, Probabilities = {proba}")
    print("-" * 50)

input: {'curricular_units_2nd_sem_(approved)': [8], 'curricular_units_2nd_sem_(grade)': [14.07125], 'curricular_units_1st_sem_(approved)': [8], 'curricular_units_1st_sem_(grade)': [14.07125], 'tuition_fees_up_to_date': [1], 'scholarship_holder': [1], 'age_at_enrollment': [19], 'debtor': [0], 'gender': [0], 'application_mode': [1], 'curricular_units_2nd_sem_(enrolled)': [8], 'curricular_units_1st_sem_(enrolled)': [8], 'displaced': [1]}
--------------------------------------------------
Student 1: Prediction = Graduate, Probabilities = [0.02719054 0.09465016 0.8781593 ]
--------------------------------------------------
