## student-dropout-and-success-prediction

### Train and save model

In [1]:
# import required libraries
import numpy as np
import pandas as pd

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Read the raw student records (path is relative to the notebook's folder).
df = pd.read_csv("../data/dataset.csv")

# Normalise the headers: lower-case, with spaces and slashes replaced by
# underscores. Then correct the misspelt 'nacionality' header.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('/', '_')
)
df = df.rename(columns={'nacionality': 'nationality'})

# Encode the outcome labels as integer class ids. Any label that is not
# listed here is turned into NaN by `Series.map`.
df = df.assign(
    target=df['target'].map({'Dropout': 0, 'Enrolled': 1, 'Graduate': 2})
)

In [3]:
# Columns kept for modelling, in the order the scaler and the model see them.
# NOTE: despite its name this list also contains the label column 'target';
# the label is split off into y_* and dropped from the frames further down.
features = [
    'curricular_units_2nd_sem_(approved)',
    'curricular_units_2nd_sem_(grade)',
    'curricular_units_1st_sem_(approved)',
    'curricular_units_1st_sem_(grade)',
    'tuition_fees_up_to_date',
    'scholarship_holder',
    'age_at_enrollment',
    'debtor',
    'gender',
    'application_mode',
    'curricular_units_2nd_sem_(enrolled)',
    'curricular_units_1st_sem_(enrolled)',
    'displaced',
    'target',
]

# Narrow the frame to exactly those columns.
df = df.loc[:, features]

In [4]:
# Three-way split: hold out 20% of the rows as the test set, then take 25% of
# the remaining 80% (i.e. 20% of all rows) for validation, leaving 60% for
# training. Both calls use random_state=13 so the split is repeatable.
# No `stratify` argument is passed, so class proportions are not forced to
# match across the three parts.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=13)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=13)

In [5]:
# Give each split a fresh 0..n-1 index; the shuffled original row labels are
# discarded (drop=True) rather than kept as a column.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [6]:
# Pull the encoded label out of every split as a plain array.
y_train, y_val, y_test = (
    part['target'].values for part in (df_train, df_val, df_test)
)

In [7]:
# Remove the label from the feature frames so that only model inputs remain.
df_train, df_val, df_test = (
    part.drop(columns='target') for part in (df_train, df_val, df_test)
)

In [8]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to validation and test, so no information from
# the held-out rows enters the preprocessing.
scaler = StandardScaler().fit(df_train)

# The *_scaled names hold the transformer's output, not the original frames.
df_train_scaled, df_val_scaled, df_test_scaled = (
    scaler.transform(part) for part in (df_train, df_val, df_test)
)

In [9]:
# Class imbalance: derive 'balanced' per-class weights from the training
# labels, then key them by class id for use as a `class_weight` dict.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
weight_dict = {
    class_id: weight
    for class_id, weight in zip(np.unique(y_train), class_weights)
}

In [10]:
# Random forest trained on the standardised training features, using the
# per-class weights computed above to counter class imbalance.
model_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=3,
    class_weight=weight_dict,
    random_state=13,
    n_jobs=-1,
).fit(df_train_scaled, y_train)

# Validation quality: multi-class ROC-AUC (one-vs-rest) computed from the
# predicted class probabilities on the validation split.
# `y_pred_proba` is rebound later in the notebook for the single-student demo.
y_pred_proba = model_rf.predict_proba(df_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_proba, multi_class='ovr')
print(f"\nROC-AUC score: {roc_auc}")


ROC-AUC score: 0.8661231434439255


In [11]:
# File name the trained model is pickled to (relative to the working directory).
# A plain literal: the previous f-string had no placeholders to interpolate.
output_file = "model_rf.bin"

In [12]:
# Display the path the model is about to be written to.
output_file

'model_rf.bin'

In [13]:
# Persist the trained random forest to `output_file`.
with open(output_file, 'wb') as f_out:
    pickle.dump(model_rf, f_out)

# The forest was fitted on StandardScaler-transformed features, so the fitted
# scaler is needed to prepare any new input for it. Save it next to the model;
# otherwise the pickled model cannot be used from a fresh kernel or another
# process. The model file keeps its original single-object layout, so existing
# loaders of `output_file` are unaffected.
scaler_file = "scaler.bin"
with open(scaler_file, 'wb') as f_out:
    pickle.dump(scaler, f_out)

### Load and use model 

In [14]:
# Path of the pickled model to load; same file name the training section wrote.
input_file = 'model_rf.bin'

In [15]:
# Restore the pickled classifier, rebinding `model_rf` (the name already used
# for the freshly trained model earlier in the notebook).
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
# NOTE(review): only the classifier is read here. The `scaler` used by the
# following cells is not loaded from disk and must still be in kernel memory.
with open(input_file, 'rb') as f_in: 
    model_rf = pickle.load(f_in)

In [16]:
# Display the restored estimator to confirm it loaded.
model_rf

In [17]:
# One hand-written student record used to smoke-test the prediction path.
# Columns appear in the same order as the training features; the label column
# is not part of the input.
student_data = {
    column: [value]
    for column, value in (
        ('curricular_units_2nd_sem_(approved)', 8),
        ('curricular_units_2nd_sem_(grade)', 14.07125),
        ('curricular_units_1st_sem_(approved)', 8),
        ('curricular_units_1st_sem_(grade)', 14.07125),
        ('tuition_fees_up_to_date', 1),
        ('scholarship_holder', 1),
        ('age_at_enrollment', 19),
        ('debtor', 0),
        ('gender', 0),
        ('application_mode', 1),
        ('curricular_units_2nd_sem_(enrolled)', 8),
        ('curricular_units_1st_sem_(enrolled)', 8),
        ('displaced', 1),
    )
}

# Single-row frame: every key becomes a column.
student = pd.DataFrame(data=student_data)

In [18]:
# Transposed view: one feature per row is easier to read for a single student.
student.T

Unnamed: 0,0
curricular_units_2nd_sem_(approved),8.0
curricular_units_2nd_sem_(grade),14.07125
curricular_units_1st_sem_(approved),8.0
curricular_units_1st_sem_(grade),14.07125
tuition_fees_up_to_date,1.0
scholarship_holder,1.0
age_at_enrollment,19.0
debtor,0.0
gender,0.0
application_mode,1.0


In [19]:
# Apply the training-time standardisation to the new record.
# NOTE(review): `scaler` is the in-memory object fitted in the training
# section. It is not restored by the load cell above, so on a fresh kernel
# that only loads the pickled model this line raises NameError.
X = scaler.transform(student)

In [20]:
# Inspect the standardised feature vector that is fed to the model.
X

array([[ 1.18032738,  0.73762571,  1.07331942,  0.70378688,  0.37421166,
         1.76125219, -0.57179102, -0.35766157, -0.75034346, -1.09714697,
         0.80134413,  0.70410786,  0.91192544]])

In [21]:
# Get predictions for the standardised record:
#   y_pred        - predicted class id per row
#   y_pred_proba  - per-class probabilities per row (this rebinds the name
#                   that earlier held the validation-set probabilities)
y_pred = model_rf.predict(X)
y_pred_proba = model_rf.predict_proba(X)

In [22]:
# Predicted class id(s); these are the integer codes, not the label names.
y_pred

array([2])

In [23]:
# Class probabilities for the record, one column per class id.
y_pred_proba

array([[0.02151529, 0.09247212, 0.88601259]])

In [24]:
# Class id -> label name; the inverse of the encoding applied to the 'target'
# column when the data was loaded (0 = Dropout, 1 = Enrolled, 2 = Graduate).
mapping = dict(enumerate(('Dropout', 'Enrolled', 'Graduate')))

In [25]:
# Translate each predicted class id into its label name. An id missing from
# `mapping` raises KeyError rather than being skipped.
y_pred_labels = list(map(mapping.__getitem__, y_pred))

In [26]:
# Print one line per student: a 1-based student number, the predicted label,
# and the full probability vector (ordered by class id).
# zip() pairs labels with probability rows positionally; both come from the
# same input X, so they have the same length.
for i, (label, proba) in enumerate(zip(y_pred_labels, y_pred_proba)):
    print(f"Student {i+1}: Prediction = {label}, Probabilities = {proba}")

Student 1: Prediction = Graduate, Probabilities = [0.02151529 0.09247212 0.88601259]
