### PRE-PROCESSING AND FEATURE EXTRACTION 

In [106]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns



In [107]:
df = pd.read_csv('/Users/shaguntembhurne/all_github/student-dropout-prediction/dataset (1).csv')

In [108]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [109]:
df['Age at enrollment'].describe()

count    4424.000000
mean       23.265145
std         7.587816
min        17.000000
25%        19.000000
50%        20.000000
75%        25.000000
max        70.000000
Name: Age at enrollment, dtype: float64

In [110]:

# --- STEP 1: THE FULL TRANSFORMATION SCRIPT ---

# 1a. Load the raw data
df = pd.read_csv('dataset (1).csv')

# 1b. Create our 4 engineered features
# Success Rate
df['sem1_pass_rate'] = df['Curricular units 1st sem (approved)'] / df['Curricular units 1st sem (enrolled)']
df['sem1_pass_rate'].fillna(0, inplace=True)
# Completion Rate
df['sem1_eval_completion_rate'] = df['Curricular units 1st sem (evaluations)'] / df['Curricular units 1st sem (enrolled)']
df['sem1_eval_completion_rate'].fillna(0, inplace=True)
# Maturity
df['is_mature_student'] = (df['Age at enrollment'] > 25).astype(int)
# Financial Strain
df['financial_strain'] = ((df['Debtor'] == 1) & (df['Tuition fees up to date'] == 0)).astype(int)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sem1_pass_rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sem1_eval_completion_rate'].fillna(0, inplace=True)


### PROCESSING AND TRANSFORMATION

In [111]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 1b. Create our 4 engineered features
# Success Rate
df['sem1_pass_rate'] = df['Curricular units 1st sem (approved)'] / df['Curricular units 1st sem (enrolled)']
df['sem1_pass_rate'].fillna(0, inplace=True)
# Completion Rate
df['sem1_eval_completion_rate'] = df['Curricular units 1st sem (evaluations)'] / df['Curricular units 1st sem (enrolled)']
df['sem1_eval_completion_rate'].fillna(0, inplace=True)
# Maturity
df['is_mature_student'] = (df['Age at enrollment'] > 25).astype(int)
# Financial Strain
df['financial_strain'] = ((df['Debtor'] == 1) & (df['Tuition fees up to date'] == 0)).astype(int)

# 1c. Identify final categorical and numerical columns
categorical_cols = [
    'Marital status', 'Application mode', 'Course', 'Daytime/evening attendance',
    'Previous qualification', 'Nacionality', "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation", 'Displaced', 'Educational special needs',
    'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International'
]

numerical_cols = [
    'Application order', 'Age at enrollment', 'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP',
    # We must include our new numerical features here to be scaled!
    'sem1_pass_rate', 'sem1_eval_completion_rate', 'is_mature_student', 'financial_strain'
]

# 1d. One-hot encode the categorical columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 1e. Scale the numerical columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# --- STEP 2: ENCODE THE TARGET VARIABLE ---
target_map = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
df['Target'] = df['Target'].map(target_map)


# --- STEP 3: SEPARATE FEATURES (X) AND TARGET (y) ---
X = df.drop('Target', axis=1)
y = df['Target']


# --- STEP 4: CREATE TRAINING AND TESTING SETS ---
# We'll use 80% of the data for training and 20% for testing.
# stratify=y makes sure the proportion of dropouts/graduates is the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# --- Final Verification ---
print("Preprocessing Complete!")
print("Shape of our training features (X_train):", X_train.shape)
print("Shape of our testing features (X_test):", X_test.shape)
print("Shape of our training target (y_train):", y_train.shape)
print("Shape of our testing target (y_test):", y_test.shape)

Preprocessing Complete!
Shape of our training features (X_train): (3539, 240)
Shape of our testing features (X_test): (885, 240)
Shape of our training target (y_train): (3539,)
Shape of our testing target (y_test): (885,)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sem1_pass_rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sem1_eval_completion_rate'].fillna(0, inplace=True)


In [113]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.xgboost

# --- Step 1: Set up MLflow ---
# This creates a new "Experiment" in MLflow to keep our runs organized
mlflow.set_experiment("Student Dropout Prediction")

# This starts a new "Run". Everything we do inside this block will be logged.
with mlflow.start_run():

    # --- Step 2: Define and Train the XGBoost Model ---
    # We use XGBClassifier because this is a classification problem.
    # objective='multi:softmax' tells the model we have more than 2 classes.
    # num_class=3 tells it we have 3 outcomes (Dropout, Enrolled, Graduate).
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        random_state=42
    )

    print("Training the XGBoost model...")
    # The model learns from the training data
    model.fit(X_train, y_train)
    print("Training complete.")


    # --- Step 3: Evaluate the Model's Performance ---
    # The model makes predictions on the test data it has never seen before
    y_pred = model.predict(X_test)

    # Calculate the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    # 'weighted' average is good for multi-class problems as it accounts for imbalance
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Print the results so you can see them immediately
    print(f"\n--- Model Performance ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")


    # --- Step 4: Log Everything to MLflow ---
    print("\nLogging run to MLflow...")
    # Log the parameters (settings) we used for our model
    mlflow.log_param("model_type", "XGBClassifier")
    mlflow.log_param("objective", "multi:softmax")

    # Log the metrics (results) we calculated
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision_weighted", precision)
    mlflow.log_metric("recall_weighted", recall)
    mlflow.log_metric("f1_weighted", f1)

    # This is the most important part: save the actual trained model
    mlflow.xgboost.log_model(model, "xgboost-model")
    print("Run logged successfully!")

Training the XGBoost model...




Training complete.

--- Model Performance ---
Accuracy: 0.7650
Precision: 0.7523
Recall: 0.7650
F1-Score: 0.7562

Logging run to MLflow...


  self.get_booster().save_model(fname)


Run logged successfully!


In [114]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import mlflow
import mlflow.xgboost

# --- Step 1: Set up the MLflow Experiment ---
mlflow.set_experiment("Student Dropout Prediction")

# --- Step 2: Define the "Grid" of Hyperparameters to Test ---
# We'll try 2 different values for each of our 3 chosen parameters.
# This will result in 2 * 2 * 2 = 8 total experiments.
learning_rates = [0.1, 0.3]
max_depths = [3, 6]
n_estimators_list = [100, 200]

print("Starting Hyperparameter Tuning...")

# --- Step 3: Loop Through Every Combination ---
for lr in learning_rates:
    for md in max_depths:
        for ne in n_estimators_list:
            
            # This starts a new "Run" for each unique combination
            with mlflow.start_run():
                
                # --- A. Log the Parameters for This Run ---
                mlflow.log_param("learning_rate", lr)
                mlflow.log_param("max_depth", md)
                mlflow.log_param("n_estimators", ne)
                
                # Print to the notebook so we can see our progress
                print(f"\n--- Training with lr={lr}, max_depth={md}, n_estimators={ne} ---")

                # --- B. Define and Train the Model with the Current Parameters ---
                model = xgb.XGBClassifier(
                    objective='multi:softmax',
                    num_class=3,
                    learning_rate=lr,
                    max_depth=md,
                    n_estimators=ne,
                    random_state=42
                )
                model.fit(X_train, y_train)

                # --- C. Evaluate the Model ---
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                print(f"Accuracy: {accuracy:.4f}")

                # --- D. Log the Resulting Metric and Model ---
                mlflow.log_metric("accuracy", accuracy)
                mlflow.xgboost.log_model(model, "xgboost-model")

print("\nHyperparameter Tuning Complete!")

Starting Hyperparameter Tuning...

--- Training with lr=0.1, max_depth=3, n_estimators=100 ---




Accuracy: 0.7638


  self.get_booster().save_model(fname)



--- Training with lr=0.1, max_depth=3, n_estimators=200 ---




Accuracy: 0.7638


  self.get_booster().save_model(fname)



--- Training with lr=0.1, max_depth=6, n_estimators=100 ---




Accuracy: 0.7548


  self.get_booster().save_model(fname)



--- Training with lr=0.1, max_depth=6, n_estimators=200 ---




Accuracy: 0.7616


  self.get_booster().save_model(fname)



--- Training with lr=0.3, max_depth=3, n_estimators=100 ---




Accuracy: 0.7661


  self.get_booster().save_model(fname)



--- Training with lr=0.3, max_depth=3, n_estimators=200 ---




Accuracy: 0.7672


  self.get_booster().save_model(fname)



--- Training with lr=0.3, max_depth=6, n_estimators=100 ---




Accuracy: 0.7650


  self.get_booster().save_model(fname)



--- Training with lr=0.3, max_depth=6, n_estimators=200 ---




Accuracy: 0.7650


  self.get_booster().save_model(fname)



Hyperparameter Tuning Complete!


In [115]:
from joblib import dump

# Assuming 'scaler' is your fitted StandardScaler object from preprocessing
dump(scaler, 'scaler.joblib') 

# Assuming 'X_train' is your final, preprocessed training DataFrame
dump(X_train.columns, 'model_columns.joblib')

print("Scaler and model columns saved successfully!")

Scaler and model columns saved successfully!
