<a href="https://colab.research.google.com/github/samipn/crisp-dm_semma_and_kdd/blob/main/SEMMA_Student_Performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SEMMA: Student Performance â€” Predict final grade / pass-fail

In [1]:
#@title Setup
!pip -q install imbalanced-learn fastapi uvicorn joblib plotly
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, os, joblib
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
RANDOM_STATE=42
os.makedirs('data', exist_ok=True)


## Sample
Create stratified train/validate/test.

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [3]:
# Setup Kaggle API
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the dataset
!kaggle datasets download -d devansodariya/student-performance-data -p data --unzip

Dataset URL: https://www.kaggle.com/datasets/devansodariya/student-performance-data
License(s): CC0-1.0
Downloading student-performance-data.zip to data
  0% 0.00/7.16k [00:00<?, ?B/s]
100% 7.16k/7.16k [00:00<00:00, 47.6MB/s]


In [8]:
csv_path = 'data/student_data.csv' # Corrected path to the CSV file
if not os.path.exists(csv_path):
    print(f"Expected CSV file not found at {csv_path}. Please check the contents of the data folder.")
else:
    df = pd.read_csv(csv_path)

# If target is categorical like 'pass_fail' create it
if 'pass_fail' not in df.columns and 'G3' in df.columns:
    df['pass_fail'] = (df['G3'] >= 10).astype(int)
target = 'pass_fail' if 'pass_fail' in df.columns else df.columns[-1]
y = df[target]
X = df.drop(columns=[target])
X_tr, X_tmp, y_tr, y_tmp = train_test_split(X,y,test_size=0.4, stratify=y, random_state=RANDOM_STATE)
X_va, X_te, y_va, y_te = train_test_split(X_tmp,y_tmp,test_size=0.5, stratify=y_tmp, random_state=RANDOM_STATE)
print("Shapes:", X_tr.shape, X_va.shape, X_te.shape)

Shapes: (237, 33) (79, 33) (79, 33)


## Explore
Univariate + group analysis.

In [9]:
df.describe(include='all').T.head(15)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
school,395.0,2.0,GP,349.0,,,,,,,
sex,395.0,2.0,F,208.0,,,,,,,
age,395.0,,,,16.696203,1.276043,15.0,16.0,17.0,18.0,22.0
address,395.0,2.0,U,307.0,,,,,,,
famsize,395.0,2.0,GT3,281.0,,,,,,,
Pstatus,395.0,2.0,T,354.0,,,,,,,
Medu,395.0,,,,2.749367,1.094735,0.0,2.0,3.0,4.0,4.0
Fedu,395.0,,,,2.521519,1.088201,0.0,2.0,2.0,3.0,4.0
Mjob,395.0,5.0,other,141.0,,,,,,,
Fjob,395.0,5.0,other,217.0,,,,,,,


## Modify
Impute, scale numeric, one-hot encode categoricals.

In [10]:
numeric = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical = X.select_dtypes(include=['object','category','bool']).columns.tolist()
num_pipe = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', StandardScaler())])
cat_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('oh', OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num', num_pipe, numeric), ('cat', cat_pipe, categorical)])

## Model
Gradient Boosting as a strong baseline.

In [11]:
from sklearn.metrics import classification_report
import os

# Create the deployment directory if it doesn't exist
os.makedirs('deployment', exist_ok=True)

clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
pipe = Pipeline([('pre', pre), ('clf', clf)])
pipe.fit(X_tr, y_tr)
pred_va = pipe.predict(X_va)
print(classification_report(y_va, pred_va))
joblib.dump(pipe, 'deployment/model.joblib'); print("Saved to deployment/model.joblib")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        53

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79

Saved to deployment/model.joblib


## Assess
Evaluate on test.

In [12]:
from sklearn.metrics import confusion_matrix
pred_te = pipe.predict(X_te)
print(classification_report(y_te, pred_te))
print(confusion_matrix(y_te, pred_te))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        53

    accuracy                           1.00        79
   macro avg       1.00      1.00      1.00        79
weighted avg       1.00      1.00      1.00        79

[[26  0]
 [ 0 53]]
