In [None]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [None]:
# Read the dataset from a path relative to the notebook's working directory.
data = pd.read_csv("data/student-social-media-academic-performance.csv")
# Last expression of the cell: shows the first rows as a quick sanity check.
data.head()

Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted,Usage_to_Sleep_Ratio
0,1,23,Male,Undergraduate,Canada,7.44,WhatsApp,Yes,6.97,5.3,In Relationship,2,1,1.067432
1,2,20,Female,Postgraduate,France,5.89,WhatsApp,Yes,6.42,5.6,Single,1,0,0.917445
2,3,24,Male,Undergraduate,South Africa,6.86,Twitter,No,7.12,8.5,Single,3,0,0.963483
3,4,21,Male,Undergraduate,Spain,8.94,Twitter,Yes,8.56,4.4,In Relationship,0,1,1.044393
4,5,23,Female,High School,South Africa,6.63,WhatsApp,Yes,6.97,8.6,Single,3,1,0.95122


In [None]:
def preprocess_data(
    data,
    target_col="Affects_Academic_Performance",
    id_col="Student_ID",
    test_size=0.1,
    random_state=42,
    positive_label="Yes",
    negative_label="No",
):
    """Split, one-hot encode and standardize the dataset for model training.

    The identifier column is dropped, ``Usage_to_Sleep_Ratio`` is (re)computed
    from ``Avg_Daily_Usage_Hours`` and ``Sleep_Hours_Per_Night``, the target is
    mapped to 0/1 and the rows are split into a train and a test set. The
    encoder and the scaler are fitted on the training split only and then
    applied to the test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Must contain ``target_col``, ``Avg_Daily_Usage_Hours``
        and ``Sleep_Hours_Per_Night``. The caller's frame is not modified
        (the function works on the frame returned by ``drop``).
    target_col : str
        Name of the label column.
    id_col : str
        Identifier column removed before modelling; ignored if absent.
    test_size, random_state
        Forwarded to ``train_test_split``.
    positive_label, negative_label
        Values of ``target_col`` mapped to 1 and 0 respectively.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, encoder, scaler,
        categorical_cols, numerical_cols)``. The scaled matrices hold the
        numeric columns followed by the one-hot columns, in that order.

    Raises
    ------
    ValueError
        If ``target_col`` contains a value (including a missing value) that is
        neither ``positive_label`` nor ``negative_label``.
    """
    data = data.drop(id_col, axis=1, errors='ignore')
    data["Usage_to_Sleep_Ratio"] = (
        data["Avg_Daily_Usage_Hours"] / data["Sleep_Hours_Per_Night"])

    # Separate features and target
    X = data.drop(target_col, axis=1)
    target = data[target_col]
    y = target.map({positive_label: 1, negative_label: 0})

    # Labels missing from the mapping become NaN; fail here with a clear
    # message instead of letting NaN targets reach the estimators.
    unmapped = y.isna()
    if unmapped.any():
        bad_values = target[unmapped].unique().tolist()
        raise ValueError(
            f"Column {target_col!r} contains values other than "
            f"{positive_label!r}/{negative_label!r}: {bad_values}"
        )
    y = y.astype("int64")

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Identify categorical and numerical columns.
    # Every feature lands in exactly one group: numeric dtypes of any width
    # are scaled as-is, everything else is one-hot encoded. Matching on exact
    # dtype names would silently drop columns stored as e.g. int32, category
    # or a dedicated string dtype.
    numerical_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

    # Encode categorical features using OneHotEncoder (fitted on train only)
    encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    X_train_cat = encoder.fit_transform(X_train_raw[categorical_cols])
    X_test_cat = encoder.transform(X_test_raw[categorical_cols])

    # Re-attach the original row index so the concat below aligns row by row.
    encoded_names = encoder.get_feature_names_out(categorical_cols)
    train_cat_df = pd.DataFrame(
        X_train_cat,
        columns=encoded_names,
        index=X_train_raw.index
    )
    test_cat_df = pd.DataFrame(
        X_test_cat,
        columns=encoded_names,
        index=X_test_raw.index
    )

    X_train_num = X_train_raw[numerical_cols]
    X_test_num = X_test_raw[numerical_cols]

    X_train = pd.concat([X_train_num, train_cat_df], axis=1)
    X_test = pd.concat([X_test_num, test_cat_df], axis=1)

    # Feature scaling (statistics learned from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return (
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        encoder,
        scaler,
        categorical_cols,
        numerical_cols
    )

In [None]:
# Run the preprocessing once and keep the fitted encoder/scaler and the
# column lists alongside the train/test matrices.
X_train_scaled, X_test_scaled, y_train, y_test, encoder, scaler, categorical_cols, numerical_cols = (
    preprocess_data(data)
)


In [None]:
# Candidate classifiers. The key is reused as the saved file name and as the
# row label in the results table.
models = {
    # Linear baseline; the only model here that re-weights the classes.
    "logistic_regression": LogisticRegression(class_weight="balanced"),

    # Single tree and distance/probabilistic baselines.
    "decision_tree": DecisionTreeClassifier(random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5),
    "naive_bayes": GaussianNB(),

    # Tree ensembles, both with 150 estimators and a fixed seed.
    "random_forest": RandomForestClassifier(random_state=42, n_estimators=150),
    "xgboost": XGBClassifier(
        random_state=42,
        n_estimators=150, max_depth=4, learning_rate=0.1,
        subsample=0.9, colsample_bytree=0.9,
        eval_metric="logloss",
    ),
}


In [None]:
# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first model is written (otherwise this cell fails
# on a fresh checkout).
os.makedirs("saved_models", exist_ok=True)

results = []

for name, model in models.items():
    # Fit on the training split; the test split is used only for scoring.
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    # Column 1 holds the probability of class 1, which ROC AUC needs.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Save model
    joblib.dump(model, f"saved_models/{name}.pkl")

    # One row per model, in the same order as the column names below.
    results.append([
        name,
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ])

results_df = pd.DataFrame(results,
    columns=["Model","Accuracy","AUC","Precision","Recall","F1","MCC"]
)

results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,logistic,0.855,0.854491,0.822222,0.850575,0.836158,0.706507
1,decision_tree,0.77,0.772658,0.71134,0.793103,0.75,0.540932
2,knn,0.745,0.724087,0.790323,0.563218,0.657718,0.480408
3,naive_bayes,0.775,0.741379,1.0,0.482759,0.651163,0.587592
4,random_forest,0.865,0.855406,0.894737,0.781609,0.834356,0.726
5,xgboost,0.84,0.838572,0.808989,0.827586,0.818182,0.675497
