In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from preprocess import TERMS, get_term


# Pool the student records from every term into one flat list.
all_students = []
for term in TERMS:
    students = get_term(term)
    all_students.extend(students)

# One row per student: column 0 is `sex`, columns 1-7 are the numeric features
# (years enrolled, number of current courses, and the five points_* values).
#
# dtype=object keeps every value's own Python type. Without it NumPy coerces
# the whole matrix to a single dtype, so if `sex` is a string every numeric
# feature is silently turned into text and only works downstream because the
# scaler happens to parse it back into floats.
#
# NOTE(review): points_total is presumably derived from the other points_*
# columns and may largely determine `passed` -- confirm this is not label
# leakage before trusting the reported scores.
X = np.array(
    [
        (
            s["sex"],
            s["years_enrolled"],
            len(s["current_courses"]),
            s["points_quiz"],
            s["points_assign"],
            s["points_checkmark"],
            s["points_unknown"],
            s["points_total"],
        )
        for s in all_students
    ],
    dtype=object,
)
# Binary target: 1 if the student passed, 0 otherwise.
y = np.array([int(s["passed"]) for s in all_students])

# Hold out 20% of the students for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column positions inside each feature row: column 0 carries the categorical
# `sex` value, columns 1 through 7 carry the numeric features.
numeric_features = list(range(1, 8))
categorical_features = [0]

# Standardize the numeric columns and one-hot encode the categorical column,
# dropping the first category level and emitting a dense (non-sparse) array.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_features),
])

# Candidate models keyed by the display name used in the printed report.
# Estimators that accept a seed are pinned to 42 for reproducibility; SVC is
# built with probability=True so that it exposes predict_proba like the rest.
classifiers = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True, random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
])

def evaluate_model(y_true, y_pred, y_prob):
    """Score one classifier's predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels; used for accuracy, precision,
            recall and F1 (each called with scikit-learn's defaults).
        y_prob: Per-sample scores handed to ``roc_auc_score``.

    Returns:
        A dict mapping the metric's display name to its value, in the order
        Accuracy, Precision, Recall, F1 Score, ROC AUC.
    """
    # Metrics computed from hard label predictions, in report order.
    label_metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    scores = {label: score_fn(y_true, y_pred) for label, score_fn in label_metrics}

    # ROC AUC is the only metric that needs scores rather than labels.
    scores['ROC AUC'] = roc_auc_score(y_true, y_prob)
    return scores

# Fit every candidate model behind the shared preprocessing step and score it
# on the held-out test split. `results` maps model name -> metric dict.
results = {}
for name, classifier in classifiers.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    pipeline.fit(X_train, y_train)

    # Hard labels feed the label-based metrics; the second column of
    # predict_proba supplies the scores used for ROC AUC.
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    results[name] = evaluate_model(y_test, y_pred, y_prob)

# Print one paragraph per model: a blank line, the model name, then each
# metric rounded to four decimal places.
for name, metrics in results.items():
    lines = [f"\n{name}:"]
    for metric, value in metrics.items():
        lines.append(f"{metric}: {value:.4f}")
    print("\n".join(lines))



Logistic Regression:
Accuracy: 0.8933
Precision: 0.8940
Recall: 0.9532
F1 Score: 0.9227
ROC AUC: 0.9399

Decision Tree:
Accuracy: 0.9566
Precision: 0.9724
Recall: 0.9623
F1 Score: 0.9674
ROC AUC: 0.9557

Random Forest:
Accuracy: 0.9601
Precision: 0.9653
Recall: 0.9753
F1 Score: 0.9703
ROC AUC: 0.9903

SVM:
Accuracy: 0.9306
Precision: 0.9367
Recall: 0.9610
F1 Score: 0.9487
ROC AUC: 0.9787

Naive Bayes:
Accuracy: 0.8656
Precision: 0.8907
Recall: 0.9104
F1 Score: 0.9004
ROC AUC: 0.9021

K-Nearest Neighbors:
Accuracy: 0.9237
Precision: 0.9263
Recall: 0.9623
F1 Score: 0.9439
ROC AUC: 0.9625
