In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.metrics import RocCurveDisplay
import kds

from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
import sys
import ads

# Student dataset

The student dataset is focused on the **education domain**, specifically modeling and predicting academic success. The classification task is to determine whether a first-year college student will pass or fail based on their performance in secondary school and personal learning attributes. The dataset contains **1131 entries** and uses a binary class label, **Pass**, with values of either "Pass" or "Fail."  

The attributes in the dataset are all numeric and represent a mix of academic performance metrics and non-cognitive learning traits. Academic metrics include scores in Mathematics, English, and **CAO Points**, a measure of performance in Ireland's state exams. Non-cognitive traits capture different learning preferences, such as whether a student learns better through listening (**Auditory**), doing (**Kinaesthetic**), or visual aids (**Visual**). Additionally, motivational aspects are reflected through **Extrinsic Motivation** (driven by external rewards) and **Intrinsic Motivation** (interest in learning). Attributes like **Self-Efficacy** and **Conscientiousness** provide insights into a student's personality and confidence in their abilities, while **Study Time** reflects weekly study habits.  

This dataset allows for the exploration of how various academic and personal traits influence **a student's likelihood of passing**, highlighting the complex interplay between cognitive skills, personality traits, and study behaviors in academic success.


Unlike the adult dataset, the student dataset contains no categorical data. Additionally, all columns have **1131 non-null values**, indicating there are no missing entries. This is advantageous for analysis as it removes the need for data imputation or addressing missing values.

In [None]:
# Read the semicolon-delimited student CSV into a DataFrame
student = pd.read_csv(
    'data/student.csv',
    sep=';',
)

# Sanity-check the load by displaying the leading rows (cell output)
student.head(n=5)


In [None]:
# Print a structural summary of the frame (columns, dtypes, non-null counts)
# to check for missing entries before any preprocessing.
student.info()

## Handle Missing Values

In [None]:
# Impute on a copy so the frame produced by the loading cell is not modified in place.
student = student.copy()

# Categorical (object-dtype) columns: fill gaps with the most frequent value.
for col in student.select_dtypes(include=['object']).columns:
    modes = student[col].mode()
    # mode() returns an empty Series when the whole column is NaN; indexing it
    # would raise, and there is no sensible fill value, so leave such a column as is.
    if not modes.empty:
        student[col] = student[col].fillna(modes.iloc[0])

# Numeric columns: fill gaps with the column mean. Selecting on 'number' covers
# every numeric dtype (int32, float32, ...), not only int64/float64.
for col in student.select_dtypes(include='number').columns:
    student[col] = student[col].fillna(student[col].mean())

# Check for missing values after handling
print("\nMissing values after handling:")
print(student.isnull().sum())

# Display the first 5 rows of the updated dataset
print("\nPreview of updated dataset:")
print(student.head())


The dataset is ready for preprocessing steps such as encoding categorical variables (if needed), scaling numerical attributes, and splitting into training and testing sets for further modeling.

## Encode categorical variables

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Start from a copy of the in-memory `student` frame instead of re-reading the CSV:
# reloading here would silently discard the missing-value handling applied in the
# previous cell. Copying keeps `student` itself untouched by the encoding.
student_encoded = student.copy()

# Label-encode the binary target. LabelEncoder orders classes alphabetically,
# so 'Fail' -> 0 and 'Pass' -> 1.
le = LabelEncoder()
student_encoded['Pass'] = le.fit_transform(student_encoded['Pass'])

# Any object-dtype columns still present are categorical features.
categorical_cols = student_encoded.select_dtypes(include=['object']).columns

# One-hot encode only when such columns actually exist; drop_first avoids a
# redundant (perfectly collinear) indicator per feature.
if len(categorical_cols) > 0:
    student_encoded = pd.get_dummies(student_encoded, columns=categorical_cols, drop_first=True)

# Preview the dataset after encoding
print("Encoded dataset preview:")
print(student_encoded.head())


The dataset is now ready for modeling, as all categorical variables have been encoded, and missing values (if any) have been handled. The Pass column is now encoded as a numerical target variable (1 for "Pass", 0 for "Fail"), making the dataset ready for training and testing machine learning models.

## Scale numerical features

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardiser shared by the feature columns
scaler = StandardScaler()

# Everything except the target is a feature; compute that view once
feature_frame = student_encoded.drop(columns=['Pass'])

# Fit + transform, then wrap the resulting array back into a labelled DataFrame
student_encoded_scaled = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
)

# Re-attach the (unscaled) target column
student_encoded_scaled['Pass'] = student_encoded['Pass']


## Train test split

In [None]:



# The 'Pass' column is the target; every other column is a feature
y = student['Pass']
X = student.drop(columns='Pass')

# Hold out 30% of the rows, keeping the class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=43)

# Report partition sizes (features and labels of each part must agree)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(len(features), len(labels))

# Build and fit the decision tree in one step (fit returns the estimator itself)
dt_clf = DecisionTreeClassifier(random_state=43).fit(X_train, y_train)

# Predict the held-out rows and compare against their true labels
predictions = dt_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))


The output indicates that the model has successfully split the data into training and testing sets, with 791 samples for training and 340 for testing. The accuracy of 100% suggests that the Decision Tree classifier perfectly predicted the test labels. While this is an excellent result, it may also indicate overfitting, where the model has memorized the training data rather than generalizing from it. It's important to further assess the model's performance using techniques like cross-validation to ensure it performs well on unseen data and isn't overly complex.

In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate true vs. predicted labels for the held-out test set
confusion_m = confusion_matrix(y_test, predictions)

# Emit the heading and the matrix on separate lines
print("Confusion Matrix:", confusion_m, sep="\n")


This confusion matrix indicates a relatively high number of True Positives (172), but also a considerable number of False Positives (43) and False Negatives (42), suggesting that while the model is effective, there are some misclassifications that could be addressed through further tuning or different modeling approaches.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Recompute the test-set confusion matrix for plotting
confusion_m = confusion_matrix(y_test, predictions)

# Draw it as a blue heat map; plot() hands back the display object itself
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_m,
    display_labels=["Fail", "Pass"],
).plot(cmap=plt.cm.Blues)

# Title the figure and render it
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Also output the remaining metrics for the current test-set predictions:
# per-class precision / recall / F1 plus the overall (accuracy, macro and
# weighted average) scores.
print(classification_report(y_test, predictions))



The model performs better in predicting the "Pass" class with a precision, recall, and F1-score of 0.80, compared to the "Fail" class with scores of 0.66. Overall accuracy is 75%, indicating the model correctly classified 75% of the test samples. The macro average of 0.73 and weighted average of 0.75 reflect a balanced performance across both classes, though there's room for improvement, especially for the "Fail" class.

The model is better at predicting the "Pass" class but struggles more with predicting the "Fail" class. Improving recall for the "Fail" class and increasing overall precision would help enhance performance.


- The model performs well for the "Pass" class but has room for improvement in predicting the "Fail" class.
- The overall accuracy and weighted averages suggest that the model is performing reasonably well, but further tuning or additional techniques may be needed to improve predictions for the "Fail" class.


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer, roc_auc_score, classification_report
from scipy.stats import loguniform
import numpy as np

# The target column holds the strings 'Fail' / 'Pass', so every binary metric
# must be told explicitly which class counts as positive.
POSITIVE_LABEL = 'Pass'

# Split the data (70/30, stratified on the target)
X = student.drop('Pass', axis=1)
y = student['Pass']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Baseline model: RBF-kernel SVM. probability=True enables predict_proba (needed
# for AUC); the seed keeps the internal probability calibration reproducible.
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Predictions
predictions = svm_model.predict(X_test)

# Pick the probability column of the positive class by name rather than by position.
positive_col = list(svm_model.classes_).index(POSITIVE_LABEL)

# Evaluate the model
print("F1-Score:", f1_score(y_test, predictions, pos_label=POSITIVE_LABEL))
print("AUC:", roc_auc_score(y_test, svm_model.predict_proba(X_test)[:, positive_col]))
print(classification_report(y_test, predictions))

# Adjust hyperparameters, RandomizedSearchCV (FAST VERSION)
# Reduced n_iter to 10 and simplified distributions for speed
param_distributions = {
    'C': loguniform(1e-1, 1e1),      # Search C between 0.1 and 10
    'kernel': ['linear', 'rbf'],     # Only test these two
    'gamma': ['scale', 'auto'],      # Only test these two
}

# The built-in 'f1' scorer assumes pos_label=1, which does not exist among string
# labels: every candidate would score NaN (and the warning is suppressed at the top
# of the notebook). Build the scorer with the real positive class instead.
f1_pass_scorer = make_scorer(f1_score, pos_label=POSITIVE_LABEL)

# n_jobs=-1 uses all cores
# n_iter=10 is much faster (tests only 10 random combinations)
random_search = RandomizedSearchCV(
    SVC(probability=True, random_state=42),
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring=f1_pass_scorer,
    n_jobs=-1,
    random_state=42,
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

SVM and k-NN are suited for the student dataset due to its numerical features like "Auditory," "Study Time," and "Motivation," which influence the binary target, "Pass." SVM handles high-dimensional data and complex relationships, while k-NN leverages proximity in feature space, making them ideal for capturing patterns in student performance.

## SVM

In [None]:
from sklearn.svm import SVC

# Seeded support vector classifier using the RBF kernel
svm_clf = SVC(random_state=42, kernel='rbf')

# Fit on the training partition; as the cell's final expression the fitted
# estimator is also shown as the cell output
svm_clf.fit(X_train, y_train)


## Evaluate Performance

In [None]:
from sklearn.svm import SVC

# Seeded SVC with default hyperparameters, constructed and fitted in one step
# (fit returns the estimator itself)
svc_model = SVC(random_state=42).fit(X_train, y_train)

# Label the held-out rows
predictions = svc_model.predict(X_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Print the per-class report first, then the raw confusion matrix, both
# computed from the test-set predictions
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, predictions))


The SVM model is heavily biased toward the "Pass" class and fails to correctly identify any "Fail" instances. This suggests that the model struggles with class imbalance or the decision boundary is not well-suited to separate the two classes effectively. Further tuning (e.g., balancing the dataset, adjusting hyperparameters, or testing a different kernel) is needed to improve performance.

## Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Relies on X_train / y_train defined by an earlier cell.

# Candidate settings: three regularisation strengths crossed with two kernels
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# Exhaustive 5-fold search over the grid, fitted on the training partition
# (fit returns the search object itself)
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5).fit(X_train, y_train)

# Winning combination
print("Best Parameters:", grid_search.best_params_)

# Full per-candidate results dictionary, under its own heading
print("All Results:", grid_search.cv_results_, sep="\n")





- For C=0.1 and kernel='linear', the mean test score is 0.68648993.
- For C=1 and kernel='linear', the mean test score is 0.68901361.
- For C=10 and kernel='linear', the mean test score is 0.68771595.
- For C=0.1 and kernel='rbf', the mean test score is 0.62831781.
- For C=1 and kernel='rbf', the mean test score is 0.63717857.
- For C=10 and kernel='rbf', the mean test score is 0.62831781.

The best-performing model according to GridSearchCV is the SVM with a linear kernel and a regularization parameter of C=1. This combination produced the highest mean test score during cross-validation.

## kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN classifier voting among the five nearest training points
knn_clf = KNeighborsClassifier(n_neighbors=5)

# Fit on the training partition; as the cell's final expression the fitted
# estimator is also shown as the cell output
knn_clf.fit(X_train, y_train)


In [None]:
# Configure a 3-neighbour k-NN using Euclidean distance. The estimator is only
# constructed here; it is not fitted in this cell.
knn_model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')


## Cross Validation

In [None]:
# Out-of-fold predictions for the decision tree from 5-fold cross-validation
predictions = cross_val_predict(dt_clf, X, y, cv=5)

# Per-class and overall metrics over the whole dataset
print(classification_report(y, predictions))

# Confusion matrix of the same predictions, drawn as a blue heat map
# (plot() hands back the display object itself)
confusion_m = confusion_matrix(y, predictions)
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_m,
    display_labels=["Fail", "Pass"],
).plot(cmap=plt.cm.Blues)

# Title the figure and render it
plt.title('Confusion Matrix')
plt.show()


In [None]:

# Get out-of-fold predictions using 10-fold cross-validation
predictions = cross_val_predict(dt_clf, X, y, cv=10)

print(classification_report(y, predictions))

# Build the confusion matrix only AFTER the predictions have been refreshed, so
# the plot describes this 10-fold run. Computing it beforehand would plot the
# stale predictions left over from whichever cell ran last.
confusion_m = confusion_matrix(y, predictions)

# Generate and plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_m, display_labels=["Fail", "Pass"])
disp.plot(cmap=plt.cm.Blues)

# Display the plot
plt.title('Confusion Matrix')
plt.show()



In [None]:
# Import necessary libraries
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import pandas as pd

# ==========================================
# 1. Preprocessing
# ==========================================

# Encode the target numerically (LabelEncoder sorts classes, so Fail=0, Pass=1).
# The encoded labels go into a separate Series instead of overwriting
# student['Pass']: mutating the shared frame would break earlier cells that rely
# on the string labels (e.g. pos_label='Pass') whenever they are re-run.
le = LabelEncoder()
X = student.drop('Pass', axis=1)  # Features (all columns except 'Pass')
y = pd.Series(le.fit_transform(student['Pass']), index=student.index, name='Pass')  # Target

# Train/test split. stratify=y keeps the class proportions equal in both parts,
# consistent with the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardise the features: fit on the training part only, then apply the same
# transformation to the test part. SVM and k-NN are both distance-based and
# therefore sensitive to feature scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==========================================
# 2. Cross-validation on the training set
# ==========================================
# NOTE: the scaler above was fitted on the whole training set, so each validation
# fold has already influenced the scaling statistics. The effect is normally
# small; wrapping scaler + model in a Pipeline would remove it entirely.

# Models under comparison
knn_model = KNeighborsClassifier(n_neighbors=5)  # Default k-NN model
svm_model = SVC(kernel='linear', C=1, probability=True, random_state=42)  # Settings chosen by the grid search

# Fold-wise accuracy
knn_scores = cross_val_score(knn_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
svm_scores = cross_val_score(svm_model, X_train_scaled, y_train, cv=5, scoring='accuracy')

print("KNN Cross-Validation Accuracy Scores:", knn_scores)
print("Mean Accuracy for KNN:", knn_scores.mean())
print("SVM Cross-Validation Accuracy Scores:", svm_scores)
print("Mean Accuracy for SVM:", svm_scores.mean())

# Out-of-fold predictions for a more detailed, per-class comparison
knn_predictions = cross_val_predict(knn_model, X_train_scaled, y_train, cv=5)
svm_predictions = cross_val_predict(svm_model, X_train_scaled, y_train, cv=5)

# Print classification reports
print("\nKNN Classification Report:")
print(classification_report(y_train, knn_predictions))

print("SVM Classification Report:")
print(classification_report(y_train, svm_predictions))

# Confusion matrices
print("KNN Confusion Matrix:")
print(confusion_matrix(y_train, knn_predictions))

print("SVM Confusion Matrix:")
print(confusion_matrix(y_train, svm_predictions))

# ==========================================
# 3. ROC curves on the held-out test set
# ==========================================

# Fit both models on the full training set ...
svm_final = SVC(kernel='linear', C=1, probability=True, random_state=42)
svm_final.fit(X_train_scaled, y_train)

knn_final = KNeighborsClassifier(n_neighbors=5)
knn_final.fit(X_train_scaled, y_train)

# ... and draw the ROC curves from the TEST data. Curves computed on the rows the
# models were trained on are optimistically biased and say little about
# generalisation, which is what this comparison is meant to show.
svm_disp = RocCurveDisplay.from_estimator(
    svm_final,
    X_test_scaled,
    y_test
)

# k-NN curve on the same axes for a direct comparison
knn_disp = RocCurveDisplay.from_estimator(
    knn_final,
    X_test_scaled,
    y_test,
    ax=svm_disp.ax_
)

knn_disp.figure_.suptitle("ROC Curve Comparison: Optimized SVM vs k-NN")
# Chance-level diagonal, drawn explicitly on the shared axes
svm_disp.ax_.plot([0, 1], [0, 1], 'k--')
plt.show()

## Comparison


KNN performs better overall compared to SVM. Although both models have similar accuracies, KNN is better at handling both classes, especially "Fail." It has more balanced precision and recall.
SVM, on the other hand, struggles with predicting "Fail" and is biased toward predicting "Pass," leading to a high recall for "Pass" but zero precision and recall for "Fail." This can be problematic when both classes are important.
