In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
import numpy as np

# Load your dataset
df = pd.read_csv('C:/Users/abdulssekyanzi/EDA Dataset.csv/100.csv')

# Create a more complex target variable
df['target'] = (df['MLII'] > df['MLII'].median()).astype(int)  # Use median as threshold

# Features and target
X = df[['time_ms', 'MLII', 'V5']]
y = df['target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Teacher Model (Random Forest) with simpler hyperparameters
param_grid_teacher = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 5]
}

teacher_model = RandomForestClassifier(random_state=42)
grid_search_teacher = GridSearchCV(estimator=teacher_model, param_grid=param_grid_teacher, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_teacher.fit(X_train_scaled, y_train)
best_teacher_model = grid_search_teacher.best_estimator_

# Generate Soft Labels
soft_labels = best_teacher_model.predict_proba(X_train_scaled)

# Convert soft labels to hard labels
hard_labels_student = np.argmax(soft_labels, axis=1)

# Train a Student Model (Decision Tree) with simpler hyperparameters
param_grid_student = {
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 5]
}

student_model = DecisionTreeClassifier(random_state=42)
grid_search_student = GridSearchCV(estimator=student_model, param_grid=param_grid_student, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_student.fit(X_train_scaled, hard_labels_student)  # Use hard labels here
best_student_model = grid_search_student.best_estimator_

# Evaluate the student model using cross-validation
cv_scores = cross_val_score(best_student_model, X_train_scaled, hard_labels_student, cv=5, scoring='f1')

# Final evaluation on the test set
y_pred_student = best_student_model.predict(X_test_scaled)

accuracy_student = accuracy_score(y_test, y_pred_student)
f1_student = f1_score(y_test, y_pred_student)
roc_auc_student = roc_auc_score(y_test, best_student_model.predict_proba(X_test_scaled)[:, 1])
report_student = classification_report(y_test, y_pred_student)

print(f'Cross-Validation F1 Scores: {cv_scores}')
print(f'Mean Cross-Validation F1 Score: {np.mean(cv_scores)}')
print(f'Student Model Test Accuracy: {accuracy_student}')
print(f'Student Model Test F1 Score: {f1_student}')
print(f'Student Model Test ROC-AUC Score: {roc_auc_student}')
print('Classification Report:')
print(report_student)

Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation F1 Score: 1.0
Student Model Test Accuracy: 1.0
Student Model Test F1 Score: 1.0
Student Model Test ROC-AUC Score: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65023
           1       1.00      1.00      1.00     64977

    accuracy                           1.00    130000
   macro avg       1.00      1.00      1.00    130000
weighted avg       1.00      1.00      1.00    130000

