In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

# Read the preprocessed feature matrices (kept as DataFrames).
X_train = pd.read_csv('C:/Users/chigu/Desktop/stroke_prediction_project/Data/X_train_preprocessed.csv')
X_test = pd.read_csv('C:/Users/chigu/Desktop/stroke_prediction_project/Data/X_test_preprocessed.csv')

# Read the target files and flatten each one into a 1-D array, since the
# estimators are fitted and scored against flat label vectors.
y_train_frame = pd.read_csv('C:/Users/chigu/Desktop/stroke_prediction_project/Data/y_train_preprocessed.csv')
y_test_frame = pd.read_csv('C:/Users/chigu/Desktop/stroke_prediction_project/Data/y_test_preprocessed.csv')
y_train = np.ravel(y_train_frame.values)
y_test = np.ravel(y_test_frame.values)

# Candidate classifiers, keyed by the label used in the training log and in
# the results table. Insertion order determines the training order.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    XGBoost=XGBClassifier(eval_metric='logloss', random_state=42),
    LogisticRegression=LogisticRegression(max_iter=500),
    SVM=SVC(kernel='linear', probability=True),
    KNN=KNeighborsClassifier(n_neighbors=5),
    NaiveBayes=GaussianNB(),
)

# Train and evaluate models
#
# Every model is fitted on the training split and scored on the test split.
# The model with the highest test-set F1 score is remembered and written to
# disk once all candidates have been evaluated.
# NOTE(review): the winner is selected on the same test set that is used for
# reporting, so its reported score is optimistic; consider a separate
# validation split or cross-validation for model selection.
results = {}
best_model = None
best_f1 = -1.0  # lower than any valid F1, so the first model is always accepted
for name, model in models.items():
    print(f'Training {name}...')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {'Accuracy': acc, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

    # Track the best model so far; strict '>' keeps the earliest model on ties.
    if f1 > best_f1:
        best_f1 = f1
        best_model = model

    print(f'{name} - Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n')

# Save the best model (based on F1 Score). Previously RandomForest was always
# saved regardless of its score. Nothing is written if no model was trained.
if best_model is not None:
    joblib.dump(best_model, 'C:/Users/chigu/Desktop/stroke_prediction_project/Models/stroke_model.pkl')

# Build the summary table: the nested dict yields one column per model, so
# transpose it to get one row per model and one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Persist the table, keeping the model names (the index) as the first column.
performance_csv = 'C:/Users/chigu/Desktop/stroke_prediction_project/Models/model_performance.csv'
results_df.to_csv(performance_csv, index=True)

print("\nModel training completed successfully!")

Training RandomForest...
RandomForest - Accuracy: 0.9022, Precision: 0.1212, Recall: 0.1600, F1 Score: 0.1379

Training XGBoost...
XGBoost - Accuracy: 0.8992, Precision: 0.1045, Recall: 0.1400, F1 Score: 0.1197

Training LogisticRegression...
LogisticRegression - Accuracy: 0.7857, Precision: 0.1494, Recall: 0.7200, F1 Score: 0.2474

Training SVM...
SVM - Accuracy: 0.7759, Precision: 0.1434, Recall: 0.7200, F1 Score: 0.2392

Training KNN...
KNN - Accuracy: 0.7945, Precision: 0.0876, Recall: 0.3400, F1 Score: 0.1393

Training NaiveBayes...
NaiveBayes - Accuracy: 0.7172, Precision: 0.1030, Recall: 0.6200, F1 Score: 0.1766

                    Accuracy  Precision  Recall  F1 Score
RandomForest        0.902153   0.121212    0.16  0.137931
XGBoost             0.899217   0.104478    0.14  0.119658
LogisticRegression  0.785714   0.149378    0.72  0.247423
SVM                 0.775930   0.143426    0.72  0.239203
KNN                 0.794521   0.087629    0.34  0.139344
NaiveBayes          0.71