In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek  # To handle class imbalance

# Load the diabetes dataset
diabetes_dataset = pd.read_csv('../datasets/diabetes.csv')

# Splitting features and target
X = diabetes_dataset.drop(columns=['Outcome'])  # Features
Y = diabetes_dataset['Outcome']  # Target variable (0 or 1)

# 1. Handle Imbalanced Data using SMOTETomek
smote_tomek = SMOTETomek(random_state=42)
X_resampled, Y_resampled = smote_tomek.fit_resample(X, Y)

# 2. Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# 3. Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_resampled, test_size=0.2, random_state=42)

# 4. Hyperparameter Tuning for Different Models

# 4.1 SVM with GridSearchCV
svm_model = SVC()
svm_param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=5, scoring='accuracy')
svm_grid_search.fit(X_train, Y_train)
svm_best_model = svm_grid_search.best_estimator_
svm_y_pred = svm_best_model.predict(X_test)

# 4.2 Random Forest with GridSearchCV
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None], 'class_weight': ['balanced', None]}
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, Y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_y_pred = rf_best_model.predict(X_test)

# 4.3 XGBoost with GridSearchCV
xgb_model = XGBClassifier(random_state=42)
xgb_param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]}
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='accuracy')
xgb_grid_search.fit(X_train, Y_train)
xgb_best_model = xgb_grid_search.best_estimator_
xgb_y_pred = xgb_best_model.predict(X_test)

# 5. Evaluate Models
print("SVM Best Model Accuracy:", accuracy_score(Y_test, svm_y_pred))
print("Random Forest Best Model Accuracy:", accuracy_score(Y_test, rf_y_pred))
print("XGBoost Best Model Accuracy:", accuracy_score(Y_test, xgb_y_pred))


SVM Best Model Accuracy: 0.8638743455497382
Random Forest Best Model Accuracy: 0.8691099476439791
XGBoost Best Model Accuracy: 0.8429319371727748


In [2]:
import pickle

In [3]:
filename='diabetes_model2.sav'
pickle.dump(rf_best_model,open(filename,'wb'))