In [10]:
# Install required libraries if not already installed
import os
import subprocess
import sys

# Check and install the required libraries
required_libraries = ['imbalanced-learn', 'xgboost', 'joblib', 'scikit-learn', 'pandas', 'numpy']
for library in required_libraries:
    subprocess.check_call([sys.executable, "-m", "pip", "install", library])

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.metrics import accuracy_score

# Load Dataset
url = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'
data = pd.read_csv(url)

# Data Preprocessing
# Replace zeros with median for specific columns
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    data.loc[data[column] == 0, column] = data[column].median()

# Separate features (X) and target (y)
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Balance the classes using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Scale the features using StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the models for comparison
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Model Training and Evaluation
best_model = None
best_score = 0

# Train each model and evaluate its accuracy
for model_name, model in models.items():
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test) * 100  # Accuracy on test data
    print(f"{model_name} Accuracy: {accuracy:.2f}%")

    # Select the best model based on accuracy
    if accuracy > best_score:
        best_score = accuracy
        best_model = model

# Save the best model and the scaler to files using joblib
model_filename = 'diabetes_model.pkl'
scaler_filename = 'scaler.pkl'

joblib.dump(best_model, model_filename)  # Save the best model
joblib.dump(scaler, scaler_filename)     # Save the scaler

# Print the final results
print(f"\nBest Model: {best_model} with Accuracy: {best_score:.2f}%")
print(f"Model and scaler have been saved as {model_filename} and {scaler_filename}")


  data.loc[data[column] == 0, column] = data[column].median()


Logistic Regression Accuracy: 74.00%
Random Forest Accuracy: 82.50%
Support Vector Machine Accuracy: 80.50%
K-Nearest Neighbors Accuracy: 79.50%

Best Model: RandomForestClassifier() with Accuracy: 82.50%
Model and scaler have been saved as diabetes_model.pkl and scaler.pkl
