In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle

# Read the diabetes records from the shared datasets folder
diabetes_dataset = pd.read_csv("../datasets/diabetes.csv")

# Quick look at the data: first rows, (rows, columns) shape, summary statistics
for overview in (
    diabetes_dataset.head(),
    diabetes_dataset.shape,
    diabetes_dataset.describe(),
):
    print(overview)

# Per-class column means. The numeric columns are selected explicitly, which is
# the workaround for the groupby-mean error this step was written to fix.
# Note: the selection is not filtered, so a numeric 'Outcome' column is part of
# it and shows up in the printed table as well.
numeric_cols = diabetes_dataset.select_dtypes(include=['number']).columns
class_means = diabetes_dataset.groupby('Outcome')[numeric_cols].mean()
print(class_means)

# Separate the label column from the predictor columns
y_diabetes = diabetes_dataset['Outcome']
X_diabetes = diabetes_dataset.drop(columns='Outcome')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_diabetes,
    y_diabetes,
    test_size=0.2,
    random_state=42,
)

# Fit a linear-kernel support vector classifier on the training rows
# (fit() returns the estimator itself, so the assignment keeps the fitted model)
diabetes_model = SVC(kernel='linear').fit(X_train, Y_train)

# Predict the held-out rows and report the fraction classified correctly
y_pred = diabetes_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred)
print("Diabetes Prediction Accuracy:", test_accuracy)

# Saving model
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of lingering until garbage collection.
filename = 'diabetes_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(diabetes_model, model_file)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
(768, 9)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.

In [3]:
# List the predictor column names, one per line, under a heading
print("Feature Columns:", *X_diabetes.columns, sep="\n")


Feature Columns:
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pickle

# Read the diabetes records from the shared datasets folder
diabetes_dataset = pd.read_csv("../datasets/diabetes.csv")

# Quick look at the data: first rows, (rows, columns) shape, summary statistics
print(
    diabetes_dataset.head(),
    diabetes_dataset.shape,
    diabetes_dataset.describe(),
    sep="\n",
)

# Splitting features and target
X_diabetes = diabetes_dataset.drop(columns=['Outcome'])
y_diabetes = diabetes_dataset['Outcome']

# Train-test split: 20% of the rows are held out; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X_diabetes, y_diabetes, test_size=0.2, random_state=42)

# Standardize the features for Logistic Regression and other models that might benefit.
# The scaler is fit on the training rows only and then applied unchanged to the
# test rows, so no test-set statistics enter the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler. Any model trained on X_train_scaled needs this exact
# transform applied to new inputs, so it has to be reloadable with the model.
with open('diabetes_scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Candidate models, keyed by the display name used in the printed report and
# (lower-cased, with underscores) in the saved file names.
models = {
    'SVM': SVC(kernel='linear'),
    'Logistic Regression': LogisticRegression(max_iter=200),  # Increase max_iter to help with convergence
    # Seeded so the forest, its reported accuracy, and the pickled model are
    # the same on every run (same seed as the train/test split).
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Models that are fit and evaluated on the standardized feature matrices; every
# other model uses the raw feature values. Their saved files therefore expect
# standardized inputs at prediction time.
# NOTE(review): SVC is scale-sensitive too; it is kept on raw features here so
# the saved SVM model keeps accepting the same inputs as before.
scaled_models = {'Logistic Regression', 'K-Nearest Neighbors'}

# Loop over models to train, evaluate, and save them
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Pick one train/test pair up front so fit() and predict() always see the
    # same feature representation.
    if model_name in scaled_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, Y_train)
    y_pred = model.predict(test_features)

    # Accuracy score on the held-out rows, reported as a percentage
    accuracy = accuracy_score(Y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")

    # Confusion matrix of true labels against predicted labels
    conf_matrix = confusion_matrix(Y_test, y_pred)
    print(f"Confusion Matrix for {model_name}:\n{conf_matrix}\n")

    # Save the model with the prefix 'diabetes_'. The `with` block closes the
    # file handle after each dump instead of leaving one open per model.
    filename = f'diabetes_{model_name.lower().replace(" ", "_")}_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
(768, 9)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.