In [59]:
import numpy as np
import pandas as pd
import pickle  # For saving and loading the model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
import os

In [60]:
# Read the diabetes CSV; the path is relative to the current working directory
# and points at a "datasets" folder one level up.
dataset_path = "../datasets/diabetes.csv"
diabetes_dataset = pd.read_csv(dataset_path)

In [61]:
# Preview the leading rows of the table
print("First 5 rows of dataset:")
head_preview = diabetes_dataset.head()
display(head_preview)

# Column names, non-null counts and dtypes (info() prints its own report)
print("\nDataset Information:")
diabetes_dataset.info()

# Per-column descriptive statistics
print("\nStatistical Summary:")
summary_stats = diabetes_dataset.describe()
display(summary_stats)

First 5 rows of dataset:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

Statistical Summary:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [62]:
# In these columns a value of 0 is treated as a placeholder for a missing
# measurement and is replaced by the column mean.
zero_fields = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for column in zero_fields:
    # Work in float so the fractional mean can be stored without a dtype clash.
    readings = diabetes_dataset[column].astype(float)
    is_placeholder = readings == 0

    # Nothing to average if every entry is a placeholder (or the column is empty).
    if is_placeholder.all():
        continue

    # Average only the real readings: including the 0 placeholders would drag
    # the replacement value towards zero.
    observed_mean = readings[~is_placeholder].mean()
    diabetes_dataset[column] = readings.mask(is_placeholder, observed_mean)

# NOTE(review): the means are computed on the full dataset before the
# train/test split, so test rows influence the imputed values — consider
# moving imputation after the split.
print("Missing/zero values handled.")

Missing/zero values handled.


In [63]:
# 'Outcome' is the label column; every remaining column is used as a feature.
y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns=['Outcome'])

print("Features and target variable separated.")

Features and target variable separated.


In [64]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

Training samples: 614, Testing samples: 154


In [65]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
# Note: X_train / X_test are overwritten with the transformed values, so
# re-running this cell on its own would re-fit on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print("Feature scaling applied.")

Feature scaling applied.


In [66]:
# Fit a linear-kernel support vector classifier on the training split.
classifier = svm.SVC(kernel='linear').fit(X_train, y_train)

print("SVM model trained successfully.")

SVM model trained successfully.


In [67]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first,
# predictions second. Keeping that order avoids mistakes if this cell is
# later adapted to metrics where the order matters.

# Training accuracy
X_train_prediction = classifier.predict(X_train)
training_accuracy = accuracy_score(y_train, X_train_prediction)

# Test accuracy
X_test_prediction = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_prediction)

print(f"Training Accuracy: {training_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

Training Accuracy: 0.79
Testing Accuracy: 0.71


In [68]:
def predict_diabetes(input_data, model=None, feature_scaler=None, feature_names=None):
    """Classify one record as "Diabetic" or "Not Diabetic".

    Parameters
    ----------
    input_data : sequence
        Feature values for a single record, in the same order as
        ``feature_names``.
    model : object with ``predict``, optional
        Classifier to use. Defaults to the notebook-level ``classifier``.
    feature_scaler : object with ``transform``, optional
        Scaler applied before prediction. Defaults to the notebook-level
        ``scaler``.
    feature_names : sequence of str, optional
        Column labels for the one-row frame. Defaults to ``X.columns``.

    Returns
    -------
    str
        "Diabetic" when the predicted label equals 1, otherwise
        "Not Diabetic".
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing one-argument calls behave exactly as before while reloaded
    # artifacts can be supplied explicitly.
    if model is None:
        model = classifier
    if feature_scaler is None:
        feature_scaler = scaler
    if feature_names is None:
        feature_names = X.columns

    input_frame = pd.DataFrame([input_data], columns=feature_names)  # one-row frame with labelled columns
    input_scaled = feature_scaler.transform(input_frame)  # same scaling as used for training
    prediction = model.predict(input_scaled)
    return "Diabetic" if prediction[0] == 1 else "Not Diabetic"

In [69]:
# Demonstration: run the first feature row of X through predict_diabetes.
sample_input = X.iloc[0].tolist()
sample_label = predict_diabetes(sample_input)
print("Sample Prediction:", sample_label)

Sample Prediction: Diabetic


In [70]:
# Output folder, resolved relative to the current working directory
# (the original note says the notebook lives inside 'training_models/').
save_path = "../saved_models/"
os.makedirs(save_path, exist_ok=True)

# Pickle the trained classifier and the fitted scaler side by side so they
# can be restored together.
artifacts = {
    "diabetes_model.sav": classifier,
    "diabetes_scaler.sav": scaler,
}
for file_name, artifact in artifacts.items():
    with open(os.path.join(save_path, file_name), "wb") as f:
        pickle.dump(artifact, f)

print("\n Model and scaler saved successfully in 'saved_models/' folder.")


 Model and scaler saved successfully in 'saved_models/' folder.


In [71]:
# Load saved model and scaler
def _load_pickled_artifact(file_name, label):
    """Unpickle ``file_name`` from ``save_path`` and report the outcome.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside ``save_path``.
    label : str
        Human-readable name used in the printed status message.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when the file is missing,
        truncated, or not a valid pickle.
    """
    artifact_path = os.path.join(save_path, file_name)
    try:
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this notebook (or another trusted source) produced.
        with open(artifact_path, "rb") as f:
            artifact = pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # UnpicklingError covers files that exist but do not contain a valid
        # pickle, which is what the "corrupted" wording promises to handle.
        print(f" Error: {label} file is missing or corrupted.")
        return None
    print(f" {label} loaded successfully!")
    return artifact


loaded_model = _load_pickled_artifact("diabetes_model.sav", "Model")
loaded_scaler = _load_pickled_artifact("diabetes_scaler.sav", "Scaler")


 Model loaded successfully!
 Scaler loaded successfully!


In [72]:
# Test if the loaded model works correctly.
# The prediction is made with loaded_model / loaded_scaler directly, so this
# cell exercises the objects read back from disk rather than the in-memory
# classifier and scaler.
if loaded_model is None or loaded_scaler is None:
    raise RuntimeError("Saved model or scaler could not be loaded; re-run the save and load cells.")

sample_input = X.iloc[5].tolist()  # Use another row for testing
reloaded_frame = pd.DataFrame([sample_input], columns=X.columns)
reloaded_prediction = loaded_model.predict(loaded_scaler.transform(reloaded_frame))
reloaded_label = "Diabetic" if reloaded_prediction[0] == 1 else "Not Diabetic"
print("Prediction after reloading model:", reloaded_label)

Prediction after reloading model: Not Diabetic
