In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Load the diabetes dataset into a pandas DataFrame
def load_dataset(file_path):
    try:
        diabetes_dataset = pd.read_csv(file_path)
        return diabetes_dataset
    except Exception as e:
        print(f"Failed to load dataset: {e}")

file_path = r"D:\Work_Station\Project_CICD_Pipeline\src\data\diabetes.csv"
diabetes_dataset = load_dataset(file_path)


In [3]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Explore the dataset
def explore_dataset(diabetes_dataset):
    print(diabetes_dataset.head())
    print(diabetes_dataset.info())
    print(diabetes_dataset['Outcome'].value_counts())
    print(diabetes_dataset.groupby('Outcome').mean())

explore_dataset(diabetes_dataset)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [5]:
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
def prepare_data_with_val(diabetes_dataset):
    # Separate features and target
    X = diabetes_dataset.drop(columns='Outcome', axis=1)
    Y = diabetes_dataset['Outcome']

    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split into train (70%) and temp (20%)
    X_train, X_temp, Y_train, Y_temp = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=2
    )

    # Split temp into validation (10%) and test (10%)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=0.5, stratify=Y_temp, random_state=2
    )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test, scaler



## Step 2: Define a Function to Evaluate Model Performance

In [7]:
def evaluate_classifier(classifier, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    train_pred = classifier.predict(X_train)
    val_pred = classifier.predict(X_val)
    test_pred = classifier.predict(X_test)

    train_acc = accuracy_score(Y_train, train_pred)
    val_acc = accuracy_score(Y_val, val_pred)
    test_acc = accuracy_score(Y_test, test_pred)

    return train_acc, val_acc, test_acc

## Step 3: Train Default Model and Evaluate Before Hyperparameter Tuning

In [8]:
def train_default_model(model, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    model.fit(X_train, Y_train)
    train_acc, val_acc, test_acc = evaluate_classifier(model, X_train, Y_train, X_val, Y_val, X_test, Y_test)
    print(f"Before tuning - Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}")
    return train_acc, val_acc, test_acc

## Step 4: Hyperparameter Tuning Using Validation Set with GridSearchCV and Custom CV Split

In [9]:
from sklearn.model_selection import GridSearchCV

def tune_hyperparameters_with_val(model, param_grid, X_train, Y_train, X_val, Y_val):
    # Combine train and validation sets
    X_combined = np.vstack((X_train, X_val))
    Y_combined = pd.concat([Y_train.reset_index(drop=True), Y_val.reset_index(drop=True)], ignore_index=True)

    # Indices for train and validation samples in combined data
    train_indices = list(range(len(X_train)))
    val_indices = list(range(len(X_train), len(X_train) + len(X_val)))

    # Custom CV split: train on train_indices, validate on val_indices
    custom_cv = [(train_indices, val_indices)]

    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=custom_cv,
        scoring='accuracy',
        n_jobs=-1,
        refit=True
    )

    grid.fit(X_combined, Y_combined)
    print(f"Best hyperparameters: {grid.best_params_}")
    return grid.best_estimator_


## Step 5: Evaluate the Tuned Model

In [10]:
def evaluate_tuned_model(best_model, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    train_acc, val_acc, test_acc = evaluate_classifier(best_model, X_train, Y_train, X_val, Y_val, X_test, Y_test)
    print(f"After tuning - Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}")
    return train_acc, val_acc, test_acc


## Step 6: Complete Workflow for Multiple Models

In [11]:
def train_and_tune_models(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    models = {
        'SVM': svm.SVC(),
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'RandomForest': RandomForestClassifier(random_state=2)
    }

    param_grids = {
        'SVM': {
            'kernel': ['linear', 'rbf'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto']
        },
        'LogisticRegression': {
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        },
        'RandomForest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5]
        }
    }

    results = {}

    for model_name, model in models.items():
        print(f"\n--- {model_name} ---")

        # Step 3: Train default model
        train_acc, val_acc, test_acc = train_default_model(model, X_train, Y_train, X_val, Y_val, X_test, Y_test)

        # Step 4: Tune hyperparameters using validation set
        best_model = tune_hyperparameters_with_val(model, param_grids[model_name], X_train, Y_train, X_val, Y_val)

        # Step 5: Evaluate tuned model
        train_acc_tuned, val_acc_tuned, test_acc_tuned = evaluate_tuned_model(best_model, X_train, Y_train, X_val, Y_val, X_test, Y_test)

        results[model_name] = {
            'before': (train_acc, val_acc, test_acc),
            'after': (train_acc_tuned, val_acc_tuned, test_acc_tuned),
            'best_params': best_model.get_params(),
            'best_estimator': best_model
        }

    return results


## Step 7: Evaluating the Model

In [12]:
X_train, X_val, X_test, Y_train, Y_val, Y_test, scaler = prepare_data_with_val(diabetes_dataset)
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")

results = train_and_tune_models(X_train, Y_train, X_val, Y_val, X_test, Y_test)

Train shape: (614, 8), Validation shape: (77, 8), Test shape: (77, 8)

--- SVM ---
Before tuning - Train Acc: 0.8290, Val Acc: 0.6883, Test Acc: 0.7662
Best hyperparameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
After tuning - Train Acc: 0.7818, Val Acc: 0.7662, Test Acc: 0.7792

--- LogisticRegression ---
Before tuning - Train Acc: 0.7850, Val Acc: 0.7273, Test Acc: 0.7922
Best hyperparameters: {'C': 0.01, 'solver': 'liblinear'}
After tuning - Train Acc: 0.7736, Val Acc: 0.7403, Test Acc: 0.7792

--- RandomForest ---
Before tuning - Train Acc: 1.0000, Val Acc: 0.7273, Test Acc: 0.7273
Best hyperparameters: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}
After tuning - Train Acc: 0.8550, Val Acc: 0.8442, Test Acc: 0.7662


In [13]:
print("\nSummary of model performances:")
for model_name, res in results.items():
    print(f"{model_name}:")
    print(f"  Before tuning - Train Acc: {res['before'][0]:.4f}, Val Acc: {res['before'][1]:.4f}, Test Acc: {res['before'][2]:.4f}")
    print(f"  After tuning  - Train Acc: {res['after'][0]:.4f}, Val Acc: {res['after'][1]:.4f}, Test Acc: {res['after'][2]:.4f}")
    print(f"  Best params: {res['best_params']}")



Summary of model performances:
SVM:
  Before tuning - Train Acc: 0.8290, Val Acc: 0.6883, Test Acc: 0.7662
  After tuning  - Train Acc: 0.7818, Val Acc: 0.7662, Test Acc: 0.7792
  Best params: {'C': 0.1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
LogisticRegression:
  Before tuning - Train Acc: 0.7850, Val Acc: 0.7273, Test Acc: 0.7922
  After tuning  - Train Acc: 0.7736, Val Acc: 0.7403, Test Acc: 0.7792
  Best params: {'C': 0.01, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
RandomForest:
  Before tuning - Train Acc: 1.0000, Val Ac

In [16]:
best_model = results['SVM']['best_estimator']

## Step8: Save the models

In [17]:
import joblib

def save_models(classifier, scaler, model_filename, scaler_filename):
    joblib.dump(classifier, filename=model_filename)
    joblib.dump(scaler, filename=scaler_filename)
    print(f"Model saved to {model_filename}")
    print(f"Scaler saved to {scaler_filename}")

In [18]:
model_filename = 'src/models/svm_model.pkl'
scaler_filename = 'src/models/scaler.pkl'
save_models(best_model, scaler, model_filename, scaler_filename)


Model saved to src/models/svm_model.pkl
Scaler saved to src/models/scaler.pkl


In [19]:
def make_prediction(classifier, scaler, input_data):
    # Convert input data to numpy array and reshape
    input_data_as_numpy_array = np.asarray(input_data)
    input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

    # Standardize the input data using the scaler
    std_data = scaler.transform(input_data_reshaped)
    print(f'Standardized input data: {std_data}')

    # Make prediction
    prediction = classifier.predict(std_data)
    print(f'Prediction: {prediction}')

    if prediction[0] == 0:
        print('The person is not diabetic')
    else:
        print('The person is diabetic')

In [20]:
# Load saved model and scaler
model_filename = 'src/models/svm_model.pkl'
scaler_filename = 'src/models/scaler.pkl'

loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)


## Step 9: Make Predictions on New Input Data

In [21]:
# Example input data (replace with your values)
input_data1 = (1, 85, 66, 29, 0, 26.6, 0.351, 31)
make_prediction(loaded_model, loaded_scaler, input_data1)

Standardized input data: [[-0.84488505 -1.12339636 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.36506078 -0.19067191]]
Prediction: [0]
The person is not diabetic




In [22]:
input_data2 = (6, 148, 72, 35, 0, 33.6, 0.627, 50)
make_prediction(loaded_model, loaded_scaler, input_data2)

Standardized input data: [[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]]
Prediction: [1]
The person is diabetic


