In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import joblib

In [12]:
# Load the diabetes dataset into a pandas DataFrame
def load_dataset(file_path):
    try:
        diabetes_dataset = pd.read_csv(file_path)
        return diabetes_dataset
    except Exception as e:
        print(f"Failed to load dataset: {e}")

file_path = r"D:\Work_Station\Project_CICD_Pipeline\src\data\diabetes.csv"
diabetes_dataset = load_dataset(file_path)


In [13]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [14]:
# Explore the dataset
def explore_dataset(diabetes_dataset):
    print(diabetes_dataset.head())
    print(diabetes_dataset.info())
    print(diabetes_dataset['Outcome'].value_counts())
    print(diabetes_dataset.groupby('Outcome').mean())

explore_dataset(diabetes_dataset)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [15]:
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [16]:
# Prepare the data for training
def prepare_data(diabetes_dataset):
    # Separate the data and labels
    X = diabetes_dataset.drop(columns='Outcome', axis=1)
    Y = diabetes_dataset['Outcome']

    # Standardize the data
    scaler = StandardScaler()
    scaler.fit(X)
    standardized_data = scaler.transform(X)
    X = standardized_data

    # Split the data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

    return X_train, X_test, Y_train, Y_test, scaler

X_train, X_test, Y_train, Y_test, scaler = prepare_data(diabetes_dataset)
print(f'X shape: {X_train.shape[0] + X_test.shape[0]}, X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')


X shape: 768, X_train shape: (614, 8), X_test shape: (154, 8)


## Training the model

In [17]:
# Train an SVM classifier
def train_classifier(X_train, Y_train):
    classifier = svm.SVC(kernel='linear')
    classifier.fit(X_train, Y_train)
    return classifier

classifier = train_classifier(X_train, Y_train)


### Evaluating the Model

In [18]:
# Evaluate the classifier
def evaluate_classifier(classifier, X_train, Y_train, X_test, Y_test):
    # Accuracy on training data
    X_train_prediction = classifier.predict(X_train)
    training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
    print(f'Accuracy on Training data: {training_data_accuracy}')

    # Accuracy on test data
    X_test_prediction = classifier.predict(X_test)
    test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
    print(f'Accuracy score of the test data: {test_data_accuracy}')

evaluate_classifier(classifier, X_train, Y_train, X_test, Y_test)


Accuracy on Training data: 0.7866449511400652
Accuracy score of the test data: 0.7727272727272727


### Save the models

In [26]:
# Save the classifier and scaler
def save_models(classifier, scaler, model_filename, scaler_filename):
    joblib.dump(classifier, filename=model_filename)
    joblib.dump(scaler, filename=scaler_filename)

model_filename = 'src/models/model.pkl'
scaler_filename = 'src/models/scaler.pkl'
save_models(classifier, scaler, model_filename, scaler_filename)


In [27]:
# Make predictions with the classifier
def make_prediction(classifier, scaler, input_data):
    # Convert input data to a numpy array and reshape
    input_data_as_numpy_array = np.asarray(input_data)
    input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

    # Standardize the input data
    std_data = scaler.transform(input_data_reshaped)
    print(f'Standardized input data: {std_data}')

    # Make prediction
    prediction = classifier.predict(std_data)
    print(f'Prediction: {prediction}')

    if prediction[0] == 0:
        print('The person is not diabetic')
    else:
        print('The person is diabetic')




In [28]:
# Example predictions
input_data1 = (1, 85, 66, 29, 0, 26.6, 0.351, 31)
make_prediction(classifier, scaler, input_data1)


Standardized input data: [[-0.84488505 -1.12339636 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.36506078 -0.19067191]]
Prediction: [0]
The person is not diabetic




In [29]:
input_data2 = (6, 148, 72, 35, 0, 33.6, 0.627, 50)
make_prediction(classifier, scaler, input_data2)

Standardized input data: [[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]]
Prediction: [1]
The person is diabetic


