# My First Project

#### On this Jupyter Notebook, you will find me attempting to get into the world of ML coding using Python and Sci-kit + NumPy and Pandas. And since I am also interested in the medical industry I decided to try and recreate the famous diabetes predictor. 

In [2]:
# Import all needed libraries and algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib
import numpy as np

#### Here we are using the Pandas library to open the dataset that I obtained from the famous site of datasets called Kaggle. Pandas stands for 'Panel data' and allows us to view data the same way Excel does i.e. as a data frame.

In [5]:
df = pd.read_csv('diabetes.csv')
df['Outcome'].value_counts() # Checking how many of each outcome there is in the dataset.
                             #  0 means "Non-diabetic" and 1 means "Diabetic" in this binary classification problem

Outcome
0    500
1    268
Name: count, dtype: int64

### Data Preprocessing

In [8]:
# for the data preprocessing part I learnt that 0 values have to be replaced with not a number initially then 
# should be replaced by the mean value of that column
# data cleaning is the most important part of a machine learning lifecycle because 
# Garbage in = Garbage out

# define the feature columns

Feature_columns = ["Pregnancies", "Glucose" , "BloodPressure" , "SkinThickness" , "Insulin" , "BMI", "DiabetesPedigreeFunction", "Age"]

df[Feature_columns] = df[Feature_columns].replace(0,pd.NA)
for column in Feature_columns:
    df[column].fillna(df[column].mean(), inplace = True)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.000000,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1
1,1.000000,85.0,66.0,29.00000,155.548223,26.6,0.351,31,0
2,8.000000,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1.000000,89.0,66.0,23.00000,94.000000,28.1,0.167,21,0
4,4.494673,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10.000000,101.0,76.0,48.00000,180.000000,32.9,0.171,63,0
764,2.000000,122.0,70.0,27.00000,155.548223,36.8,0.340,27,0
765,5.000000,121.0,72.0,23.00000,112.000000,26.2,0.245,30,0
766,1.000000,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1


In [116]:
# This allows us to check the correlation between each variable and the final outcome (Diabetic or not)
# As we can see anything > 0 shows a positive correlation where the stronger the variable is the more likely for the person to be diabetic
# Anything <0 indicates a negative correlation so when that variable increases the other one decreases
# Glucose has the highest effect on the outcome with a 0.47 correlation value
df.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [11]:
# split data set into a target and attributes
X = df.drop('Outcome', axis = 1)
y = df['Outcome']

### Feature Scaling

In [13]:
# Standardization of data to remove any bias
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_scaled)
joblib.dump(scaler,"scaler.sav") # saving the scaler just in case

# splitting the data into training and test data and then making the model using the logistic regression algorithm
X_train, X_test, y_train , y_test = train_test_split(X_scaled,y, test_size=0.2, random_state=42)
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

Unfortunately I ran into a problem which I spent ages trying to solve, although my model was working fine predicting Diabetic outcomes correctly with an accuracy of 77.5 Percent but whenever I input data manually it would always give a "Diabetic" result. It also shows that the ML model is 88% percent confident with the Non-diabetic results which is even more confusing!

In [17]:
# This was done to check how the model behaves when faced with the first 20 rows of the preprocessed data
loaded_model = joblib.load("diabetes_model.sav")

y_pred = loaded_model.predict(X_scaled)
comparison = pd.DataFrame({'True Label': y, 'Predicted Label': y_pred})
comparison['Correct'] = comparison['True Label'] == comparison['Predicted Label']
print(comparison.head(20))  # View the first 20 comparisons

# The means of each column
means = {
    "Glucose": df['Glucose'].mean(),
    "BloodPressure": df['BloodPressure'].mean(),
    "SkinThickness": df['SkinThickness'].mean(),
    "Insulin": df['Insulin'].mean(),
    "BMI": df['BMI'].mean()
}

# Defining manual data input as test cases
test_cases = [
    np.array([1, 95, 70, 20, 85, 22.5, 0.3, 25]),  # Likely Non Diabetic
    np.array([2, 130, 80, 25, 100, 28.5, 0.5, 40]), # Borderline Diabetic
    np.array([4, 180, 90, 30, 150, 32.0, 0.8, 55]), # Clearly Diabetic
    np.array([0, 85, 65, 18, 70, 20.0, 0.1, 22]),  # Likely Non Diabetic, Young and Fit
    np.array([5, 110, 85, 27, 130, 30.0, 0.7, 50])  # High Risk, But Lower Glucose
]

# Preprocess and predict each test case
for i, input_data in enumerate(test_cases):
    # Replace zeros with mean values where applicable
    input_data[1] = means['Glucose'] if input_data[1] == 0 else input_data[1]
    input_data[2] = means['BloodPressure'] if input_data[2] == 0 else input_data[2]
    input_data[3] = means['SkinThickness'] if input_data[3] == 0 else input_data[3]
    input_data[4] = means['Insulin'] if input_data[4] == 0 else input_data[4]
    input_data[5] = means['BMI'] if input_data[5] == 0 else input_data[5]

    # Reshape and scale the input data
    input_data_reshaped = input_data.reshape(1, -1)
    input_data_scaled = scaler.transform(input_data_reshaped)

    # Make the prediction
    prediction = loaded_model.predict(input_data_scaled)
    probabilities = loaded_model.predict_proba(input_data_scaled)
    
    # Print the results
    print(f"Test Case {i+1} Prediction: {'Diabetic' if prediction[0] == 1 else 'Non Diabetic'}")
    print(f"Predicted Probabilities: {probabilities}\n")
    
print(f"Predicted Probabilities: {probabilities}")
threshold = 0.9 # Adjust this value to see its effect
prediction = (probabilities[:, 1] > threshold).astype(int)
print(f"Prediction with Adjusted Threshold: {'Diabetic' if prediction[0] == 1 else 'Non Diabetic'}")

    True Label  Predicted Label  Correct
0            1                1     True
1            0                0     True
2            1                1     True
3            0                0     True
4            1                1     True
5            0                0     True
6            1                0    False
7            0                0     True
8            1                1     True
9            1                0    False
10           0                0     True
11           1                1     True
12           0                1    False
13           1                1     True
14           1                1     True
15           1                0    False
16           1                1     True
17           1                0    False
18           0                0     True
19           1                0    False
Test Case 1 Prediction: Diabetic
Predicted Probabilities: [[0. 1.]]

Test Case 2 Prediction: Diabetic
Predicted Probabilities: [[0. 1.]]

T

### Model Testing and Accuracy

In [125]:
# cross validation improving accuracy of model as it allows for data to be split into k number of folds like buckets
scores = cross_val_score(LogisticRegression(random_state=42),X_scaled, y, cv=10)
avg = np.mean(scores) * 100
print(f" Mean score: {avg.round(2)}%")

 Mean score: 77.48%


##### I understood that the cross_val_score function is basically doing what is going on in the code below: 

In [18]:
"""
       model = LogisticRegression()
       skf = StratifiedKFold(n_splits=10)
       scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
model.fit(X_train, y_train)
scores.append(model.score(X_test,y_test))

"""

'\n       model = LogisticRegression()\n       skf = StratifiedKFold(n_splits=10)\n       scores = []\n\nfor train_index, test_index in skf.split(X, y):\n    X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n    y_train, y_test = y.iloc[train_index], y.iloc[test_index]\n    \nmodel.fit(X_train, y_train)\nscores.append(model.score(X_test,y_test))\n\n'

In [20]:
# saving the model
filename = "diabetes_model.sav"
joblib.dump(model,filename)

['diabetes_model.sav']