<h1>Diabetes Prediction Model - Logistic Regression (Classification) Method</h1>

In [None]:
## Installing dependencies
%pip install pandas numpy scikit-learn seaborn matplotlib scipy scikit-learn

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("diabetes_prediction_dataset.csv")

In [None]:
## Exploring our dataset
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
## Checking if dataset contains any null values.
## `df.isnull()` on its own only renders a (truncated) True/False frame, which does not
## answer the question; summing it per column gives the number of missing values in
## every column.
df.isnull().sum()

In [None]:
## Encoding the text `gender` column as integer codes.
## LabelEncoder assigns one integer per distinct category, so the result is only
## binary (0/1) if the column holds exactly two categories.
## NOTE(review): later sections of this notebook expect a `gender_Other` dummy column,
## so a third category probably exists and would be encoded as 2 here - confirm.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# After this call `label_encoder` holds the category -> integer mapping learned from df.
df["gender"] = label_encoder.fit_transform(df['gender'])

In [None]:
df['gender'].head()  ## Female -> 0, Male -> 1 (LabelEncoder numbers the sorted class labels)

In [None]:
df.info()

In [None]:
## Checking the importance of the smoking_history column.
## Count plot of every smoking_history category, split by the diabetes label.
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=df, x='smoking_history', hue='diabetes')  ## Visual impression only: the column does not look strongly related to diabetes; the chi-square test in the following cells checks this statistically.


In [None]:
## Performing a Chi-Square test to determine whether there is a significant relationship between two categorical variables.
## Creating a contingency table to summarize the count of occurrences for each (smoking_history, diabetes) combination.
contingeny_table = pd.crosstab(df['smoking_history'], df['diabetes'])
contingeny_table

In [None]:
## Chi-Square test of independence on the contingency table built above
from scipy.stats import chi2_contingency

# Unpacks the test statistic, the p-value, the degrees of freedom and the table of expected frequencies.
chi2, p, dof, expected = chi2_contingency(contingeny_table)

print(f"Chi-Square Statistic: {chi2}")
print(f"p-value: {p}") ## If p-value < 0.05 the association is statistically significant, so keep the column; otherwise it can be dropped
print(f"Degrees of Freedom: {dof}")

In [None]:
## Decision: keep the smoking_history column.
## Listing its distinct categories to see whether they have a natural order.

df['smoking_history'].unique()

In [None]:
## The categories might have a natural order, but since that is uncertain, one-hot encoding (OHE) is used instead of ordinal codes.
## drop_first=True omits the dummy column of the first category, leaving it as the implicit baseline.
df = pd.get_dummies(df, columns=["smoking_history"], drop_first=True)

In [None]:
df.head()

In [None]:
## Cross checking the coloumns data types
df.info()

In [None]:
from sklearn.model_selection import train_test_split

## Dividing the dataset into independent (X) and dependent (y) parts.
## X: every column except the `diabetes` target.
X = df.drop('diabetes', axis=1)

In [None]:
## Getting Dependent features
y = df["diabetes"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## Importing the classification model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)  ## the max_iter parameter allows the solver more iterations to converge

<p>Standard scaling transforms the features in the dataset so that each feature has a mean of 0 and a standard deviation of 1.</p>

In [None]:
## Doing standard scaling - feature scaling technique that standardizes the features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
X_train = scaler.fit_transform(X_train)

In [None]:
X_test = scaler.transform(X_test)

In [None]:
## Train the model
model.fit(X_train, y_train)

<h2>Now, doing Predictions</h2>

In [None]:
y_pred = model.predict(X_test)

In [None]:
## Evaluating the model's performance on the held-out test split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy of the predicted labels against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# Text report and confusion matrix are computed first, then everything is printed in one go.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Classification report:\n{test_report}",
    f"Confusion matrix:\n{test_confusion}",
    sep="\n",
)

<h5>Predicting on Unseen Data</h5>

In [None]:
# Six hand-written patient records (one tuple per patient) that are not read from the
# CSV file; the values of each tuple follow the order given in `columns`.
unseen_data = pd.DataFrame(
    [
        ("Male", 50, 0, 1, "former", 28.5, 6.3, 140),
        ("Female", 42, 1, 0, "never", 31.2, 5.9, 120),
        ("Male", 63, 1, 1, "current", 34.0, 7.2, 160),
        ("Female", 56, 0, 0, "former", 29.4, 5.7, 110),
        ("Male", 38, 0, 0, "never", 25.8, 5.6, 100),
        ("Female", 36, 1, 1, "never", 29.71, 10.3, 130),
    ],
    columns=[
        "gender",
        "age",
        "hypertension",
        "heart_disease",
        "smoking_history",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
)

In [None]:
unseen_data.info()

In [None]:
## Encoding `gender` with the encoder that was already fitted on the training data.
## Fitting a fresh LabelEncoder on the unseen rows would number only the categories
## present in this small sample, so the integer codes could differ from the ones the
## model was trained on (a sample containing only "Male" would map it to 0, not 1).
## `transform` reuses the training mapping and raises an error for a category that was
## never seen during fitting instead of silently mis-encoding it.
unseen_data["gender"] = label_encoder.transform(unseen_data["gender"])

In [None]:
unseen_data["gender"].head()

In [None]:
## OHE for smoking_history column.
## drop_first must NOT be used on the unseen rows: it drops the first category present
## in *this* frame ("current"), not the baseline that was dropped from the training
## data. Current smokers would lose their `smoking_history_current` flag and, once the
## columns are re-aligned to the training layout, be treated as the training baseline.
## All dummy columns are kept here; the reindex against the training columns further
## below discards any column the model was not trained on.
unseen_data = pd.get_dummies(unseen_data, columns=["smoking_history"])

In [None]:
## Converting the scaled numpy array back to a DataFrame so that feature names are
## available to the following cells (a numpy array has no `.columns` attribute).
## The names are read from `X`, the frame the train/test split was made from, instead
## of a hand-typed list, so they always match the column order the scaler and the
## model were fitted on, even if the dataset's columns change.
X_train_df = pd.DataFrame(X_train, columns=X.columns)

In [None]:
## Aligning unseen_data to the training feature layout: columns are put in the training order,
## training columns missing from unseen_data are added and filled with 0, and columns that are
## not part of the training layout are dropped.
unseen_data = unseen_data.reindex(columns=X_train_df.columns, fill_value=0)

In [None]:
## Scaling the data
unseen_data_scaled = scaler.transform(unseen_data)

In [None]:
## Making predictions
predictions = model.predict(unseen_data_scaled)

In [None]:
## Predictions of New Data
predictions

In [None]:
## One readable line per unseen patient, numbered from 1
for patient_no, predicted_label in enumerate(predictions, start=1):
    verdict = 'Diabetic' if predicted_label == 1 else 'Non-Diabetic'
    print(f"Patient {patient_no} is {verdict}")

<h5>Evaluating Model Performance on Unseen_Data</h5>

In [None]:
## Storing the model's predicted label next to each unseen patient.
## Note: these are the model's own predictions, not ground-truth labels - no true outcome
## is defined for the unseen_data rows anywhere in this notebook.
unseen_data["diabetes"] = predictions

In [None]:
## Inspecting the model's output on unseen_data.
## unseen_data carries no ground-truth labels: its "diabetes" column was filled with
## `predictions` itself, so scoring `predictions` against that column compares the
## values with themselves and always reports 100% accuracy. Accuracy, classification
## report and confusion matrix therefore cannot be computed for these patients.
## What can be shown instead is how confident the model is in each prediction.
## predict_proba returns one column per class in the order of `model.classes_`;
## for 0/1 labels column 1 is the probability of class 1 (diabetic).
unseen_probabilities = model.predict_proba(unseen_data_scaled)[:, 1]

pd.DataFrame(
    {
        "predicted_label": predictions,
        "diabetic_probability": unseen_probabilities.round(3),
    },
    index=[f"Patient {number}" for number in range(1, len(predictions) + 1)],
)

In [None]:
## Analysing which columns contribute the most to our model

coefficients = model.coef_[0] ## Unlike RandomForest, coefficients are used to calculate the importance

## The sign of a coefficient gives the direction of the effect, its magnitude gives the
## strength. Sorting by the signed value would rank a strongly negative feature last,
## as if it were the least important, so the rows are ordered by absolute value while
## the signed coefficient is kept for display. The magnitudes are comparable across
## features because the model was fitted on standardized inputs.
feature_importance_df = pd.DataFrame(
    {"Feature": X_train_df.columns, "Importance": coefficients}
).sort_values("Importance", ascending=False, key=lambda column: column.abs())


feature_importance_df

In [None]:
## Horizontal bar chart of the feature table computed in the previous cell
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df["Feature"], feature_importance_df["Importance"])
ax.set_xlabel('Importance')
ax.set_title('Features Importance')
ax.invert_yaxis()  # the first row of the table is drawn at the top of the chart
plt.show()


<h1>Diabetes Prediction Model - Random Forest Method</h1>

In [None]:
%pip install pandas numpy seaborn matplotlib scipy scikit-learn imbalanced-learn

In [None]:
import pandas as pd
df = pd.read_csv("diabetes_prediction_dataset.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
## Counting missing values per column (a bare `df.isnull()` only shows a truncated
## True/False frame and does not reveal whether any value is missing).
df.isnull().sum()

In [None]:
## OHE on the categorical (text) columns, gender & smoking_history.
## No `columns=` argument is given, so get_dummies picks the non-numeric columns itself;
## drop_first=True leaves out the first category of each as the implicit baseline.
df = pd.get_dummies(df, drop_first=True)

In [None]:
df.info()

In [None]:
from sklearn.model_selection import train_test_split

## Divinding dataset into Independent & Dependent
X = df.drop("diabetes", axis=1)

In [None]:
## Getting Dependent features
y = df["diabetes"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
## Doing standard scaling - feature scaling technique that standardizes the features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [None]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
## Importing Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

## Initializing Random Forest with class_weight='balanced' to compensate for imbalanced classes;
## random_state is fixed so that repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
## Making predictions on test data
y_pred = rf.predict(X_test)

In [None]:
## Scoring the random forest on the held-out test split
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
# The text report is computed first, then both results are printed in one go.
test_report = classification_report(y_test, y_pred)

print(
    f"Accuracy {accuracy * 100:.2f}%",
    f"Classification report:\n{test_report}",
    sep="\n",
)

<h5>Predicting on Unseen Data</h5>

In [None]:
# Six hand-written patient records (one tuple per patient) that are not read from the
# CSV file; the values of each tuple follow the order given in `columns`.
unseen_data = pd.DataFrame(
    [
        ("Male", 50, 0, 1, "former", 28.5, 6.3, 140),
        ("Female", 42, 1, 0, "never", 31.2, 5.9, 120),
        ("Male", 63, 1, 1, "current", 34.0, 7.2, 160),
        ("Female", 56, 0, 0, "former", 29.4, 5.7, 110),
        ("Male", 38, 0, 0, "never", 25.8, 5.6, 100),
        ("Female", 36, 1, 1, "never", 29.71, 10.3, 130),
    ],
    columns=[
        "gender",
        "age",
        "hypertension",
        "heart_disease",
        "smoking_history",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
)

In [None]:
unseen_data.info()

In [None]:
## OHE of the text columns of the unseen rows, WITHOUT drop_first.
## drop_first would drop the first category present in *this* small frame
## ("current" for smoking_history), not the baseline that was dropped from the training
## data, so current smokers would be fed to the model as the training baseline.
## All dummy columns are kept here; the reindex against the training column list in the
## next cell removes the surplus ones and adds the missing ones as 0.
unseen_data = pd.get_dummies(unseen_data)

In [None]:
## Feature names in exactly the order the scaler and the forest were fitted on.
## They are read from `X` (the frame the train/test split was made from) rather than
## typed by hand, so the list cannot drift out of sync with the training data.
columns = list(X.columns)

## Aligning unseen_data to that layout: training columns missing from the unseen rows
## are added and filled with 0, columns the model was not trained on are dropped.
unseen_data = unseen_data.reindex(columns=columns, fill_value=0)

In [None]:
unseen_data_scaled = scaler.transform(unseen_data[columns])

In [None]:
# Convert unseen_data to NumPy array to overcome column name warning
#unseen_data_array = unseen_data.values

In [None]:
## Making predictions
predictions = rf.predict(unseen_data_scaled)

In [None]:
## Predictions of New Data
predictions

In [None]:
## One readable line per unseen patient, numbered from 1
for patient_no, predicted_label in enumerate(predictions, start=1):
    verdict = 'Diabetic' if predicted_label == 1 else 'Non-Diabetic'
    print(f"Patient {patient_no} is {verdict}")

<h5>Evaluating Model Performance on Unseen_Data</h5>

In [None]:
## Storing the model's predicted label next to each unseen patient.
## Note: these are the model's own predictions, not ground-truth labels - no true outcome
## is defined for the unseen_data rows anywhere in this notebook.
unseen_data['diabetes'] = predictions

In [None]:
## Inspecting the model's output on unseen_data.
## unseen_data carries no ground-truth labels: its 'diabetes' column was filled with
## `predictions` itself, so scoring `predictions` against that column compares the
## values with themselves and always reports 100% accuracy. Accuracy, classification
## report and confusion matrix therefore cannot be computed for these patients.
## What can be shown instead is how confident the forest is in each prediction.
## predict_proba returns one column per class in the order of `rf.classes_`;
## for 0/1 labels column 1 is the probability of class 1 (diabetic).
unseen_probabilities = rf.predict_proba(unseen_data_scaled)[:, 1]

pd.DataFrame(
    {
        "predicted_label": predictions,
        "diabetic_probability": unseen_probabilities.round(3),
    },
    index=[f"Patient {number}" for number in range(1, len(predictions) + 1)],
)

<h5>Feature Importance Analysis</h5>

In [None]:
## Analysing which columns contribute the most to our model.
## Each entry of rf.feature_importances_ is paired with the name at the same position in
## `columns`, then the table is sorted from the largest importance to the smallest.
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": columns, "Importance": importances}
).sort_values(by="Importance", ascending=False)


feature_importance_df

In [None]:
## Horizontal bar chart of the feature table computed in the previous cell
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df["Feature"], feature_importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_title("Features Importance")
ax.invert_yaxis()  # the first row of the table (highest importance) is drawn at the top
plt.show()

<h1>Diabetes Prediction Model - Artificial Neural Network</h1>

In [None]:
%pip install tensorflow scikit-learn numpy pandas

In [None]:
import pandas as pd

df = pd.read_csv('diabetes_prediction_dataset.csv')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
## Counting missing values per column (a bare `df.isnull()` only shows a truncated
## True/False frame and does not reveal whether any value is missing).
df.isnull().sum()

In [None]:
df.info()

In [None]:
## OHE on the categorical (text) columns, gender & smoking_history.
## No `columns=` argument is given, so get_dummies picks the non-numeric columns itself;
## drop_first=True leaves out the first category of each as the implicit baseline.
df = pd.get_dummies(df, drop_first=True)

In [None]:
df.info()

In [None]:
X = df.drop("diabetes", axis=1)

In [None]:
y = df["diabetes"]

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [None]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

<h5>Building Neural Network</h5>

In [None]:
## Importing Neural Network libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
## Building Neural Network: a feed-forward stack of three fully connected (Dense) layers.
## NOTE: this rebinds the name `model`, which earlier in the notebook referred to the
## LogisticRegression estimator.
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)), # first layer; input_shape is set to the number of feature columns in X_train
    Dense(16, activation='relu'),  # second hidden layer
    Dense(1, activation='sigmoid')  # single sigmoid unit as output layer for the binary classification problem
])

In [None]:
## Compiling the model
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
## Training model for 30 epochs with batches of 32 rows; validation_split=0.2 holds out
## 20% of the training data, on which validation metrics are reported per epoch.
## The returned History object is kept so the learning curves can be plotted later.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=32, verbose=1)

<h5>Evaluating Model</h5>

In [None]:
## Evaluating model on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)

## The loss is the binary cross-entropy value the model was compiled with. It is not a
## ratio, so it is printed as a plain number; only the accuracy is a fraction that is
## meaningful as a percentage.
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%')

<h5>Making predictions on new data</h5>

In [None]:
# Six hand-written patient records (one tuple per patient) that are not read from the
# CSV file; the values of each tuple follow the order given in `columns`.
unseen_data = pd.DataFrame(
    [
        ("Male", 50, 0, 1, "former", 28.5, 6.3, 140),
        ("Female", 42, 1, 0, "never", 31.2, 5.9, 120),
        ("Male", 63, 1, 1, "current", 34.0, 7.2, 160),
        ("Female", 56, 0, 0, "former", 29.4, 5.7, 110),
        ("Male", 38, 0, 0, "never", 25.8, 5.6, 100),
        ("Female", 36, 1, 1, "never", 29.71, 10.3, 130),
    ],
    columns=[
        "gender",
        "age",
        "hypertension",
        "heart_disease",
        "smoking_history",
        "bmi",
        "HbA1c_level",
        "blood_glucose_level",
    ],
)

In [None]:
unseen_data.info()

In [None]:
## OHE of the text columns of the unseen rows, WITHOUT drop_first.
## drop_first would drop the first category present in *this* small frame
## ("current" for smoking_history), not the baseline that was dropped from the training
## data, so current smokers would be fed to the network as the training baseline.
## All dummy columns are kept here; the reindex against the training column list in the
## next cell removes the surplus ones and adds the missing ones as 0.
unseen_data = pd.get_dummies(unseen_data)

In [None]:
## Feature names in exactly the order the scaler and the network were fitted on.
## They are read from `X` (the frame the train/test split was made from) rather than
## typed by hand, so the list cannot drift out of sync with the training data.
columns = list(X.columns)

## Aligning unseen_data to that layout: training columns missing from the unseen rows
## are added and filled with 0, columns the model was not trained on are dropped.
unseen_data = unseen_data.reindex(columns=columns, fill_value=0)

In [None]:
unseen_data_scaled = scaler.transform(unseen_data[columns])

In [None]:
## making predictions
predictions = model.predict(unseen_data_scaled)

In [None]:
# Convert the predicted probabilities to binary labels using a 0.5 threshold.
# The network ends in a single output unit, so model.predict yields one row per patient
# with one column. The result is flattened to 1-D so that every element is a plain
# integer label; iterating the 2-D array would hand out one-element arrays and make
# later comparisons rely on the truth value of an array.
binary_predictions = (predictions > 0.5).astype(int).ravel()

In [None]:
## One readable line per unseen patient, numbered from 1
for patient_no, predicted_label in enumerate(binary_predictions, start=1):
    verdict = 'Diabetic' if predicted_label == 1 else 'Non-Diabetic'
    print(f"Patient {patient_no} is {verdict}")

In [None]:
import matplotlib.pyplot as plt

## Learning curves: accuracy per epoch on the training data and on the validation split
fig, ax = plt.subplots()
for history_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
ax.set_title('Model Accuracy')
plt.show()