In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the raw diabetes records from the CSV that sits next to this notebook,
# then preview the first five rows.
db = pd.read_csv(filepath_or_buffer="./diabetes(1).csv")
db.head(n=5)

In [None]:
# Remove exact duplicate rows (the first occurrence is kept) and renumber the index from 0.
db = db.drop_duplicates().reset_index(drop=True)
# Per-column flag: True where at least one NaN is present.
db.isnull().any(axis=0)

In [None]:
# Class balance of the target: number of rows per Outcome value.
counts = db['Outcome'].value_counts()
print(counts)

# Label the axes on the returned Axes object instead of passing `xlabel=` to
# .plot(): with kind='barh' the axis that `xlabel` ends up on has differed
# between pandas releases, so explicit setters keep the figure the same
# everywhere. This also avoids depending on pyplot's implicit "current axes".
outcome_ax = counts.sort_index().plot(kind='barh')
outcome_ax.set_xlabel("Counts")
outcome_ax.set_ylabel("Outcome")
outcome_ax.set_title("Bar Plot of Outcome Variable Counts")

In [None]:
# Checking unique values: print the distinct values of every column so that
# implausible entries are easy to spot.
variables = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
for column_name in variables:
    print(pd.unique(db[column_name]))

In [None]:
# Checking 0 values per predictor: count how many entries in each column are exactly 0.
variables = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
             'DiabetesPedigreeFunction', 'Age']

# Boolean frame (value == 0) summed column-wise gives the per-column zero count.
zero_counts = db[variables].eq(0).sum()
for column_name, n_zeros in zero_counts.items():
    print(column_name, n_zeros)

In [None]:
# Replacing the missing values with the mean.
# In these columns a 0 is treated as a missing/incorrect reading, so each 0 is
# replaced by the mean of the values that were actually observed.
variables = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for i in variables:
    # Turn the placeholder zeros into NaN first: Series.mean() skips NaN, so the
    # zeros no longer drag the imputed value down.
    observed = db[i].replace(0, np.nan)
    # Assign the column back explicitly. Calling replace(..., inplace=True) on
    # db[i] is chained assignment: recent pandas warns about it and, with
    # Copy-on-Write enabled, the change never reaches db.
    db[i] = observed.fillna(observed.mean())

# Checking to make sure that the incorrect values were replaced: every count
# printed here should be 0.
remaining_zeros = db[variables].eq(0).sum()
for i in variables:
    print(i, remaining_zeros[i])

In [None]:
# Missing values: list each column's dtype and non-null count.
db.info()

In [None]:
# Checking descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
db.describe()

In [None]:
# Preview the first rows of db again, after the zero-replacement cell above.
db.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Swarm plot of Age for each Outcome class.
sns.catplot(data=db, kind="swarm", x="Outcome", y="Age")

People aged 20–30 appear to be the least susceptible to diabetes in this dataset.

In [None]:
# Pregnancies by Outcome, drawn side by side: box plot on the left, violin plot on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
box_ax, violin_ax = ax
sns.boxplot(data=db, x='Outcome', y='Pregnancies', color='red', ax=box_ax)
sns.violinplot(data=db, x='Outcome', y='Pregnancies', ax=violin_ax)

In [None]:
# Distribution of Glucose within each Outcome class.
glucose_ax = sns.boxplot(data=db, x='Outcome', y='Glucose')
glucose_ax.set_title('Glucose vs Diabetes')

### Model 1: Neural Network

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Modelling subset: the target column plus the five predictors used by the cells below.
db_nn = db.loc[:, ['Outcome', 'BMI', 'DiabetesPedigreeFunction',
                   'Glucose', 'Age', 'BloodPressure']]

In [None]:
# Separate the target from the predictors, then hold out 10% of the rows for testing.
y = db_nn['Outcome']
x = db_nn.drop(columns='Outcome')
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.1,
    random_state=1,
)

In [None]:
# An MLP trained with SGD is sensitive to feature scale, and the predictors here
# live on very different ranges, so rescale them to [0, 1] before fitting.
# The scaler is fitted on the training split only, so the held-out rows do not
# influence the scaling; xtrain / xtest themselves are left untouched.
nn_scaler = MinMaxScaler(feature_range = (0, 1))
xtrain_nn = nn_scaler.fit_transform(xtrain)
xtest_nn = nn_scaler.transform(xtest)

# Two hidden layers (20 and 10 units), ReLU activations, mini-batch SGD.
mlp = MLPClassifier(hidden_layer_sizes = (20, 10),
                    learning_rate_init = 0.001,
                    max_iter = 1000,
                    activation = 'relu',
                    solver = 'sgd',
                    batch_size = 50,
                    random_state = 1)

mlp.fit(xtrain_nn, ytrain)
# Predictions for the held-out rows, using the same scaling as the training data.
pred = mlp.predict(xtest_nn)

In [None]:
# Per-class precision / recall / F1 on the held-out split, reported to 4 decimals.
report = classification_report(
    ytest,
    pred,
    target_names = ["0", "1"],
    digits = 4,
    zero_division = 1,
)
print(report)

In [None]:
# Confusion matrix of the current predictions, drawn as an annotated heatmap
# (rows: actual class, columns: predicted class).
cm = confusion_matrix(ytest, pred)
cm_ax = sns.heatmap(cm, annot = True, fmt = "d", cmap = "Blues")
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Training loss recorded by the fitted MLP in `loss_curve_`.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(mlp.loss_curve_)
loss_ax.set_title("MLP Training Loss over Epochs")
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Training Loss')
plt.show()

### Model 2: Random Forest

In [None]:
# Predictors / target for the remaining models.
x = db_nn.drop('Outcome', axis = 1)
y = db_nn['Outcome']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the min/max used for the training data (data leakage).
xtrain_raw, xtest_raw, ytrain, ytest = train_test_split(x, y, test_size = 0.1, random_state = 1)

# Learn the [0, 1] scaling from the training rows only, then apply the same
# transform to the test rows (test values may therefore fall slightly outside [0, 1]).
scaler = MinMaxScaler(feature_range = (0, 1))
xtrain = pd.DataFrame(data = scaler.fit_transform(xtrain_raw),
                      columns = x.columns, index = xtrain_raw.index)
xtest = pd.DataFrame(data = scaler.transform(xtest_raw),
                     columns = x.columns, index = xtest_raw.index)

In [None]:
# Random forest: 200 trees split on entropy; a node needs at least 18 samples to be split.
rf_clf = RandomForestClassifier(
    n_estimators = 200,
    criterion = 'entropy',
    min_samples_split = 18,
    random_state = 1,
)
rf_clf.fit(xtrain, ytrain)

In [None]:
# Predict the class of every held-out row with the fitted random forest.
pred = rf_clf.predict(xtest)

In [None]:
# Hold-out accuracy, plus train/test mean squared error of the predicted labels.
# NOTE(review): assuming Outcome is coded 0/1, the MSE of hard labels is simply
# the misclassification rate (1 - accuracy) — confirm before reporting it as "MSE".
test_mse = mean_squared_error(ytest, pred)
train_mse = mean_squared_error(ytrain, rf_clf.predict(xtrain))
accuracy = accuracy_score(ytest, pred)

print(f"Accuracy: {accuracy:.4f}")
print(f'Training MSE: {round(train_mse, 4)}')
print(f'Testing MSE: {round(test_mse, 4)}')

In [None]:
# Per-class precision / recall / F1 for the current predictions, to 4 decimals.
report_options = dict(digits = 4, target_names = ["0", "1"], zero_division = 1)
print(classification_report(ytest, pred, **report_options))

In [None]:
# Annotated confusion-matrix heatmap for the current predictions.
cm = confusion_matrix(ytest, pred)
cm_ax = sns.heatmap(cm, annot = True, fmt = "d", cmap = "Blues")
cm_ax.set(title = 'Confusion Matrix', xlabel = 'Predicted', ylabel = 'Actual')
plt.show()

### Model 3: XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted trees: 90 boosting rounds, depth-3 trees, learning rate 0.35.
# `random_state` is the documented seed parameter of the scikit-learn wrapper;
# it replaces the legacy `seed` keyword used previously.
xgb_clf = xgb.XGBClassifier(n_estimators = 90, learning_rate = 0.35,
                            max_depth = 3, random_state = 1)
xgb_clf.fit(xtrain, ytrain)

In [None]:
# Predict the held-out rows with the boosted model, then print the first ten
# test rows followed by the full array of predicted labels.
pred = xgb_clf.predict(xtest)
print(xtest.iloc[:10])
print(pred)

In [None]:
# Train/test mean squared error of the boosted model's predicted labels.
train_mse = mean_squared_error(ytrain, xgb_clf.predict(xtrain))
test_mse = mean_squared_error(ytest, pred)
print(f'Training MSE: {round(train_mse, 4)}')
print(f'Testing MSE: {round(test_mse, 4)}')

In [None]:
# Per-class precision / recall / F1 with readable class names, to 4 decimals.
class_names = ["No Diabetes", "Diabetes"]
xgb_report = classification_report(ytest, pred,
                                   target_names = class_names,
                                   digits = 4,
                                   zero_division = 1)
print(xgb_report)

In [None]:
# Confusion matrix and accuracy for the current predictions.
cm = confusion_matrix(ytest, pred)
acc = accuracy_score(ytest, pred)

# Draw the matrix on an explicit Axes, then show it before printing the score.
cm_fig, cm_ax = plt.subplots()
sns.heatmap(cm, annot = True, fmt = "d", cmap = "Blues", ax = cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

print(f"Accuracy: {acc:.4f}")

### Model 4: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic-regression classifier on the training split (fit returns the
# estimator itself), then predict the held-out rows.
model = LogisticRegression(random_state=42).fit(xtrain, ytrain)
y_pred = model.predict(xtest)

In [None]:
# Hold-out metrics for the logistic-regression predictions.
conf_matrix = confusion_matrix(ytest, y_pred)
class_report = classification_report(ytest, y_pred)
accuracy = accuracy_score(ytest, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(conf_matrix)
# Header and report on consecutive lines, exactly as two separate prints would emit them.
print('Classification Report:', class_report, sep='\n')

In [None]:
# Annotated confusion-matrix heatmap for the logistic regression, with
# explicit tick labels for the predicted and actual classes.
lr_fig, lr_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=lr_ax,
)
lr_ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()