# Diabetes Classification 



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# The Data

In [None]:
# Load the diabetes CSV from the Kaggle input directory into df and preview the first rows.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
df.head()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## Initial findings:
- There are missing values, recorded as 0, in the 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI' columns
- There are outliers in Glucose, BloodPressure, BMI, and DiabetesPedigreeFunction

In [None]:
# A 0 in these columns is treated as a missing reading, so it is converted
# to NaN and shows up in the missing-value counts.
col = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Vectorised replacement; np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
df[col] = df[col].replace(0, np.nan)
df.head()

In [None]:
# Count the NaN entries in every column.
print('Missing Values')
df.isna().sum()

# Handling Missing Values

In [None]:
# Remove the Insulin and SkinThickness columns from df in place rather than
# imputing them, then preview the remaining columns.
df.drop(columns=['Insulin', 'SkinThickness'], inplace=True)
df.head()

In [None]:
cols = ['BMI', 'Glucose', 'BloodPressure']
# Impute each column with ITS OWN mean. fillna() with a Series aligns on
# column labels; calling df.fillna(scalar) would instead write one column's
# mean into every NaN in the whole frame.
df[cols] = df[cols].fillna(df[cols].mean())

In [None]:
# Count the NaN entries remaining in each column.
df.isna().sum()


In [None]:
# Pairwise correlation between the columns of df (pandas default method).
df.corr()

In [None]:
sns.set_palette('magma')
# Build the correlation matrix once and reuse it for both the mask and the plot.
corr_matrix = df.corr()
# np.triu keeps the diagonal and everything above it; non-zero entries there
# act as a truthy mask, so only the lower triangle is drawn.
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(15, 7))
sns.heatmap(corr_matrix, mask=upper_triangle, annot=True, center=0)

In [None]:
# List the column labels currently in df.
df.columns

In [None]:
colors = sns.color_palette('tab10')
# Plot every column except the target.
cols = df.columns.tolist()
cols.remove('Outcome')
fig, axs = plt.subplots(nrows=2, ncols=len(cols), figsize=(20, 10))
# Top row: one box plot per feature; bottom row: its histogram with a KDE overlay.
for idx, feature in enumerate(cols):
    shade = colors[idx]
    sns.boxplot(data=df, x=feature, color=shade, ax=axs[0, idx])
    sns.histplot(
        data=df,
        x=feature,
        color=shade,
        kde=True,
        multiple='dodge',
        ax=axs[1, idx],
    )


# Handling Outliers

In [None]:
# Cap Pregnancies at its 99th percentile.
uv = df['Pregnancies'].quantile(0.99)
# Assign the clipped column back to df: chained indexing
# (df[col][mask] = value) may write to a temporary copy and is a no-op
# under pandas Copy-on-Write.
df['Pregnancies'] = df['Pregnancies'].clip(upper=uv)

In [None]:
# Cap BloodPressure to the [1st, 98th] percentile range.
uv = df.BloodPressure.quantile(0.98)
lv = df.BloodPressure.quantile(0.01)
# Assign the clipped column back to df: chained indexing
# (df[col][mask] = value) may write to a temporary copy and is a no-op
# under pandas Copy-on-Write.
df['BloodPressure'] = df['BloodPressure'].clip(lower=lv, upper=uv)

In [None]:
# Cap BMI at its 97th percentile.
uv = df.BMI.quantile(0.97)
# Assign the clipped column back to df: chained indexing
# (df.BMI[mask] = value) may write to a temporary copy and is a no-op
# under pandas Copy-on-Write.
df['BMI'] = df['BMI'].clip(upper=uv)


In [None]:
# Cap DiabetesPedigreeFunction at its 96th percentile.
uv = df.DiabetesPedigreeFunction.quantile(0.96)
# Assign the clipped column back to df: chained indexing
# (df.col[mask] = value) may write to a temporary copy and is a no-op
# under pandas Copy-on-Write.
df['DiabetesPedigreeFunction'] = df['DiabetesPedigreeFunction'].clip(upper=uv)

In [None]:
sns.set_style('white')
colors = sns.color_palette('Set3')
# Plot every column except the target.
cols = df.columns.tolist()
cols.remove('Outcome')

fig, axs = plt.subplots(nrows=2, ncols=len(cols), figsize=(20, 10))
# First row of axes holds the box plots, second row the histograms with KDE.
box_row, hist_row = axs
for idx, feature in enumerate(cols):
    tint = colors[idx]
    sns.boxplot(data=df, x=feature, color=tint, ax=box_row[idx])
    sns.histplot(data=df, x=feature, color=tint, kde=True, ax=hist_row[idx])


In [None]:
sns.set_palette('Set2')
sns.set_style('whitegrid')
# One subplot row per feature in `cols`; the row count is derived from the
# list instead of being hard-coded, so the grid always matches the loop below.
fig, axs = plt.subplots(nrows=len(cols), figsize=(15, 25))
# plt.subplots returns a bare Axes (not an array) when there is a single row.
axs = np.atleast_1d(axs)
for i, col in enumerate(cols):
    # Side-by-side bars per Outcome class for each feature.
    sns.histplot(data=df, x=col, hue='Outcome', ax=axs[i], multiple='dodge')
fig.tight_layout()


In [None]:
sns.set_palette('Set3')
sns.set_style('dark')
# Grid of pairwise plots for the columns of df, coloured by the Outcome class.
sns.pairplot(df, hue = 'Outcome')

# Model Selection

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the target 'Outcome' is used as a feature.
cols = df.columns.tolist()
cols.remove('Outcome')
X, y = df[cols], df['Outcome']
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# (available since 1.0) is its replacement.
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
sns.set_style('white')
sns.set_palette('tab10')


def model_accuracy(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and report how it does on the test split.

    Prints a classification report for the test predictions and draws the
    matching confusion matrix.

    Args:
        model: An unfitted (or re-fittable) estimator with ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(classification_report(y_test, pred))
    # Plot the confusion matrix for the same held-out predictions the report
    # is based on (a matrix on the training data would overstate performance).
    ConfusionMatrixDisplay.from_predictions(y_test, pred)


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): accuracy_score is imported here but not used in the visible code.
from sklearn.metrics import accuracy_score
# max_iter=180 sets the solver's iteration cap; model_accuracy fits and evaluates the model.
model1 = LogisticRegression(max_iter=180)
model_accuracy(model1, X_train, X_test, y_train, y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest so the reported metrics are reproducible.
model2 = RandomForestClassifier(random_state=2)
# model_accuracy fits the model itself, so no separate fit() call is needed.
model_accuracy(model2, X_train, X_test, y_train, y_test)


## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
model3 = GaussianNB()
# model_accuracy fits the model itself, so no separate fit() call is needed.
model_accuracy(model3, X_train, X_test, y_train, y_test)


# Model building and predictions

In [None]:
def diabetes_prediction(model):
    """Prompt for one patient's measurements and print the model's prediction.

    Six values are read from standard input and passed to ``model.predict``
    as a single row in this order: pregnancies, glucose, blood pressure,
    BMI, diabetes pedigree function, age.

    Args:
        model: A fitted classifier exposing ``predict``; it must have been
            trained on these six features in this order.

    Raises:
        ValueError: If an entered value cannot be parsed as a number.
    """
    preg = int(input('No. of pregnancies: '))
    # Glucose and blood pressure are parsed as floats so that decimal
    # readings are accepted; whole numbers still work as before.
    glu = float(input('Plasma glucose concentration a 2 hours in an oral glucose tolerance test: '))
    b_pres = float(input('Diastolic blood pressure (mm Hg): '))
    bmi = float(input('Body mass index (weight in kg/(height in m)^2): '))
    pedi = float(input('Diabetes Pedigree Function: '))
    age = int(input('Age: '))

    values = [[preg, glu, b_pres, bmi, pedi, age]]
    # Estimators fitted on a DataFrame record its column names in
    # feature_names_in_; passing the same names back avoids the
    # "X does not have valid feature names" warning at predict time.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None:
        values = pd.DataFrame(values, columns=list(feature_names))

    pred = model.predict(values)
    print('\nPrediction: ')
    for i in pred:
        if i == 0:
            print('Not Diabetic')
        elif i == 1:
            print('Diabetic')
            
# Final model: a random forest fitted on the full dataset (X, y) rather than only the training split.
# NOTE(review): no random_state is set here, so repeated runs can build different forests.
Rand_forest = RandomForestClassifier()
Rand_forest.fit(X,y)
# Uncomment to run an interactive prediction with the fitted forest.
#diabetes_prediction(Rand_forest)