# Diabetes Prediction

###  Dataset Link: https://www.kaggle.com/johndasilva/diabetes

# Load Dataset

In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
df = pd.read_csv('/kaggle/input/diabetes/diabetes.csv')

# Exploring Dataset

In [None]:
df.head()  #Returns first 5 rows of DataFrame, if parameter is passed i.e. head(n) it returns 'n' rows of DataFrame

In [None]:
df.shape   #Returns shape of DataFrame i.e. number of rows and columns  

We can observe that the dataset contains 2000 rows and 9 columns.

In [None]:
df.columns   #Returns the column name of DataFrame

'Outcome' is the column which we are going to predict, which says if the patient is diabetic or not. 1 means the person is diabetic and 0 means a person is not.

In [None]:
df.groupby('Outcome').size()

We can identify that out of 2000 persons, 1316 are labeled as 0 (non-diabetic) and 684 as 1 (diabetic).

In [None]:
df.dtypes     #Returns the data type of each column

In [None]:
df.info()    #Returns a concise summary of DataFrame

In [None]:
df.describe().T    #Returns basic statistics on all numeric columns

In [None]:
df.isna().any()    #Returns True if there is missing value, else False

We can observe that there are no data points missing in the dataset.

In [None]:
df = df.rename(columns={'BloodPressure':'BP', 'DiabetesPedigreeFunction':'DPF'})

In [None]:
df.head()

In [None]:
# Bar chart of the number of rows in each Outcome class.
ax = sns.countplot(data=df, x='Outcome')
ax.set_xlabel('Diabetic')
ax.set_ylabel('Count')
# Show readable names instead of the raw 0/1 class codes.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No', 'Yes'])
plt.show()

# Data Cleaning

In [None]:
df.hist(figsize=(15,15))
plt.show()

When analyzing the histograms we can identify that there are some outliers in some columns. Let's analyze those outliers further.

### Handling Outliers

**Blood Pressure (BP):** By observing the data we can see that there are 0 values for BP, and it is evident that these readings are wrong because a living person cannot have a diastolic BP of 0. Let's observe the data and find the count of 0 readings.

In [None]:
print('Total:',df[df['BP']==0].shape[0])

By observing the data we can see 90 counts where the value is 0.

In [None]:
df[df['BP']==0].groupby('Outcome')['Age'].count()

**Glucose Levels:** Even after fasting, glucose levels can't be zero. Therefore zero is an invalid reading.

In [None]:
print('Total:',df[df['Glucose']==0].shape[0])

By observing the data we can see 13 counts where the value is 0.

In [None]:
df[df['Glucose']==0].groupby('Outcome')['Age'].count()

**SkinThickness:** For normal people, skin fold thickness can't be less than 10 mm, let alone zero.

In [None]:
print('Total:',df[df['SkinThickness']==0].shape[0])

By observing the data we can see 573 counts where the value is 0.

In [None]:
df[df['SkinThickness']==0].groupby('Outcome')['Age'].count()

**BMI:** Shouldn't be zero or close to zero unless the person is really underweight, which could be life-threatening.

In [None]:
print('Total:', df[df['BMI']==0].shape[0])

By observing the data we can see 28 counts where the value is 0.

In [None]:
df[df['BMI']==0].groupby('Outcome')['Age'].count()

**Insulin:** In a rare situation a person can have zero insulin. Let's observe our dataset.

In [None]:
print('Total:', df[df['Insulin']==0].shape[0])

By observing the data we can find that there is a total of 956 counts.

In [None]:
df[df['Insulin']==0].groupby('Outcome')['Age'].count()

Let's replace all the 0s with NaN and then fill them with the mean/median values according to our observations.

In [None]:
# Columns in which a reading of 0 is treated as a missing measurement.
# 'DPF' is intentionally not listed: the imputation step that follows only
# fills these five columns, so converting DPF zeros to NaN would leave NaN
# values in the feature matrix used for training.
zero_as_missing = ['Glucose', 'BP', 'SkinThickness', 'Insulin', 'BMI']

# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
df.head()

In [None]:
# Fill the missing readings column by column: mean for Glucose and BP,
# median for SkinThickness, Insulin and BMI.
#
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True). That form operates on an intermediate
# Series (chained assignment): pandas 2.x emits a FutureWarning for it and
# under Copy-on-Write (pandas 3.0) it no longer updates df at all.
df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mean())
df['BP'] = df['BP'].fillna(df['BP'].mean())
df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].median())
df['Insulin'] = df['Insulin'].fillna(df['Insulin'].median())
df['BMI'] = df['BMI'].fillna(df['BMI'].median())

In [None]:
df.head()

In [None]:
df.hist(figsize=(15,15))
plt.show()

In [None]:
df.isnull().sum()

# Model Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
X = df.drop(columns=['Outcome'])

y = df['Outcome']

In [None]:
# Hold out 20% of the rows for testing, keeping the Outcome class ratio
# the same in both parts (stratified split) and fixing the seed.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = split

size_report = 'X_train size: {}, X_test size: {}'
print(size_report.format(X_train.shape, X_test.shape))

In [None]:
# Learn the scaling statistics from the training split only, then apply
# the same fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
def best_model(X, y):
    """Grid-search several classifiers and report the best setting of each.

    Each candidate estimator is tuned with ``GridSearchCV`` over its own
    parameter grid, using 5 shuffled 80/20 splits (``random_state=0``) and
    the estimator's default scorer.

    Parameters
    ----------
    X : feature matrix passed unchanged to ``GridSearchCV.fit``.
    y : target labels passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model`` (short name),
        ``best_parameters`` (``best_params_``) and ``score``
        (``best_score_``).

    Notes
    -----
    The tree-based estimators are created without ``random_state``, so their
    scores can differ slightly from run to run.
    """
    models = {
        'LR': {
            # multi_class is not passed: 'auto' was already the default and
            # the parameter is deprecated in recent scikit-learn releases.
            'model': LogisticRegression(solver='lbfgs'),
            'parameters': {
                'C': [1, 5, 10]
            }
        },

        'DT': {
            'model': DecisionTreeClassifier(splitter='best'),
            'parameters': {
                'criterion': ['gini', 'entropy'],
                'max_depth': [5, 10]
            }
        },

        'RF': {
            'model': RandomForestClassifier(criterion='gini'),
            'parameters': {
                'n_estimators': [10, 15, 20, 50, 100, 200]
            }
        },

        'KNN': {
            'model': KNeighborsClassifier(algorithm='auto'),
            'parameters': {
                'n_neighbors': [5, 10, 15, 20, 25],
                'weights': ['uniform', 'distance']
            }
        },

        'SVC': {
            'model': SVC(gamma='auto'),
            'parameters': {
                'C': [1, 10, 20],
                'kernel': ['rbf', 'linear']
            }
        },

        'GB': {
            'model': GradientBoostingClassifier(criterion='friedman_mse'),
            'parameters': {
                # 'log_loss' is the current name of the former 'deviance'
                # loss; 'deviance' was removed in scikit-learn 1.3.
                'loss': ['log_loss', 'exponential']
            }
        }

    }

    scores = []
    # The same shuffled splits are reused for every model so that the
    # reported scores are comparable.
    cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

    for model_name, model_params in models.items():
        gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=cv_shuffle, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': model_name,
            'best_parameters': gs.best_params_,
            'score': gs.best_score_
        })

    return pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

best_model(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the chosen random forest on the training split.
scores = cross_val_score(RandomForestClassifier(n_estimators=20, random_state=0), X_train, y_train, cv=5)
# ndigits=3 must be an argument of round(); previously it sat outside the
# call, so the average was rounded to a whole number and 3 was passed to
# format() as an unused extra argument.
print('Average Accuracy : {}%'.format(round(float(scores.mean()) * 100, 3)))

In [None]:
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Confusion matrix for test set
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
plt.figure(figsize=(10,7))
p = sns.heatmap(cm, annot=True, cmap="Purples", fmt='g')
plt.title('Confusion matrix for Random Forest Classifier Model - Test Set')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

In [None]:
# Convert to a percentage first and round last; rounding before multiplying
# by 100 can reintroduce float noise (e.g. 98.25000000000001).
score = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy on test set: {}%".format(score))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Creating a confusion matrix for training set
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred_train = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred_train)
cm

In [None]:
plt.figure(figsize=(10,7))
p = sns.heatmap(cm, annot=True, cmap="Purples", fmt='g')
plt.title('Confusion matrix for Random Forest Classifier Model - Train Set')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.show()

In [None]:
# Convert to a percentage first and round last; rounding before multiplying
# by 100 can reintroduce float noise (e.g. 98.25000000000001).
score = round(accuracy_score(y_train, y_pred_train) * 100, 2)
print("Accuracy on trainning set: {}%".format(score))

In [None]:
print(classification_report(y_train, y_pred_train))

# Model Predictions

In [None]:
def predict_diabetes(Pregnancies, Glucose, BP, SkinThickness, Insulin, BMI, DPF, Age):
    """Predict the Outcome class for a single patient.

    The inputs are coerced to numbers (``Pregnancies`` and ``Age`` to int,
    the rest to float), scaled with the module-level scaler ``ss`` and passed
    to the module-level ``classifier``.

    Returns whatever ``classifier.predict`` returns for the one-row input;
    callers read the prediction at index 0.
    """
    # Feature order matches the positional order of the parameters.
    features = [
        int(Pregnancies),
        float(Glucose),
        float(BP),
        float(SkinThickness),
        float(Insulin),
        float(BMI),
        float(DPF),
        int(Age),
    ]

    # The scaler expects a 2-D input, hence the single-row wrapper.
    scaled_row = ss.transform([features])
    return classifier.predict(scaled_row)

In [None]:
# Example patient 1: take the single prediction and report it.
prediction = predict_diabetes(4, 85, 79, 18, 72, 28.3, 0.498, 27)[0]
message = 'Sorry! You have diabetes.' if prediction else "Voila! You don't have diabetes."
print(message)

In [None]:
# Example patient 2: take the single prediction and report it.
prediction = predict_diabetes(6, 122, 95, 13, 88, 24.7, 0.491, 63)[0]
message = 'Sorry! You have diabetes.' if prediction else "Voila! You don't have diabetes."
print(message)

In [None]:
# Example patient 3: take the single prediction and report it.
prediction = predict_diabetes(1, 120, 90, 27, 149, 33.4, 0.393, 42)[0]
message = 'Sorry! You have diabetes.' if prediction else "Voila! You don't have diabetes."
print(message)