# This notebook performs exploratory data analysis on the diabetes trends of the Pima Indian tribe and develops a model to predict diabetes outcomes

In [None]:
#importing relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Apply seaborn's "darkgrid" style to the plots drawn in this notebook.
sns.set_style(style="darkgrid")

In [None]:
# Read the diabetes CSV into a DataFrame, then print its column/dtype summary.
raw_data = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)
raw_data.info()

In [None]:
# Summary statistics of the raw data, used below to spot skewed columns.
raw_data.describe()

In [None]:
# Fraction of rows labelled 1 in the raw data (mean of a 0/1 column)
raw_data['Outcome'].mean()

Therefore, our data set has significantly more 0 outcomes (65%) than 1 outcomes (35%). Let us perform some exploratory data analysis to decide how to balance and pre-process the data.

In [None]:
# Quantile (0.8 = 80%) used below to derive the Age and Insulin cut-offs
# from the Outcome == 0 rows.
quantile_limit = 0.8

In [None]:
# Independent copy of the rows whose Outcome is 0, followed by their summary statistics.
no_diabetes = raw_data.loc[raw_data['Outcome'].eq(0)].copy()
no_diabetes.describe()

In [None]:
# The summary statistics above suggest many outliers in Insulin; plot its
# distribution for the Outcome == 0 rows to confirm.
# sns.distplot is deprecated, so use histplot with a KDE overlay on a density
# scale, which draws the equivalent histogram + density curve.
sns.histplot(no_diabetes['Insulin'], kde=True, stat="density")

In [None]:
# Insulin value at the `quantile_limit` quantile of the Outcome == 0 rows.
# Named `quantile_insulin` (the original spelling `quantile_insuline` was a typo
# that was never read again); the value is recomputed after the age filter below.
quantile_insulin = no_diabetes['Insulin'].quantile(quantile_limit)
quantile_insulin

In [None]:
# The summary statistics above also suggest many outliers in Age; plot its
# distribution for the Outcome == 0 rows to confirm.
# sns.distplot is deprecated, so use histplot with a KDE overlay on a density
# scale, which draws the equivalent histogram + density curve.
sns.histplot(no_diabetes['Age'], kde=True, stat="density")

In [None]:
# Age at the `quantile_limit` quantile of the Outcome == 0 rows
quantile_age = no_diabetes['Age'].quantile(q=quantile_limit)
quantile_age

Therefore we can conclude that `quantile_limit × 100`% of the non-diabetic (Outcome = 0) rows have an age at or below *quantile_age*, and the same share have an insulin quantity at or below the insulin quantile computed above. We can therefore try discarding the Outcome = 0 rows that fall outside these limits.

In [None]:
# Work on an independent copy so raw_data stays untouched by the filtering below.
data_norm = raw_data.copy(deep=True)

In [None]:
# Keep every Outcome == 1 row; keep an Outcome == 0 row only when its Age
# does not exceed quantile_age.
data_norm = data_norm.loc[
    lambda frame: frame['Outcome'].eq(1) | frame['Age'].le(quantile_age)
]

In [None]:
# Summary statistics of the working copy, for comparison with the raw data.
data_norm.describe()

In [None]:
# Fraction of rows labelled 1 after the age filter (mean of a 0/1 column)
data_norm['Outcome'].mean()

Therefore we can see that the balance between 1s and 0s has improved slightly. Exploring further...

In [None]:
# Re-select the Outcome == 0 rows from the age-filtered frame.
# The mask is built from data_norm itself so its index matches the frame being
# indexed; a mask taken from raw_data only works through implicit reindexing
# and triggers a "Boolean Series key will be reindexed" warning.
no_diabetes = data_norm[data_norm['Outcome'] == 0].copy()
no_diabetes.describe()

In [None]:
# Insulin value at the `quantile_limit` quantile of the current no_diabetes rows
quantile_insulin = no_diabetes['Insulin'].quantile(q=quantile_limit)
quantile_insulin

In [None]:
# Keep every Outcome == 1 row; keep an Outcome == 0 row only when its Insulin
# does not exceed quantile_insulin.
data_norm = data_norm.loc[
    lambda frame: frame['Outcome'].eq(1) | frame['Insulin'].le(quantile_insulin)
]
# Fraction of rows labelled 1 after both filters (mean of a 0/1 column)
data_norm['Outcome'].mean()

Therefore, after removing class-0 outliers in Insulin and Age, we get a more balanced
distribution of roughly 55% (class 0) to 45% (class 1) in the data used below.

In [None]:
# Independent copy of the filtered data, used as the modelling input.
data_pp = data_norm.copy(deep=True)

In [None]:
# Column/dtype summary of the data that goes into the model.
data_pp.info()

In [None]:
# Summary statistics of the data that goes into the model.
data_pp.describe()

On to our model now.

In [None]:
# Features: every column except the Outcome label (still unscaled)
unscaled_X = data_pp.drop(columns='Outcome')
# Target: the Outcome label column
y = data_pp['Outcome']

In [None]:
# Preview the first rows of the unscaled feature frame.
unscaled_X.head()

Scaling the input

In [None]:
# StandardScaler with default settings, used to standardise the feature columns.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the full feature frame and show the standardised array.
X = scaler.fit_transform(X=unscaled_X)
X

Splitting of train-test data and creating our model

In [None]:
# Split the *unscaled* features first so the scaler never sees the test rows.
# The row assignment depends only on the row count and random_state, so the
# same rows land in train/test as when splitting the pre-scaled array.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    unscaled_X, y, test_size=0.33, random_state=108
)
# Fit the scaler on the training rows only, then reuse those training-set
# statistics for the test rows; fitting on all rows would leak test-set
# mean/variance into the model inputs.
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

In [None]:
# Instantiate a logistic-regression classifier with scikit-learn's default settings
pima_diabetes_model = LogisticRegression()

In [None]:
# Train the classifier on the training split
pima_diabetes_model.fit(X=X_train, y=y_train)

Evaluating our model

In [None]:
# Mean accuracy on the rows the model was trained on
pima_diabetes_model.score(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the held-out test rows
pima_diabetes_model.score(X=X_test, y=y_test)

In [None]:
# Predicted labels for the test rows, needed for the per-class report
y_pred = pima_diabetes_model.predict(X=X_test)

In [None]:
# Per-class precision, recall and F1 on the test rows
print(classification_report(y_true=y_test, y_pred=y_pred))

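Therefore, our model has a decent accuracy score of 88%, with higher precision on the diabetic class (1) and higher recall on the non-diabetic class (0). Note that this score is measured on data from which class-0 rows were removed using the Outcome label itself, before the train-test split, so it is likely optimistic compared with performance on unfiltered data.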