In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Basic libraries
import numpy as np
import pandas as pd
import seaborn as sns
import random 
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# Library to split data
from sklearn.model_selection import train_test_split

# To build model for prediction
from sklearn.linear_model import LogisticRegression

# To get different metric scores
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
    precision_recall_curve,
    roc_curve,
    make_scorer,
    classification_report
)

# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# To tune different models
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the exact dataset path can be copied into the read_csv call.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory and preview the first rows
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Check the layout of the loaded data: columns, dtypes and non-null counts
df.info()

In [None]:
# Shape of the data as (number of rows, number of columns)

df.shape

The dataset has 768 records and 9 columns.

In [None]:
# Count the null values in each column
df.isnull().sum()

There are no null values in the file.

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

There are no duplicate rows in the file.

In [None]:
# Summary statistics per column, transposed so that each column becomes a row
df.describe().T

* All the parameters - Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI and DiabetesPedigreeFunction - have numeric values and the data looks valid. Some of the columns are skewed.
* Patients' ages range from 21 through 80. Most of them are 29 years old.

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Count, per column, how many entries are below zero
(df<0).sum()

None of the columns have negative values.

# **************************
# EDA
# ***********************

### Univariate Analysis

In [None]:
#For Univariate analysis, let's write a function to combine boxplot and histplot in same space one below other 
#so that we could visualize outliers & distributions

def hist_box_plot(feature, figsize=(10, 6), bins=None):
    """Draw a boxplot stacked above a histogram (with KDE) for one numeric feature.

    Both panels share the x-axis so outliers shown by the boxplot line up with
    the tails of the distribution. The boxplot marks the mean, and the histogram
    gets vertical lines for the mean and the median.

    feature: 1-d numeric data; must provide .mean() and .median()
             (e.g. a pandas Series such as df['Age'])
    figsize: size of the figure (default (10, 6))
    bins: number of histogram bins (default None lets seaborn choose automatically)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # x-axis is shared between the two panels
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # Pass the values as x= explicitly: newer seaborn releases treat the first
    # positional argument as `data`, which does not guarantee a horizontal box
    # aligned with the shared x-axis. The marker shows the mean.
    sns.boxplot(x=feature, ax=ax_box2, showmeans=True, color="orange")

    # histplot replaces the deprecated distplot; stat="density" keeps the
    # normalised y-axis that distplot produced when a KDE curve was drawn.
    sns.histplot(
        x=feature,
        kde=True,
        stat="density",
        bins="auto" if bins is None else bins,
        ax=ax_hist2,
        color="indigo",
    )

    ax_hist2.axvline(feature.mean(), color="green", linestyle="--", label="Mean")  # mean marker
    ax_hist2.axvline(feature.median(), color="red", linestyle="-", label="Median")  # median marker

    # Attach the legend to the histogram axes explicitly instead of relying on
    # whichever axes pyplot currently considers active.
    ax_hist2.legend()

In [None]:
# Pie chart
def pie_plot(data, z):
    """Plot a pie chart of the value counts of column `z` in `data`.

    Each wedge is annotated with its percentage, the second wedge is pulled out
    slightly for emphasis, and a legend is shown.

    data: DataFrame-like object supporting data[z].value_counts()
    z: name of the column to plot; also used as the chart title
    """
    plt.figure(figsize=(6, 5))
    pie_data = data[z].value_counts()  # wedge sizes, ordered by frequency
    colors = ['skyblue', 'lightpink', 'gold', 'blue', 'lavender']

    # Take the labels from the same Series that supplies the wedge sizes.
    # data[z].unique() is ordered by first appearance, not by frequency, so it
    # can attach the wrong label to a wedge.
    labl_text = [str(label) for label in pie_data.index]

    # Pull out the second wedge only; sized to the actual number of categories
    # so columns with any number of distinct values can be plotted.
    explode = [0.1 if position == 1 else 0 for position in range(len(pie_data))]

    plt.title(z, fontweight='bold', fontsize=15)  # chart title
    plt.pie(pie_data,
            labels=labl_text,
            explode=explode,
            autopct='%1.2f%%',    # display % on each wedge
            shadow=True,          # draw a shadow beneath the pie
            frame=True,           # draw the axes frame around the pie
            colors=colors,
            wedgeprops={'linewidth': 1.5, 'edgecolor': 'white'},
            textprops={'size': 'x-large'}
            )
    plt.tight_layout()
    plt.legend()
    plt.show()


In [None]:
# List the column names available for plotting
df.columns

In [None]:
# Boxplot + histogram for every numeric predictor, in column order
numeric_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
for feature_name in numeric_features:
    hist_box_plot(df[feature_name])

# Class balance of the target
pie_plot(df, 'Outcome')

### Bivariate Analysis

#### Pairplot

In [None]:
# Pairwise scatter plots of all columns, coloured by the target class
sns.pairplot(
    data=df,
    hue='Outcome',
    palette='rocket_r',
)
plt.show()

#### Heatmap and correlation

In [None]:
# Correlation of the numeric variables, computed once and reused below
corr_matrix = df.corr()

# Custom diverging palette: negative values in red, positive values in blue
cmap = sns.diverging_palette(10, 255, as_cmap=True)

# Annotated heatmap on a fixed [-1, 1] colour scale
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, annot=True, cmap=cmap, vmin=-1, vmax=1)
plt.title('Heatmap', fontweight='bold')

# Display the correlation matrix itself as the cell output
corr_matrix

In [None]:
# Treat the target as a categorical variable rather than a number
outcome_as_category = df['Outcome'].astype('category')
df['Outcome'] = outcome_as_category

# Confirm the new dtype
df.info()

In [None]:
# Count of records per Age value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Age', hue='Outcome', palette='rocket', dodge=False)
plt.title('Age vs Outcome')
plt.show()

In [None]:
# Count of records per Pregnancies value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Pregnancies', hue='Outcome', palette='rocket', dodge=False)
plt.title('Pregnancies vs Outcome')
plt.show()

In [None]:
# Count of records per Glucose value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Glucose', hue='Outcome', palette='rocket', dodge=False)
plt.title('Glucose vs Outcome')
plt.show()

In [None]:
# Count of records per BloodPressure value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='BloodPressure', hue='Outcome', palette='rocket', dodge=False)
plt.title('BloodPressure vs Outcome')
plt.show()

In [None]:
# Count of records per SkinThickness value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='SkinThickness', hue='Outcome', palette='rocket', dodge=False)
plt.title('SkinThickness vs Outcome')
plt.show()


In [None]:
# Count of records per Insulin value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Insulin', hue='Outcome', palette='rocket', dodge=False)
plt.title('Insulin vs Outcome')
plt.show()


In [None]:
# Count of records per DiabetesPedigreeFunction value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='DiabetesPedigreeFunction', hue='Outcome', palette='rocket', dodge=False)
plt.title('DiabetesPedigreeFunction vs Outcome')
plt.show()


In [None]:
# Count of records per BMI value, split by Outcome (bars overlaid, not dodged)
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='BMI', hue='Outcome', palette='rocket', dodge=False)
plt.title('BMI vs Outcome')
plt.show()
