In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection  import train_test_split
from sklearn import metrics as MTR
from sklearn.naive_bayes import GaussianNB

In [None]:
# Read the diabetes CSV into a DataFrame, then print its (rows, columns) shape
# followed by its column index.
data_df = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
print(data_df.shape, data_df.columns, sep='\n')

In [None]:
# Print a concise summary of data_df (columns, non-null counts, dtypes, memory usage).
data_df.info()

In [None]:
# Display descriptive statistics for data_df; kept as the last expression so the
# notebook renders the resulting table.
data_df.describe()

In [None]:
# Handling missing values.
# In the columns below a value of 0 is treated as "missing": it is first turned
# into NaN so that it can be imputed afterwards.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in zero_as_missing:
    data_df[column] = data_df[column].replace(0, np.nan)

# Fill every NaN with the median of its own column. The medians are computed
# after the 0 -> NaN replacement, so the placeholder zeros do not influence them.
column_medians = data_df.median()
data_df.fillna(column_medians, inplace=True)

In [None]:
# check for outlier
# Draw one horizontal box plot per feature, each on its own panel of a 4x4 grid.
# The panel index comes from enumerate(), so no two features can end up on the
# same panel (previously BloodPressure and SkinThickness both used panel 3 and
# were drawn on top of each other).
outlier_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
plt.figure(figsize=(20,20))
for position, column in enumerate(outlier_columns, start=1):
    plt.subplot(4, 4, position)
    # Pass the series as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=data_df[column])

In [None]:
# Limiting outliers.
# No rows are dropped: each listed column is clipped to the interval between
# its own lower and upper quantile (values outside are set to the bound).
# Glucose is not listed and is therefore left unchanged.
clip_quantiles = {
    'Pregnancies': (0.05, 0.95),
    'BloodPressure': (0.05, 0.95),
    'SkinThickness': (0.07, 0.93),
    'Insulin': (0.21, 0.805),
    'BMI': (0.05, 0.95),
    'DiabetesPedigreeFunction': (0.05, 0.95),
    'Age': (0.05, 0.95),
}
for column, (low_q, high_q) in clip_quantiles.items():
    # Both bounds are taken from the column as it is before this assignment.
    data_df[column] = data_df[column].clip(
        lower=data_df[column].quantile(low_q),
        upper=data_df[column].quantile(high_q),
    )

In [None]:
# check for outlier
# Re-draw one horizontal box plot per feature, each on its own panel of a 4x4
# grid. The panel index comes from enumerate(), so no two features can end up
# on the same panel (previously BloodPressure and SkinThickness both used
# panel 3 and were drawn on top of each other).
outlier_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
plt.figure(figsize=(20,20))
for position, column in enumerate(outlier_columns, start=1):
    plt.subplot(4, 4, position)
    # Pass the series as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=data_df[column])

In [None]:
# create Correlation map
# Annotated heatmap of the pairwise correlations between all columns of data_df.
f, ax = plt.subplots(figsize=(20, 10))
corr = data_df.corr()
# The mask is all False, i.e. no cell of the matrix is hidden.
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax,annot=True)

In [None]:
# Pair plot analysis
# Grid of pairwise plots over data_df, coloured by the Outcome column, with
# kernel density estimates on the diagonal.
sns.pairplot(
    data=data_df,
    hue='Outcome',
    diag_kind='kde',
)

In [None]:
# Split Train, Validation and Test Data
# `Outcome` is the target; every other column is a feature.
target = data_df['Outcome']
features = data_df.drop(columns=['Outcome'])

# First hold out 20% of all rows as the test set ...
xTrain, xTest, yTrain, yTest = train_test_split(
    features, target, test_size=0.2, random_state=23
)
# ... then hold out 20% of the remaining rows as the validation set.
xTrain, xVal, yTrain, yVal = train_test_split(
    xTrain, yTrain, test_size=0.2, random_state=43
)

In [None]:
# Create a Gaussian Naive Bayes classifier and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = GaussianNB().fit(xTrain, yTrain)

# Predict the held-out test rows and report the share predicted correctly.
predicted = model.predict(xTest)
test_accuracy = MTR.accuracy_score(yTest, predicted)
print("Accuracy:", test_accuracy)