In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the diabetes dataset, then show the number of missing values per column.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
df.isna().sum()

**Checking for imbalanced dataset**

In [None]:
# Count the rows for each Outcome value to see how balanced the classes are.
df["Outcome"].value_counts()

**EDA**

In [None]:
# Draw a histogram for each numeric column of df on a single 30x30 inch figure.
df.hist(figsize=(30,30))

In [None]:
import seaborn as sns
# Distribution of Pregnancies: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot.
sns.histplot(x=df["Pregnancies"], color="darkred", bins=40, kde=True, stat="density")
# Pregnancies split by Outcome, drawn as violins on a separate figure.
sns.catplot(x="Outcome", y="Pregnancies", kind="violin", data=df)

**The graphs above show that most of the diabetic cases fall in the range of 0–5 pregnancies.**

In [None]:
# Distribution of Glucose: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot.
sns.histplot(x=df["Glucose"], color="darkred", bins=40, kde=True, stat="density")
# Glucose split by Outcome, drawn as violins on a separate figure.
sns.catplot(x="Outcome", y="Glucose", kind="violin", data=df)


**Glucose is the main factor for diabetes; the graphs above show the distribution of glucose levels for each outcome.**

In [None]:
# Distribution of SkinThickness: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot.
sns.histplot(x=df["SkinThickness"], color="darkred", bins=40, kde=True, stat="density")
# SkinThickness split by Outcome, drawn as violins on a separate figure.
sns.catplot(x="Outcome", y="SkinThickness", kind="violin", data=df)

In [None]:
# SkinThickness distribution on its own (histplot replaces the deprecated sns.distplot).
sns.histplot(x=df["SkinThickness"], color="darkred", bins=40, kde=True, stat="density")

In [None]:
# Insulin distribution: density histogram with a KDE overlay
# (histplot replaces the deprecated sns.distplot).
sns.histplot(x=df["Insulin"], kde=True, stat="density")

In [None]:
# Insulin split by Outcome, drawn as violins.
sns.catplot(data=df, kind="violin", x="Outcome", y="Insulin")

In [None]:
# Distribution of BMI: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot.
sns.histplot(x=df["BMI"], kde=True, stat="density")
# BMI split by Outcome, drawn as violins on a separate figure.
sns.catplot(x="Outcome", y="BMI", kind="violin", data=df)

**Diabetic cases occur at BMI values between roughly 20 and 55.**

In [None]:
# Distribution of DiabetesPedigreeFunction: density histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot.
sns.histplot(x=df["DiabetesPedigreeFunction"], kde=True, stat="density")
# DiabetesPedigreeFunction split by Outcome, drawn as violins on a separate figure.
sns.catplot(x="Outcome", y="DiabetesPedigreeFunction", kind="violin", data=df)

In [None]:
# Age distribution: density histogram with a KDE overlay
# (histplot replaces the deprecated sns.distplot).
sns.histplot(x=df["Age"], kde=True, stat="density")


In [None]:
# Age distribution as plain counts in 40 bins, without a KDE curve
# (histplot replaces the deprecated sns.distplot with kde=False).
sns.histplot(x=df["Age"], color="darkred", bins=40)

In [None]:
# Age split by Outcome, drawn as violins.
sns.catplot(data=df, kind="violin", x="Outcome", y="Age")

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
plt.figure(figsize=(30, 25))
sns.heatmap(data=df.corr(), annot=True, cmap="Dark2")
plt.show()

In [None]:
# Work on a copy so that the cleaning steps applied to df1 leave the raw df untouched.
df1=df.copy()

**Replacing 0 with NaN: a value of 0 carries no information in these columns, so zeros are treated as missing (null) values.**

In [None]:
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df1[zero_as_missing_cols] = df1[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Number of missing values per column of df1.
df1.isna().sum()

**Imputing null values with mean values.**

In [None]:
# Fill the missing values of each listed column with that column's own mean.
# Assigning the result back avoids `df1[col].fillna(..., inplace=True)`, which is
# chained in-place assignment: pandas 2.x warns about it and under Copy-on-Write
# it no longer updates df1 at all.
mean_imputed_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df1[mean_imputed_cols] = df1[mean_imputed_cols].fillna(df1[mean_imputed_cols].mean())

In [None]:
# Number of missing values per column of df1 after imputation.
df1.isna().sum()

**Checking for outliers in the data with the help of boxplots**

In [None]:
# One boxplot per column of df1, skipping the last column.
feature_names = df1.columns[:-1]
for col in feature_names:
    print(col)
    sns.boxplot(data=df1, x=df1[col])
    plt.show()

In [None]:
# Pairwise scatter plots of df1 coloured by Outcome, with histograms on the diagonal.
sns.pairplot(data=df1, diag_kind="hist", hue='Outcome');
plt.tight_layout()

**Capping outliers using the IQR rule**

In [None]:
def boxoutlier(var):
    """Cap outliers in every column except the last using the 1.5 * IQR rule.

    For each processed column, values above ``Q3 + 1.5 * IQR`` are replaced by
    that upper fence and values below ``Q1 - 1.5 * IQR`` by the lower fence.
    The last column is skipped (presumably the target label -- confirm
    against the caller's column order).

    Parameters
    ----------
    var : pandas.DataFrame
        Frame with numeric columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``var`` object; every processed column is stored as float.
    """
    for x in var.columns[:-1]:
        Q1 = var[x].quantile(0.25)
        Q3 = var[x].quantile(0.75)
        IQR = Q3 - Q1
        Lower = Q1 - (1.5 * IQR)
        Upper = Q3 + (1.5 * IQR)
        # The fences are floats (e.g. 13.5), so cast first: writing float values
        # into an integer column is an incompatible-dtype assignment that
        # recent pandas versions deprecate. clip() applies both fences at once.
        var[x] = var[x].astype(float).clip(lower=Lower, upper=Upper)

    return var
# Cap the outliers of df1 and keep the returned frame under the same name.
df1=boxoutlier(df1)

In [None]:
# Redraw one boxplot per column of df1 (last column skipped) after outlier capping.
feature_names = df1.columns[:-1]
for col in feature_names:
    print(col)
    sns.boxplot(data=df1, x=df1[col])
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Outcome is the label; every other column of df1 is a feature.
Y = df1["Outcome"]
X = df1.drop(columns=["Outcome"])
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only.
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
# Reuse the training statistics for the test split. Calling fit_transform here
# would refit the scaler on the test data, so train and test would be scaled
# differently and test information would leak into preprocessing.
X_test=sc.transform(X_test)

In [None]:
def models(X_train,Y_train):
    """Fit six classifiers on the training data and print each one's training score.

    The classifiers are logistic regression, a 10-tree entropy random forest,
    7-nearest-neighbours, and SVMs with linear, polynomial and RBF kernels.
    All of them are fitted first; afterwards each score on the same
    (X_train, Y_train) data is printed, so the printed numbers are training
    scores, not held-out scores.

    Returns a 6-tuple of the fitted estimators in the order
    (logistic regression, random forest, KNN, linear SVM, polynomial SVM, RBF SVM).
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC

    # (report label, unfitted estimator), listed in fit / report / return order.
    candidates = [
        ("[0]Logistic Regression Accuracy:", LogisticRegression()),
        ("[1]Random Forest:", RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=10)),
        ("[2]KNN:", KNeighborsClassifier(n_neighbors=7)),
        ("[3]Linear SVM:", SVC(kernel="linear", random_state=0)),
        ("[4]Polynomial SVM:", SVC(kernel="poly", random_state=0)),
        ("[5]RBF SVM:", SVC(kernel="rbf", random_state=0)),
    ]

    # Fit everything before reporting anything.
    fitted = [estimator.fit(X_train, Y_train) for _label, estimator in candidates]

    for (label, _estimator), trained in zip(candidates, fitted):
        print(label, trained.score(X_train, Y_train))

    return tuple(fitted)

model=models(X_train,Y_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Report test-set metrics for every fitted model, in the order returned by models().
for i, fitted_model in enumerate(model):
    # Predict once and reuse the result for both metrics instead of predicting twice.
    test_predictions = fitted_model.predict(X_test)
    print("Model",i)
    print(classification_report(Y_test,test_predictions))
    print(accuracy_score(Y_test, test_predictions))
    print()

In [None]:
# model[5] is the last estimator returned by models(), i.e. the RBF-kernel SVM.
pred=model[5].predict(X_test)
# Show the predicted Outcome label for each row of the test split.
print(pred)