In [None]:
!pip install datasist #It's not pre-installed on Kaggle

# 1. Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #For customizing seaborn plots
import seaborn as sns #For plotting and data visualization
from sklearn.preprocessing import StandardScaler 
import datasist as ds #To detect outliers 
import warnings # Used to silence warning output for the rest of the notebook
# Called with only the 'ignore' action (no category or module filter), so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and convergence warnings from the
# plotting and modelling libraries used below.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of each file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Global seaborn look for every plot below:
# 10x10 figure size, fonts scaled by 1.2, and the viridis colour palette.
figure_rc = {'figure.figsize': (10, 10)}
sns.set(rc=figure_rc, font_scale=1.2)
sns.set_palette('viridis')

In [None]:
# Load the diabetes CSV from the attached Kaggle dataset and display the frame.
csv_path = '../input/diabites-dataset/diabetes.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Ask datasist which rows contain outliers in the listed columns.
# NOTE(review): the second argument (0) is datasist's per-row outlier-count
# threshold; presumably any row with at least one flagged value is returned.
# Confirm against the datasist documentation.
outlier_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
outliers = ds.structdata.detect_outliers(df, 0, outlier_columns)
outliers

# Starting Visualization

In [None]:
# Grid of pairwise plots covering every pair of numeric columns in df.
sns.pairplot(df)

In [None]:
# Heatmap of the column-to-column correlation matrix; annot=True writes each
# coefficient inside its cell.
sns.heatmap(df.corr(),annot=True)

In [None]:
# Bar chart of how many rows fall into each Outcome class.
# The column is passed as x= because recent seaborn releases accept only `data`
# as a positional argument.
sns.countplot(x=df['Outcome'])

In [None]:
# Bar chart of how many rows there are for each number of pregnancies.
# The column is passed as x= because recent seaborn releases accept only `data`
# as a positional argument.
sns.countplot(x=df['Pregnancies'])

In [None]:
# Kernel density estimate of Age with the area under the curve filled.
# fill= replaces the deprecated shade= flag; the column is passed explicitly as x=.
sns.kdeplot(x=df['Age'], fill=True)

In [None]:
# Age density drawn separately for each Outcome class, coloured with Set1.
# fill= replaces the deprecated shade= flag; the column is passed explicitly as x=
# so it is not interpreted as the positional `data` argument alongside hue.
sns.kdeplot(x=df['Age'], hue=df['Outcome'], fill=True, palette='Set1')

In [None]:
# Joint distribution of Age (x axis) and Pregnancies (y axis) with marginal plots.
# x and y must be keywords: recent seaborn releases reject them as positional arguments.
sns.jointplot(x=df['Age'], y=df['Pregnancies'])

In [None]:
# Kernel density estimate of Glucose with the area under the curve filled.
# fill= replaces the deprecated shade= flag; the column is passed explicitly as x=.
sns.kdeplot(x=df['Glucose'], fill=True)

In [None]:
# Kernel density estimate of BMI with the area under the curve filled.
# fill= replaces the deprecated shade= flag; the column is passed explicitly as x=.
sns.kdeplot(x=df['BMI'], fill=True)

In [None]:
# Age distribution for each pregnancy count, coloured by Outcome.
# split=True draws the hue levels as the two halves of a single violin.
sns.violinplot(y = df['Age'],x = df['Pregnancies'],hue=df['Outcome'],split=True)

In [None]:
# Joint distribution of Pregnancies (x axis) and Insulin (y axis) with marginal plots.
# x and y must be keywords: recent seaborn releases reject them as positional arguments.
sns.jointplot(x=df['Pregnancies'], y=df['Insulin'])

In [None]:
# Joint distribution of Age (x axis) and Insulin (y axis) with marginal plots.
# x and y must be keywords: recent seaborn releases reject them as positional arguments.
sns.jointplot(x=df['Age'], y=df['Insulin'])

# Starting Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score , f1_score,confusion_matrix,classification_report

In [None]:
# Keep only the rows whose index label is not among the detected outliers.
is_outlier_row = df.index.isin(outliers)
no_outliers_df = df.loc[~is_outlier_row]
no_outliers_df

In [None]:
# Feature matrix: every column except the target.
x = no_outliers_df.drop(columns='Outcome')
# Target vector: the Outcome column on its own.
y = no_outliers_df.loc[:, 'Outcome']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features.
# The scaler is fitted on the training split only, and those same training
# statistics are then applied to the test split. Refitting on the test data
# (fit_transform) would scale the two splits with different means/variances and
# leak test-set statistics into the evaluation.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Candidate classifiers compared on the same train/test split.
# The estimators that draw random numbers (tree, forest, MLP) get a fixed
# random_state so a Restart & Run All reproduces the same scores.
# hidden_layer_sizes is written as a one-element tuple: `(128)` is just the int 128.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(kernel='poly'),
    MLPClassifier(hidden_layer_sizes=(128,), max_iter=100, random_state=42),
]

In [None]:
# Train every candidate on the training split and report its test-set metrics.
for model in models:
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = model.fit(x_train, y_train).predict(x_test)

    # Build the whole report first, then emit it one line per entry.
    report_lines = (
        f"Model {model}",
        f"Accuracy score is {accuracy_score(y_test,y_pred)}",
        f"F1 score is {f1_score(y_test,y_pred)}",
        f"Confusin Matrix\n{confusion_matrix(y_test,y_pred)}",
        f"Classification report\n{classification_report(y_test,y_pred)}",
        "-------------------------------------------------------",
    )
    print(*report_lines, sep="\n")