In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
from matplotlib import rcParams
import missingno as msno
import seaborn as sns
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old 'seaborn-whitegrid' alias was later removed, so use whichever name this install provides.
# If neither exists the default style is kept instead of aborting the notebook.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Let's ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory recursively and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing the data and verifying it by previewing the first rows

csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

# **Data Descriptions:**

* Pregnancies = No. of times a pregnancy has occurred
* Glucose = Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure = Diastolic blood pressure (mm Hg)
* SkinThickness = Triceps skin fold thickness (mm)
* Insulin = 2-Hour serum insulin (mu U/ml)
* BMI = Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction = Diabetes pedigree function
* Age = Age (in years)
* Outcome = Class variable (0 or 1); 268 of 768 are 1, the others are 0

In [None]:
# General analysis of the data: summary statistics together with the frame's shape

summary_stats = df.describe()
summary_stats, df.shape

In [None]:
# Checking the data type pandas assigned to each column

df.dtypes

In [None]:
# Count the missing (NaN) entries in each column

df.isna().sum()

In [None]:
# The dataset has no nulls, but it does contain ZERO values that make little sense for
# Glucose, Blood Pressure, Skin Thickness, Insulin and BMI. Those zeros are replaced
# with NaN so that they are counted as missing values.

zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.isnull().sum()

In [None]:
# Replacing null values with the mean of their own column
# NOTE(review): the means are computed over all rows, i.e. before the train/test split
# performed further down, so the imputed values also reflect rows that end up in the test set.

for column in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    df[column] = df[column].fillna(value=df[column].mean())
df.isnull().sum()

In [None]:
# Class balance of the target feature (Outcome): bar chart plus the raw counts

fig = plt.figure(figsize=(20, 1))
sns.countplot(y='Outcome', data=df)
outcome_counts = df['Outcome'].value_counts()
outcome_counts

In [None]:
# Checking correlation between various parameters

# Figure size is set through rcParams, so it also applies to figures created afterwards
rcParams["figure.figsize"] = 20,10
# Title spelling corrected ("Corellation" -> "Correlation")
plt.title("Correlation between different features")
# Annotated heatmap of the pairwise correlations between all columns of df
sns.heatmap(df.corr(),annot=True,cmap="YlGnBu")

In [None]:
# Plot a histogram of each column to inspect the feature distributions
df.hist(figsize = (20,20))

In [None]:
# Scaling the data

from sklearn.preprocessing import StandardScaler

# Every column except the target is a predictor
features = df.drop(columns=["Outcome"])

sc_X = StandardScaler()
# Column names and index come from the frame itself rather than a hand-typed list, so the
# labels can never drift out of sync with the data and the rows stay aligned with the target.
# NOTE(review): the scaler is fitted on all rows before the train/test split below, so the
# scaling statistics include rows that later end up in the test set.
X = pd.DataFrame(sc_X.fit_transform(features),
                 columns=features.columns,
                 index=features.index)
X.head()

In [None]:
# Target feature: the Outcome column of df
target_column = 'Outcome'
y = df[target_column]
y.head()

In [None]:
## Test and train data

from sklearn.model_selection import train_test_split

# Hold out one third of the rows; stratifying on y keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    stratify=y,
    random_state=42,
)

In [None]:
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit ``algo`` on the training data and report two accuracy figures.

    Parameters
    ----------
    algo : estimator object providing ``fit`` and ``score``.
    X_train, y_train : training features and labels.
    cv : passed straight through to ``cross_val_predict`` as its ``cv`` argument.

    Returns
    -------
    (train_pred, acc, acc_cv)
        train_pred : predictions returned by ``cross_val_predict`` for the training rows.
        acc        : ``score`` of the fitted model on the very data it was fitted on,
                     as a percentage rounded to 2 decimals.
        acc_cv     : ``accuracy_score`` of ``train_pred`` against ``y_train``,
                     as a percentage rounded to 2 decimals.

    Note: ``model_selection`` and ``metrics`` are looked up as globals when the function
    is called, so the cell importing them from sklearn must have been run before the first call.
    """
    # Single pass: fit once and score on the same rows that were used for fitting
    fitted = algo.fit(X_train, y_train)
    train_accuracy = round(100 * fitted.score(X_train, y_train), 2)

    # Cross-validated predictions, computed with all available cores (n_jobs=-1)
    cv_predictions = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )
    cv_accuracy = round(100 * metrics.accuracy_score(y_train, cv_predictions), 2)

    return cv_predictions, train_accuracy, cv_accuracy

In [None]:
# Importing ML Libraries

from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Logistic Regression: training-set accuracy and 10-fold cross-validated accuracy

train_pred_log, acc_log, acc_cv_log = fit_ml_algo(
    algo=LogisticRegression(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)

In [None]:
# k-Nearest Neighbours: training-set accuracy and 10-fold cross-validated accuracy

train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(
    algo=KNeighborsClassifier(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)

In [None]:
# Linear SVC

# random_state is fixed so that repeated runs of the notebook report the same accuracies
train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(LinearSVC(random_state=42),
                                                                X_train, 
                                                                y_train, 
                                                                10)

print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)

In [None]:
# Stochastic Gradient Descent

# SGD shuffles the training data randomly; random_state is fixed so that repeated runs
# of the notebook report the same accuracies
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(random_state=42), 
                                                  X_train, 
                                                  y_train,
                                                  10)

print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)

In [None]:
# SVM (SVC with its default settings): training-set accuracy and 10-fold cross-validated accuracy
# The result names say "linear", but the estimator used here is SVC(), not LinearSVC().

train_pred_svm, acc_linear_svm, acc_cv_linear_svm = fit_ml_algo(
    algo=SVC(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_linear_svm)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svm)

In [None]:
# Decision Tree Classifier

# random_state is fixed so that repeated runs of the notebook grow the same tree
# and report the same accuracies
train_pred_decision, acc_linear_decision, acc_cv_linear_decision = fit_ml_algo(DecisionTreeClassifier(random_state=42),
                                                                X_train, 
                                                                y_train, 
                                                                10)

print("Accuracy: %s" % acc_linear_decision)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_decision)

In [None]:
## Checking the importance of features

from sklearn.ensemble import RandomForestClassifier 

model = RandomForestClassifier(n_estimators=100, random_state=0)

# Dedicated name for the raw predictors, so the scaled feature matrix `X` built earlier is
# not silently replaced; predictors are selected by dropping the target instead of by position.
# NOTE(review): the forest is fitted on every row of df, including the rows that were
# assigned to the test split, so the feature ranking is not independent of the test set.
rf_features = df.drop(columns=['Outcome'])
Y = df['Outcome']
model.fit(rf_features, Y)
pd.Series(model.feature_importances_, index=rf_features.columns).sort_values(ascending=False)

# **We can see that the most important features are Glucose, BMI, Age and Diabetes Pedigree Function, so we will repeat the steps above using only these features.**

In [None]:
# Keep only the four selected predictors plus the target; df is rebound to the narrower frame
selected_columns = ['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = df[selected_columns]
df.head()

In [None]:
# Scaling the data

from sklearn.preprocessing import StandardScaler

# Every column except the target is a predictor
features = df.drop(columns=["Outcome"])

sc_X = StandardScaler()
# Column names and index come from the frame itself rather than a hand-typed list, so the
# labels can never drift out of sync with the data and the rows stay aligned with the target.
# NOTE(review): the scaler is fitted on all rows before the train/test split below, so the
# scaling statistics include rows that later end up in the test set.
X = pd.DataFrame(sc_X.fit_transform(features),
                 columns=features.columns,
                 index=features.index)
X.head()

In [None]:
## Test and train data

from sklearn.model_selection import train_test_split

# Hold out one third of the rows; stratifying on y keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    stratify=y,
    random_state=42,
)

In [None]:
# Logistic Regression: training-set accuracy and 10-fold cross-validated accuracy

train_pred_log, acc_log, acc_cv_log = fit_ml_algo(
    algo=LogisticRegression(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)

In [None]:
# k-Nearest Neighbours: training-set accuracy and 10-fold cross-validated accuracy

train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(
    algo=KNeighborsClassifier(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)

In [None]:
# Linear SVC

# random_state is fixed so that repeated runs of the notebook report the same accuracies
train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(LinearSVC(random_state=42),
                                                                X_train, 
                                                                y_train, 
                                                                10)

print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)

In [None]:
# Stochastic Gradient Descent

# SGD shuffles the training data randomly; random_state is fixed so that repeated runs
# of the notebook report the same accuracies
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(random_state=42), 
                                                  X_train, 
                                                  y_train,
                                                  10)

print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)

In [None]:
# SVM (SVC with its default settings): training-set accuracy and 10-fold cross-validated accuracy
# The result names say "linear", but the estimator used here is SVC(), not LinearSVC().

train_pred_svm, acc_linear_svm, acc_cv_linear_svm = fit_ml_algo(
    algo=SVC(), X_train=X_train, y_train=y_train, cv=10
)

print("Accuracy: %s" % acc_linear_svm)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svm)

In [None]:
# Decision Tree Classifier

# random_state is fixed so that repeated runs of the notebook grow the same tree
# and report the same accuracies
train_pred_decision, acc_linear_decision, acc_cv_linear_decision = fit_ml_algo(DecisionTreeClassifier(random_state=42),
                                                                X_train, 
                                                                y_train, 
                                                                10)

print("Accuracy: %s" % acc_linear_decision)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_decision)

# **We now see that the models that consistently achieve the greatest accuracy are SVM (linear and radial) and Logistic Regression, so we will restrict the comparison to these.**

In [None]:
# Candidate estimators and the hyper-parameter grid to search for each of them.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'logistic_regression' : {
        # multi_class='auto' was dropped: it is the default value, and the parameter is
        # deprecated in recent scikit-learn releases, so passing it would eventually fail.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Run a 5-fold grid search per candidate on the training split and record its best result.
# NOTE(review): only cross-validation scores on X_train are collected here; the held-out
# X_test / y_test split is not scored anywhere in the visible notebook.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X_train, y_train)
    scores.append(dict(model=model_name,
                       best_score=clf.best_score_,
                       best_params=clf.best_params_))

result_columns = ['model', 'best_score', 'best_params']
model_df = pd.DataFrame(scores, columns=result_columns)
model_df

# **Based on above, we can conclude that the best model is Logistic Regression with C as 1**