In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Description of the data
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.


# Models used in this notebook

* K-NN model
* Naive Bayes model
* SVM Model
* Decision Tree Model
* Logistic Regression
* RidgeClassifier Model

### Ensemble Models
* Random Forest Classifier
* XGBoost Classifier

This notebook is for beginners; it shows a simple implementation of these models.

# Import Libraries

In [None]:
# For data split
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# To evaluate the final model
from sklearn.metrics import confusion_matrix,classification_report

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the data

In [None]:
# Load the diabetes dataset from the Kaggle input directory into a dataframe
diabetes_data = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv',
)

# Preview the first 5 rows of the dataframe
diabetes_data.head(n=5)

In [None]:
# Dimensions of the dataframe, displayed as a (rows, columns) tuple
diabetes_data.shape

# Missing Values and data type check

In [None]:
# Count the missing (NaN) entries in each column
diabetes_data.isnull().sum(axis=0)

In [None]:
# Check missing values and data types in one step: info() lists the
# non-null count and the dtype of every column

diabetes_data.info()

Great !

*No missing values, and all data types are numerical, with no categorical variables.*

# Data visualization

In [None]:
# Summary statistics, transposed so that each row describes one column of the data
diabetes_data.describe().transpose()

In [None]:
# Scatter every feature (y-axis) against the target (x-axis), two plots per row.
feature_name = list(diabetes_data.columns)
target_name = feature_name[-1]   # the last column is treated as the target
predictors = feature_name[:-1]   # every other column is a feature

n_cols = 2
# The subplot grid needs integer dimensions: the previous `/` division produced
# a float row count, which recent matplotlib versions reject. Ceiling division
# also reserves a row for the leftover plot when the feature count is odd.
n_rows = (len(predictors) + n_cols - 1) // n_cols

plt.figure(figsize=(16,16))
for i,name in enumerate(predictors):
    ax = plt.subplot(n_rows, n_cols, i + 1)
    sns.scatterplot(y=diabetes_data[name], x=diabetes_data[target_name], ax=ax)
    ax.set_ylabel(name)
    ax.set_xlabel(target_name)


# Select the features and Target

Split the data into train and test datasets

In [None]:
# Feature matrix: every column except the 'Outcome' label
X = diabetes_data.drop(columns=['Outcome'])

# Target vector: the 'Outcome' label column
y = diabetes_data['Outcome']

In [None]:
# Split the data into train (70%) and test (30%) sets.
# random_state fixes the shuffle so the same rows land in each set on every
# run; stratify=y keeps the class ratio of the target the same in both sets.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

# Sanity-check the sizes of the resulting splits
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

### K-NN Model

In [None]:
# Build a k-nearest-neighbours classifier that uses 2 neighbours and fit it on the training data
KNeighborsClassifierModel = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
knn_train_score = KNeighborsClassifierModel.score(X_train, y_train)
knn_test_score = KNeighborsClassifierModel.score(X_test, y_test)
print ('The train score: ',knn_train_score)
print ('The test score: ',knn_test_score)

### Naive Bayes (GaussianNB) Model

In [None]:
# Build a Gaussian Naive Bayes classifier with default settings and fit it on the training data
GaussianNBModel = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
nb_train_score = GaussianNBModel.score(X_train, y_train)
nb_test_score = GaussianNBModel.score(X_test, y_test)
print ('The train score: ',nb_train_score)
print ('The test score: ',nb_test_score)

### SVC Model

In [None]:
# Build a support vector classifier with the library defaults and fit it on the training data
SVCModel = SVC().fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
svc_train_score = SVCModel.score(X_train, y_train)
svc_test_score = SVCModel.score(X_test, y_test)
print ('The train score: ',svc_train_score)
print ('The test score: ',svc_test_score)

### DecisionTreeClassifier Model

In [None]:
# Decision tree model

# random_state seeds the tree's internal randomness (e.g. the order in which
# features are tried when splits tie) so repeated runs build the same tree
# and report the same scores.
DecisionTreeClassifierModel = DecisionTreeClassifier(random_state=42)

# train the model
DecisionTreeClassifierModel.fit(X_train,y_train)

# Mean accuracy on the training data and on the held-out test data
print ('The train score: ',DecisionTreeClassifierModel.score(X_train,y_train))
print ('The test score: ',DecisionTreeClassifierModel.score(X_test,y_test))

The test score is lower than the train score, which means the model ***overfits***.

### Logistic regression Model

In [None]:
# Logistic regression classifier; max_iter=1000 caps the number of solver iterations
LogisticRegressionModel = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
logreg_train_score = LogisticRegressionModel.score(X_train, y_train)
logreg_test_score = LogisticRegressionModel.score(X_test, y_test)
print ('the train score: ',logreg_train_score)
print ('the test score: ',logreg_test_score)

In [None]:
# Build a ridge classifier with default settings and fit it on the training data
RidgeClassifierModel = RidgeClassifier().fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
ridge_train_score = RidgeClassifierModel.score(X_train, y_train)
ridge_test_score = RidgeClassifierModel.score(X_test, y_test)
print ('the train score: ',ridge_train_score)
print ('the test score: ',ridge_test_score)

# Ensemble Models
### Random Forest Classifier Model

In [None]:
# Random forest classifier. random_state seeds the bootstrap sampling and the
# per-split feature selection so repeated runs build the same forest and
# report the same scores.
RandomForestClassifierModel = RandomForestClassifier(random_state=42)
RandomForestClassifierModel.fit(X_train,y_train)

# Mean accuracy on the training data and on the held-out test data
print ('Train Score: ',RandomForestClassifierModel.score(X_train,y_train))
print ('Test Score: ',RandomForestClassifierModel.score(X_test,y_test))

### XGBoost Classifier Model

In [None]:
# Build an XGBoost classifier with default settings and fit it on the training data
XGBClassifierModel = XGBClassifier()
XGBClassifierModel.fit(X_train, y_train)

# Mean accuracy on the training data and on the held-out test data
xgb_train_score = XGBClassifierModel.score(X_train, y_train)
xgb_test_score = XGBClassifierModel.score(X_test, y_test)
print ('Train Score: ',xgb_train_score)
print ('Test Score: ',xgb_test_score)

### Final Model

In [None]:
# Final model: a random forest classifier. random_state seeds the bootstrap
# sampling and per-split feature selection, so the predictions and the
# evaluation computed from this model are the same on every run.
Final_Model = RandomForestClassifier(random_state=42)
Final_Model.fit(X_train,y_train)

# Mean accuracy on the training data and on the held-out test data
print ('Train Score: ',Final_Model.score(X_train,y_train))
print ('Test Score: ',Final_Model.score(X_test,y_test))

### Predictions

In [None]:
# Predict the class of every test sample with the final model
predicitons = Final_Model.predict(X_test)

# Show the last 10 true labels next to the last 10 predicted labels
true_tail = y_test.values[-10:]
predicted_tail = predicitons[-10:]
print ('The Last 10 True Outcome: ',true_tail)
print ('The Last 10 Predicted Outcome: ',predicted_tail)

### Model Evaluation

In [None]:
# Text report of the per-class classification metrics on the test set
report = classification_report(y_true=y_test, y_pred=predicitons)
print (report)

In [None]:
# Confusion matrix of the test-set predictions: rows are true classes, columns are predicted classes
CM = confusion_matrix(y_true=y_test, y_pred=predicitons)
CM

### Comments

* The accuracy is ***not acceptable***; the goal of this notebook was only to show how to implement and evaluate these models.

* We can split the data into train, validation, and test sets and apply hyperparameter tuning to improve the accuracy.