In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Let's import all the necessary Libraries required for Data Wrangling, Analysis and Classification**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline

%matplotlib inline
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix
import itertools
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_confusion_matrix

## **IMPORTING DATASET - PIMA INDIANS DIABETES DATASET**

In [None]:
# Read the diabetes CSV into the raw frame `df` and preview its first rows.
DIABETES_CSV = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(DIABETES_CSV)
df.head()

## **DATA WRANGLING**

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
df.dtypes

In [None]:
# Summary statistics for each numeric column of the raw frame.
df.describe()

Finding Null/NaN values...

In [None]:
# Count the null/NaN entries in each column of the raw frame.
print(df.isnull().sum())

**Here we can see that there are no null values, but many columns in the data contain zero values, which indicate missing values.**

In [None]:
# Correlation of every column with Outcome on the raw data, sorted ascending.
outcome_corr_raw = df.corr()['Outcome']
outcome_corr_raw.sort_values()

### **The correlation tells us whether each attribute contributes positively or negatively towards diabetes. It uses the Pearson correlation coefficient.**

In [None]:
# Annotated heatmap of the pairwise correlations in the raw frame.
raw_corr = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(raw_corr, annot=True, cmap="Dark2")
plt.show()

### Histogram Plot

**A histogram gives us the frequency of occurrence of each value in the dataset. The distribution
is not a bell curve for every attribute; therefore, the data are not normally
distributed.**

**The histograms appear skewed, and they show that many values of 'Insulin', 'BMI', 'Glucose', 'BloodPressure' and 'SkinThickness' are zero.**

In [None]:
# Draw one histogram per column of the raw frame on a 20x20-inch figure;
# `his` keeps the value returned by DataFrame.hist.
his = df.hist(figsize = (20,20))

## IDENTIFYING AND REPLACING THE NULL VALUES

**Replacing the zero values to NaN and evaluating the null values in the attributes obtained**

In [None]:
# Columns in which a recorded 0 is treated as a missing measurement.
ZERO_AS_MISSING_COLS = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)
# np.nan (lower-case) is the spelling supported by every NumPy version;
# the np.NaN alias was removed in NumPy 2.0.
df1[ZERO_AS_MISSING_COLS] = df1[ZERO_AS_MISSING_COLS].replace(0, np.nan)

## showing the count of Nans
print(df1.isnull().sum())

## IMPUTATION

**Imputing the median values in place of the null values to obtain a proper value distribution. Imputation can use either the mean or the median, but since the variables are skewed, the mean is biased by the values at the far end of the distribution. Therefore, the median is a better representation of the majority of the values in the variable.**

In [None]:
# Replace the NaNs in each listed column with that column's own median.
# Assigning the result back (instead of calling fillna(..., inplace=True) on a
# column selection) is required for the change to reach df1 under pandas
# Copy-on-Write, and avoids the chained-assignment warning in pandas 2.x.
imputed_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# DataFrame.fillna with a Series fills each column with the value at its label.
df1[imputed_cols] = df1[imputed_cols].fillna(df1[imputed_cols].median())

**Again plotting the Histogram and Correlation plot after the imputation of data to see the variation**

In [None]:
# Draw one histogram per column of the imputed frame df1 on a 20x20-inch figure;
# `his1` keeps the value returned by DataFrame.hist.
his1 = df1.hist(figsize = (20,20))

In [None]:
# Annotated heatmap of the pairwise correlations in the imputed frame df1.
imputed_corr = df1.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(imputed_corr, annot=True, cmap="Dark2")
plt.show()

In [None]:
# Correlation of every column with Outcome on the imputed data, sorted ascending.
outcome_corr_imputed = df1.corr()['Outcome']
outcome_corr_imputed.sort_values()

## DATA ANALYSIS

### DATA ANALYSIS AND VISUALIZATION

A pairplot helps us identify the relations between the attributes, i.e. how one of them influences the other.

In [None]:
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in newer matplotlib
# releases and the old names were later removed, so use whichever name this
# installation actually provides.
dark_style = 'seaborn-dark' if 'seaborn-dark' in plt.style.available else 'seaborn-v0_8-dark'
plt.style.use(dark_style)
# Scatter plot for every pair of attributes, coloured by Outcome, with
# histograms on the diagonal.
sns.pairplot(df1,hue='Outcome', palette='husl', diag_kind="hist");
plt.tight_layout()

### DATA VISUALISATION AND INTERPRETATION

**First we find the number of diabetic and non-diabetic patients. Followed by range in which diabetes occur e.g. what are possible age group or glucose level are more susceptible to diabetes**


In [None]:
# Number of rows per Outcome value, shown as a one-column DataFrame.
df1['Outcome'].value_counts().to_frame()

In [None]:

# Overlay the Glucose distributions of the two Outcome groups.
healthy_glucose = df1.loc[df1['Outcome'] == 0, "Glucose"]
diabetic_glucose = df1.loc[df1['Outcome'] == 1, "Glucose"]
sns.distplot(healthy_glucose, color='purple')   # Healthy - purple
sns.distplot(diabetic_glucose, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the BMI distributions of the two Outcome groups.
healthy_bmi = df1.loc[df1['Outcome'] == 0, "BMI"]
diabetic_bmi = df1.loc[df1['Outcome'] == 1, "BMI"]
sns.distplot(healthy_bmi, color='purple')   # Healthy - purple
sns.distplot(diabetic_bmi, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the Insulin distributions of the two Outcome groups.
healthy_insulin = df1.loc[df1['Outcome'] == 0, "Insulin"]
diabetic_insulin = df1.loc[df1['Outcome'] == 1, "Insulin"]
sns.distplot(healthy_insulin, color='purple')   # Healthy - purple
sns.distplot(diabetic_insulin, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the Age distributions of the two Outcome groups.
healthy_age = df1.loc[df1['Outcome'] == 0, "Age"]
diabetic_age = df1.loc[df1['Outcome'] == 1, "Age"]
sns.distplot(healthy_age, color='purple')   # Healthy - purple
sns.distplot(diabetic_age, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the Pregnancies distributions of the two Outcome groups.
healthy_pregnancies = df1.loc[df1['Outcome'] == 0, "Pregnancies"]
diabetic_pregnancies = df1.loc[df1['Outcome'] == 1, "Pregnancies"]
sns.distplot(healthy_pregnancies, color='purple')   # Healthy - purple
sns.distplot(diabetic_pregnancies, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the BloodPressure distributions of the two Outcome groups.
healthy_bp = df1.loc[df1['Outcome'] == 0, "BloodPressure"]
diabetic_bp = df1.loc[df1['Outcome'] == 1, "BloodPressure"]
sns.distplot(healthy_bp, color='purple')   # Healthy - purple
sns.distplot(diabetic_bp, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the SkinThickness distributions of the two Outcome groups.
healthy_skin = df1.loc[df1['Outcome'] == 0, "SkinThickness"]
diabetic_skin = df1.loc[df1['Outcome'] == 1, "SkinThickness"]
sns.distplot(healthy_skin, color='purple')   # Healthy - purple
sns.distplot(diabetic_skin, color='yellow')  # Diabetic - yellow

In [None]:
# Overlay the DiabetesPedigreeFunction distributions of the two Outcome groups.
healthy_dpf = df1.loc[df1['Outcome'] == 0, "DiabetesPedigreeFunction"]
diabetic_dpf = df1.loc[df1['Outcome'] == 1, "DiabetesPedigreeFunction"]
sns.distplot(healthy_dpf, color='purple')   # Healthy - purple
sns.distplot(diabetic_dpf, color='yellow')  # Diabetic - yellow

### Interpretation of the Data Analysis

**Glucose**: In the sampled data, the diabetic distribution starts at a glucose level of about 75 mg/dl and peaks at ~125 mg/dl. Diabetic patients typically show higher levels, around 145 mg/dl, than healthy individuals. The Glucose attribute is the plasma glucose concentration at 2 hours in an oral glucose tolerance test, for which values below 140 mg/dl are considered normal. A glucose level above 145 mg/dl in the sampled population therefore indicates the presence of diabetes.

**BMI**: The normal BMI range for a healthy adult is 18.5 to 24.9. Keeping that in mind, we can see that the diabetic population has a BMI range of 22 to 50, with the highest number of people at ~32 BMI (the 30-40 BMI range shows more diabetes). Hence we can conclude that obesity is a major factor.

**Insulin**: The insulin value given in the dataset is the 2-hour serum insulin (mu U/ml). Although the healthy and diabetic distributions overlap and both drop sharply after 200 mu U/ml, diabetic patients show a sharp peak in the ~160-180 mu U/ml range.

**Age**: People aged 40 to 45 have a higher chance of diabetes, even though the curve starts at the age of 20 and slowly drops after 46-47 years of age. This suggests that a sedentary lifestyle as well as living conditions are a bigger cause of diabetes than the age attribute itself.

**Pregnancies**: Although women with no children had an equally high chance of diabetes, many of them also remained healthy. So we can conclude that pregnancy may not be a prime factor to consider.

**Blood Pressure**: The peak for diabetic patients occurs at ~72 to 76 mm Hg and the diabetic range falls between 62 and 95 mm Hg, while healthy people range up to 80 mm Hg.

**Skin Thickness and Diabetes Pedigree Function (DPF)**: We can see that the healthy and diabetic samples have a similar skin thickness (triceps skin fold thickness, in mm) range, the highest being 32 mm. Similarly, the overall mean curve of the DPF is higher for the healthy samples than for the diabetic ones. Thus these two factors do not play a major role in predicting diabetes.

## MACHINE LEARNING MODELS USING HYPER PARAMETER TUNING

**Hyperparameter tuning is done to increase accuracy with the limited data. The following classifiers are tuned based on the estimators obtained.**

1. K- NEAREST NEIGHBOR
2. LOGISTIC REGRESSION
3. RANDOM FOREST


We are now trying to predict and find accuracy for all Data Attributes 


## ACCURACY AND OTHER METRICS TABLE 

**Accuracy:**  Accuracy is the most intuitive performance measure and it is simply a ratio of correctly predicted observation to the total observations. Accuracy is a great measure but only for symmetric datasets where values of false positive and false negatives are almost same. Therefore, other parameters are estimated to evaluate the performance of your model.

**Precision:** Precision is the ratio of correctly predicted positive observations to the total predicted positive observations

**Recall:** Recall is the ratio of correctly predicted positive observations to all observations in the actual positive class.

**F1 Score:** F1 Score is the harmonic mean of Precision and Recall. Therefore, this score takes both false positives and false negatives into account. Intuitively it is not as easy to understand as accuracy, but F1 is usually more useful than accuracy, especially if the dataset has an uneven class distribution. Hence the F1 score is calculated here.

**Jaccard Index:** The Jaccard similarity index compares members for two sets to see which members are shared and which are distinct. It’s a measure of similarity for the two sets of data, with a range from 0% to 100%. 

**Cohen’s Kappa:** Cohen’s kappa statistic measures interrater reliability (sometimes called interobserver agreement). Interrater reliability, or precision, happens when your data raters (or collectors) give the same score to the same data item.
0 = agreement equivalent to chance.
0.1 – 0.20 = slight agreement.
0.21 – 0.40 = fair agreement.
0.41 – 0.60 = moderate agreement.
0.61 – 0.80 = substantial agreement.
0.81 – 0.99 = near perfect agreement
1 = perfect agreement.

**ROC - AUC:** AUC - ROC curve is a performance measurement for classification problem at various thresholds settings. ROC is a probability curve and AUC represents degree or measure of separability. It tells how much model is capable of distinguishing between classes. Higher the AUC, better the model is at predicting 0s as 0s and 1s as 1s. 

**Confusion Matrix:** A confusion matrix is a summary of prediction results on a classification problem. The number of correct and incorrect predictions are summarized with count values and broken down by each class.

**LogLoss:** Logarithmic loss measures the performance of a classification model where the prediction input is a probability value between 0 and 1. A perfect model would have a log loss of 0. 


In [None]:
# Features are every column of df1 except the last; the target is the last column.
feature_frame, target_series = df1.iloc[:, :-1], df1.iloc[:, -1]
x, y = feature_frame.values, target_series.values

In [None]:
# Display the feature array taken from all columns of df1 except the last.
x

In [None]:
# Display the target array taken from the last column of df1.
y

In [None]:
# Hold out 30% of the rows as the test set (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=4
)
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both partitions.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)


In [None]:
########################################### KNN MODEL ############################################################
# Fit one KNN classifier per K in 1..Ks-1 and record, for each K, the test
# accuracy and the standard error of the per-sample correctness.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
for slot, k in enumerate(range(1, Ks)):
    yhat = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).predict(x_test)
    hits = yhat == y_test
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(hits) / np.sqrt(hits.shape[0])

mean_acc

In [None]:
# Accuracy versus K, with a shaded band of one standard error either side.
k_values = range(1, Ks)
plt.plot(k_values, mean_acc, 'm')
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
# The band drawn above is +/- 1 x std, so the legend must say so
# (it previously claimed 3 x std).
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

In [None]:
# Final KNN classifier with K=6 (the value picked from the accuracy sweep),
# followed by its accuracy on the test set.
knn_model = KNeighborsClassifier(n_neighbors=6)
knn_model.fit(x_train, y_train)
yhat = knn_model.predict(x_test)
mean = metrics.accuracy_score(y_test, yhat)
mean

In [None]:
# Plot the Receiver Operating Characteristic curve for the KNN classifier.
# A ROC curve needs a continuous score per sample: hard 0/1 predictions give a
# single operating point, so use the predicted probability of the positive
# class (second column of predict_proba) instead.
knn_scores = knn_model.predict_proba(x_test)[:, 1]
# True and false positive rates at every score threshold
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, knn_scores)
print('roc_auc_score: ', roc_auc_score(y_test, knn_scores))
# Plot ROC curve
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic- KNN Classifier')
plt.plot(false_positive_rate, true_positive_rate)
# Dashed diagonal plus grey guide lines framing the unit square
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:
# Draw the KNN confusion matrix twice: first with raw counts (normalize=None),
# then with normalize='true'. Each matrix is also printed.
titles_options = [("Confusion matrix-KNN Classifier", None),
                  ("Normalized confusion matrix", 'true')]
for plot_title, norm_mode in titles_options:
    disp = plot_confusion_matrix(
        knn_model, x_test, y_test, cmap=plt.cm.Blues, normalize=norm_mode
    )
    disp.ax_.set_title(plot_title)

    print(plot_title, disp.confusion_matrix, sep='\n')

plt.show()

In [None]:
######################################################## LOGISTIC REGRESSION ##################################################

# For every solver, try each C value and remember the first C that reaches the
# highest test accuracy, together with that accuracy and its standard error.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
regularisations = [1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
solver_mean_acc, solver_std_acc, solver_best_reg = {}, {}, {}
for solver in solvers:
    best_mean, best_std, best_reg = 0, 0, 0
    for reg in regularisations:
        lr = LogisticRegression(C=reg, solver=solver)
        lr.fit(x_train, y_train)
        yhat = lr.predict(x_test)
        correct = yhat == y_test
        mean = metrics.accuracy_score(y_test, yhat)
        std = np.std(correct) / np.sqrt(correct.shape[0])
        # Strict comparison: a tie keeps the earlier C value.
        if mean > best_mean:
            best_mean, best_std, best_reg = mean, std, reg
    solver_mean_acc[solver] = best_mean
    solver_std_acc[solver] = best_std
    solver_best_reg[solver] = best_reg

solver_mean_acc

In [None]:
# Best C value recorded for each solver by the sweep above.
solver_best_reg

In [None]:
# Final logistic-regression model (liblinear solver, C=1) and its test accuracy.
lr_model = LogisticRegression(C=1, solver='liblinear', max_iter=200)
lr_model.fit(x_train, y_train)
yhat = lr_model.predict(x_test)
mean = metrics.accuracy_score(y_test, yhat)
mean

In [None]:
# Plot the Receiver Operating Characteristic curve for logistic regression.
# A ROC curve needs a continuous score per sample: hard 0/1 predictions give a
# single operating point, so use the predicted probability of the positive
# class (second column of predict_proba) instead.
lr_scores = lr_model.predict_proba(x_test)[:, 1]
# True and false positive rates at every score threshold
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, lr_scores)
print('roc_auc_score: ', roc_auc_score(y_test, lr_scores))
# Plot ROC curve
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic- Logistic Regression')
plt.plot(false_positive_rate, true_positive_rate)
# Dashed diagonal plus grey guide lines framing the unit square
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Draw the logistic-regression confusion matrix twice: first with raw counts
# (normalize=None), then with normalize='true'. Each matrix is also printed.
titles_options = [("Confusion matrix- Logistic Regression", None),
                  ("Normalized confusion matrix", 'true')]
for plot_title, norm_mode in titles_options:
    disp = plot_confusion_matrix(
        lr_model, x_test, y_test, cmap=plt.cm.Blues, normalize=norm_mode
    )
    disp.ax_.set_title(plot_title)

    print(plot_title, disp.confusion_matrix, sep='\n')

plt.show()

In [None]:
############################################### RANDOM FOREST ##################################################################
# Candidate values for each random-forest hyperparameter; the randomized search
# samples combinations from this grid.

# Number of trees: 10 evenly spaced values from 10 to 100
n_estimators = list(map(int, np.linspace(start=10, stop=100, num=10)))
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree: 11 evenly spaced values from 10 to 110, plus None
max_depth = [*map(int, np.linspace(10, 110, num=11)), None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
from sklearn.ensemble import RandomForestClassifier
# Base model to tune. The forest gets its own seed: seeding only the search
# fixes which parameter combinations are sampled, but an unseeded forest still
# trains differently on every run, so the selected parameters would not be
# reproducible.
rf = RandomForestClassifier(random_state=42)
# Random search over 100 parameter combinations with 3-fold cross validation,
# using all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train,y_train)

In [None]:
# Hyperparameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
def evaluate(model, x_test, y_test):
    """Print and return classification metrics for ``model`` on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and optionally ``predict_proba``)
    x_test : test features passed to ``model.predict``
    y_test : true labels for ``x_test``

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, jaccard, kappa, auc, matrix)``
    """
    # Round once so a model that emits fractional predictions is still scored
    # on class labels.
    yhat = model.predict(x_test).round()
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat)
    print('Recall: %f' % recall)
    # Support-weighted F1. `average` must be given by keyword: the third
    # positional parameter of f1_score is `labels`, so a positional 'weighted'
    # never selected weighted averaging (and is rejected by newer scikit-learn).
    f1 = f1_score(y_test, yhat, average='weighted')
    print('F1 score: %f' % f1)
    # Support-weighted Jaccard index (same keyword requirement as above)
    jaccard = jaccard_score(y_test, yhat, average='weighted')
    print('Jaccard: %f' % jaccard)
    # kappa
    kappa = cohen_kappa_score(y_test, yhat)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC: rank by the positive-class probability when the model provides
    # one; fall back to the hard labels otherwise.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(x_test)[:, 1]
    else:
        scores = yhat
    auc = roc_auc_score(y_test, scores)
    print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(y_test, yhat)
    print(matrix)

    return accuracy,precision,recall,f1,jaccard,kappa,auc,matrix

In [None]:
# Plot the Receiver Operating Characteristic curve for the tuned random forest.
# Score the test set with the random-forest search itself: the `yhat` left in
# the namespace at this point comes from an earlier model, so reusing it would
# plot the wrong classifier. The positive-class probability (second column of
# predict_proba) is used because a ROC curve needs a continuous score.
rf_scores = rf_random.predict_proba(x_test)[:, 1]
# True and false positive rates at every score threshold
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, rf_scores)
print('roc_auc_score: ', roc_auc_score(y_test, rf_scores))
# Plot ROC curve
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic-Random Forest')
plt.plot(false_positive_rate, true_positive_rate)
# Dashed diagonal plus grey guide lines framing the unit square
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:
# Draw the random-forest confusion matrix twice: first with raw counts
# (normalize=None), then with normalize='true'. Each matrix is also printed.
titles_options = [("Confusion matrix- Random Forest", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    # Forward `normalize`; without it both figures show raw counts even though
    # the second one is titled "Normalized confusion matrix".
    disp = plot_confusion_matrix(rf_random, x_test, y_test,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
# -------------------------------------------TABLE CREATION--------------------------------------------------------
# One row per model, one column per metric.
df2 = pd.DataFrame(index=['KNN','Logistic Regression','Random Forest', ], 
                  columns=['Accuracy','Precision','Recall','Kappa','Jaccard', 'F1-score', 'ROC','Confusion Matrix', 'LogLoss'])


def _metrics_row(model, features, labels, with_log_loss=False):
    """Print the test metrics of ``model`` and return them as one table row.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``
    features : test features
    labels : true labels for ``features``
    with_log_loss : when True the last entry is the log loss, otherwise NaN

    Returns
    -------
    list
        ``[accuracy, precision, recall, kappa, jaccard, f1, auc, matrix, log_loss]``,
        in the column order of the metrics table.
    """
    predicted = model.predict(features)
    probabilities = model.predict_proba(features)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(labels, predicted)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(labels, predicted)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(labels, predicted)
    print('Recall: %f' % recall)
    # Support-weighted F1 and Jaccard. `average` must be given by keyword: the
    # third positional parameter of these functions is `labels`, so a
    # positional 'weighted' never selected weighted averaging (and is rejected
    # by newer scikit-learn).
    f1 = f1_score(labels, predicted, average='weighted')
    print('F1 score: %f' % f1)
    jaccard = jaccard_score(labels, predicted, average='weighted')
    print('Jaccard: %f' % jaccard)
    # kappa
    kappa = cohen_kappa_score(labels, predicted)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC from the positive-class probability rather than the hard labels,
    # so that it measures ranking quality across all thresholds.
    auc = roc_auc_score(labels, probabilities[:, 1])
    print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(labels, predicted)
    print(matrix)

    ll = log_loss(labels, probabilities) if with_log_loss else np.nan
    return [accuracy, precision, recall, kappa, jaccard, f1, auc, matrix, ll]


# (row label, fitted model, report log loss?) -- as before, log loss is only
# reported for logistic regression.
for model_name, fitted_model, report_log_loss in (
        ('KNN', knn_model, False),
        ('Logistic Regression', lr_model, True),
        ('Random Forest', rf_random, False)):
    df2.loc[model_name] = _metrics_row(fitted_model, x_test, y_test, report_log_loss)




In [None]:
# Display the metrics table built in the previous cell.
df2

## ML MODELS FOR SELECTED ATTRIBUTES

Here we have excluded the attributes Age, Pregnancy, Diabetes Pedigree Function and Skin Thickness as these do not contribute much to the detecting Diabetes from our analysis.

Let's see how the accuracy varies from the previous models.

In [None]:
# Attributes excluded from the reduced models. They are dropped by name rather
# than by position so the selection does not silently change if the column
# order of the frame does.
# NOTE(review): these names are assumed to be the columns at positions 0, 3, 6
# and 7 of df1 (the positions dropped originally) -- confirm against df1.columns.
EXCLUDED_ATTRIBUTES = ['Pregnancies', 'SkinThickness', 'DiabetesPedigreeFunction', 'Age']
# DataFrame.drop returns a new frame, so df1 itself is left unchanged.
data = df1.drop(columns=EXCLUDED_ATTRIBUTES)
data.head()

In [None]:
# Features are every column of `data` except the last; the target is the last column.
selected_features, selected_target = data.iloc[:, :-1], data.iloc[:, -1]
x1, y1 = selected_features.values, selected_target.values

In [None]:
# Display the feature array taken from all columns of `data` except the last.
x1

In [None]:
# Display the target array taken from the last column of `data`.
y1

In [None]:
# Hold out 30% of the rows as the test set (fixed seed). This rebinds
# x_train/x_test/y_train/y_test to the selected-attribute data.
x_train, x_test, y_train, y_test = train_test_split(
    x1, y1, test_size=0.3, random_state=4
)
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both partitions.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)


In [None]:
########################################### KNN MODEL ############################################################
# Fit one KNN classifier per K in 1..Ks-1 and record, for each K, the test
# accuracy and the standard error of the per-sample correctness.
Ks = 10
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
for slot, k in enumerate(range(1, Ks)):
    yhat = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).predict(x_test)
    hits = yhat == y_test
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(hits) / np.sqrt(hits.shape[0])

mean_acc

In [None]:
# Accuracy versus K, with a shaded band of one standard error either side.
k_values = range(1, Ks)
plt.plot(k_values, mean_acc, 'm')
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
# The band drawn above is +/- 1 x std, so the legend must say so
# (it previously claimed 3 x std).
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

In [None]:
# Final KNN classifier for the selected attributes, built with K=8
# (NOTE(review): confirm 8 is the best K in the sweep above),
# followed by its accuracy on the test set.
knn_model = KNeighborsClassifier(n_neighbors=8)
knn_model.fit(x_train, y_train)
yhat = knn_model.predict(x_test)
mean = metrics.accuracy_score(y_test, yhat)
mean

In [None]:
######################################################## LOGISTIC REGRESSION ##################################################

# For every solver, try each C value and remember the first C that reaches the
# highest test accuracy, together with that accuracy and its standard error.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
regularisations = [1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
solver_mean_acc, solver_std_acc, solver_best_reg = {}, {}, {}
for solver in solvers:
    best_mean, best_std, best_reg = 0, 0, 0
    for reg in regularisations:
        lr = LogisticRegression(C=reg, solver=solver)
        lr.fit(x_train, y_train)
        yhat = lr.predict(x_test)
        correct = yhat == y_test
        mean = metrics.accuracy_score(y_test, yhat)
        std = np.std(correct) / np.sqrt(correct.shape[0])
        # Strict comparison: a tie keeps the earlier C value.
        if mean > best_mean:
            best_mean, best_std, best_reg = mean, std, reg
    solver_mean_acc[solver] = best_mean
    solver_std_acc[solver] = best_std
    solver_best_reg[solver] = best_reg

solver_mean_acc

In [None]:
# Best C value recorded for each solver by the sweep above.
solver_best_reg

In [None]:
# Final logistic-regression model for the selected attributes
# (saga solver, C=0.3) and its test accuracy.
lr_model = LogisticRegression(C=0.3, solver='saga', max_iter=200)
lr_model.fit(x_train, y_train)
yhat = lr_model.predict(x_test)
mean = metrics.accuracy_score(y_test, yhat)
mean

In [None]:
############################################### RANDOM FOREST ##################################################################
# Candidate values for each random-forest hyperparameter; the randomized search
# samples combinations from this grid.

# Number of trees: 10 evenly spaced values from 10 to 100
n_estimators = list(map(int, np.linspace(start=10, stop=100, num=10)))
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in a tree: 11 evenly spaced values from 10 to 110, plus None
max_depth = [*map(int, np.linspace(10, 110, num=11)), None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
from sklearn.ensemble import RandomForestClassifier
# Base model to tune. The forest gets its own seed: seeding only the search
# fixes which parameter combinations are sampled, but an unseeded forest still
# trains differently on every run, so the selected parameters would not be
# reproducible.
rf = RandomForestClassifier(random_state=42)
# Random search over 100 parameter combinations with 3-fold cross validation,
# using all available cores (n_jobs=-1).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train,y_train)

In [None]:
# Hyperparameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
def evaluate(model, x_test, y_test):
    """Print and return classification metrics for ``model`` on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` (and optionally ``predict_proba``)
    x_test : test features passed to ``model.predict``
    y_test : true labels for ``x_test``

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, jaccard, kappa, auc, matrix)``
    """
    # Round once so a model that emits fractional predictions is still scored
    # on class labels.
    yhat = model.predict(x_test).round()
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat)
    print('Recall: %f' % recall)
    # Support-weighted F1. `average` must be given by keyword: the third
    # positional parameter of f1_score is `labels`, so a positional 'weighted'
    # never selected weighted averaging (and is rejected by newer scikit-learn).
    f1 = f1_score(y_test, yhat, average='weighted')
    print('F1 score: %f' % f1)
    # Support-weighted Jaccard index (same keyword requirement as above)
    jaccard = jaccard_score(y_test, yhat, average='weighted')
    print('Jaccard: %f' % jaccard)
    # kappa
    kappa = cohen_kappa_score(y_test, yhat)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC: rank by the positive-class probability when the model provides
    # one; fall back to the hard labels otherwise.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(x_test)[:, 1]
    else:
        scores = yhat
    auc = roc_auc_score(y_test, scores)
    print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(y_test, yhat)
    print(matrix)

    return accuracy,precision,recall,f1,jaccard,kappa,auc,matrix

In [None]:
# -------------------------------------------TABLE CREATION--------------------------------------------------------
# One row per model, one column per metric.
df3 = pd.DataFrame(index=['KNN','Logistic Regression','Random Forest', ], 
                  columns=['Accuracy','Precision','Recall','Kappa','Jaccard', 'F1-score', 'ROC','Confusion Matrix', 'LogLoss'])


def _metrics_row(model, features, labels, with_log_loss=False):
    """Print the test metrics of ``model`` and return them as one table row.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``
    features : test features
    labels : true labels for ``features``
    with_log_loss : when True the last entry is the log loss, otherwise NaN

    Returns
    -------
    list
        ``[accuracy, precision, recall, kappa, jaccard, f1, auc, matrix, log_loss]``,
        in the column order of the metrics table.
    """
    predicted = model.predict(features)
    probabilities = model.predict_proba(features)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(labels, predicted)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(labels, predicted)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(labels, predicted)
    print('Recall: %f' % recall)
    # Support-weighted F1 and Jaccard. `average` must be given by keyword: the
    # third positional parameter of these functions is `labels`, so a
    # positional 'weighted' never selected weighted averaging (and is rejected
    # by newer scikit-learn).
    f1 = f1_score(labels, predicted, average='weighted')
    print('F1 score: %f' % f1)
    jaccard = jaccard_score(labels, predicted, average='weighted')
    print('Jaccard: %f' % jaccard)
    # kappa
    kappa = cohen_kappa_score(labels, predicted)
    print('Cohens kappa: %f' % kappa)
    # ROC AUC from the positive-class probability rather than the hard labels,
    # so that it measures ranking quality across all thresholds.
    auc = roc_auc_score(labels, probabilities[:, 1])
    print('ROC AUC: %f' % auc)
    # confusion matrix
    matrix = confusion_matrix(labels, predicted)
    print(matrix)

    ll = log_loss(labels, probabilities) if with_log_loss else np.nan
    return [accuracy, precision, recall, kappa, jaccard, f1, auc, matrix, ll]


# (row label, fitted model, report log loss?) -- as before, log loss is only
# reported for logistic regression.
for model_name, fitted_model, report_log_loss in (
        ('KNN', knn_model, False),
        ('Logistic Regression', lr_model, True),
        ('Random Forest', rf_random, False)):
    df3.loc[model_name] = _metrics_row(fitted_model, x_test, y_test, report_log_loss)




In [None]:
# Display the metrics table built in the previous cell (selected attributes).
df3

In [None]:
# Display the earlier metrics table (all attributes) again for comparison.
df2

**Here we can see the difference in accuracy between the models trained on all the attributes (table df2) and the models trained on the selected attributes (table df3).**