# STEP1: IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF


from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from pandas.plotting import scatter_matrix

# STEP2: LOAD THE DATASET

In [2]:
df= pd.read_csv("diabetes.csv")

# Check zero values and replace them with mean values

In [3]:
#checking
print('No. of zero values in Glucose ', df[df['Glucose']==0].shape[0])
print('No. of zero values in BloodPressure ', df[df['BloodPressure']==0].shape[0])
print('No. of zero values in SkinThickness ', df[df['SkinThickness']==0].shape[0])
print('No. of zero values in Insulin ', df[df['Insulin']==0].shape[0])
print('No. of zero values in BMI ', df[df['BMI']==0].shape[0])

No. of zero values in Glucose  5
No. of zero values in BloodPressure  35
No. of zero values in SkinThickness  227
No. of zero values in Insulin  374
No. of zero values in BMI  11


In [4]:
#replace
df['Glucose'] = df['Glucose'].replace(0,df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0,df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0,df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0,df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0,df['BMI'].mean())

In [5]:
#Again checking
print('No. of zero values in Glucose ', df[df['Glucose']==0].shape[0])
print('No. of zero values in BloodPressure ', df[df['BloodPressure']==0].shape[0])
print('No. of zero values in SkinThickness ', df[df['SkinThickness']==0].shape[0])
print('No. of zero values in Insulin ', df[df['Insulin']==0].shape[0])
print('No. of zero values in BMI ', df[df['BMI']==0].shape[0])

No. of zero values in Glucose  0
No. of zero values in BloodPressure  0
No. of zero values in SkinThickness  0
No. of zero values in Insulin  0
No. of zero values in BMI  0


# Data Cleaning

In [7]:
df.shape

(768, 9)

In [8]:
q = df['Pregnancies'].quantile(0.98)
# we are removing the top 2% data from the Pregnancies column
data_cleaned = df[df['Pregnancies']<q]
q = data_cleaned['BMI'].quantile(0.99)
# we are removing the top 1% data from the BMI column
data_cleaned  = data_cleaned[data_cleaned['BMI']<q]
q = data_cleaned['SkinThickness'].quantile(0.99)
# we are removing the top 1% data from the SkinThickness column
data_cleaned  = data_cleaned[data_cleaned['SkinThickness']<q]
q = data_cleaned['Insulin'].quantile(0.95)
# we are removing the top 5% data from the Insulin column
data_cleaned  = data_cleaned[data_cleaned['Insulin']<q]
q = data_cleaned['DiabetesPedigreeFunction'].quantile(0.99)
# we are removing the top 1% data from the DiabetesPedigreeFunction column
data_cleaned  = data_cleaned[data_cleaned['DiabetesPedigreeFunction']<q]
q = data_cleaned['Age'].quantile(0.99)
# we are removing the top 1% data from the Age column
dfc  = data_cleaned[data_cleaned['Age']<q]

In [9]:
dfc.shape

(674, 9)

# SPLIT THE DATA FRAME INTO x AND y

In [10]:
target_name = 'Outcome'
y= dfc[target_name]
X= dfc.drop(target_name, axis=1)

# APPLY FEATURE SCALING

In [11]:
scaler = StandardScaler()
scaler.fit(X)
SSX= scaler.transform(X)

In [12]:
SSX

array([[ 7.96753910e-01,  9.83984062e-01,  4.52611463e-04, ...,
         2.65819648e-01,  6.30484542e-01,  1.60141519e+00],
       [-8.64793539e-01, -1.16977621e+00, -5.04474494e-01, ...,
        -8.31445036e-01, -3.38078670e-01, -1.32706484e-01],
       [ 1.46137289e+00,  2.18051755e+00, -6.72783529e-01, ...,
        -1.34872696e+00,  7.88402456e-01, -4.14369227e-02],
       ...,
       [ 4.64444420e-01,  6.09439465e-02,  4.52611463e-04, ...,
        -8.94145875e-01, -7.10063091e-01, -2.23976046e-01],
       [-8.64793539e-01,  2.31877301e-01, -1.00940160e+00, ...,
        -2.82812694e-01, -3.45097244e-01,  1.32760650e+00],
       [-8.64793539e-01, -8.96282840e-01, -1.67856424e-01, ...,
        -2.35787064e-01, -4.64413001e-01, -8.62862978e-01]])

# TRAIN-TEST SPLIT

In [14]:
X_train, X_test, y_train, y_test = train_test_split(SSX, y, test_size=0.25, random_state=355)

In [15]:
X_train.shape, y_train.shape

((505, 8), (505,))

In [16]:
X_test.shape, y_test.shape

((169, 8), (169,))

# BUILDING THE CLASSIFICATION ALGORITHM, 
# MODEL PREDICTION AND 
# MODEL EVALUATION

### 1. Logistic Regression

In [176]:
lr= LogisticRegression(C=1.0,penalty='l2',solver='lbfgs',random_state=32,max_iter=100)
lr.fit(X_train, y_train)

In [177]:
lr_predict= lr.predict(X_test)

In [184]:
print("Train accuracy of Logistic Regression: ", lr.score(X_train, y_train)*100)
print("Accuracy (Test) score of Logistic Regression: ", accuracy_score(y_test, lr_predict)*100)
print("Misclassification rate of Logistic Regression: ", 100-(accuracy_score(y_test, lr_predict)*100))

Train accuracy of Logistic Regression:  77.42574257425743
Accuracy (Test) score of Logistic Regression:  83.4319526627219
Misclassification rate of Logistic Regression:  16.568047337278102


### 2. K Nearest Neighbours Classifier(KNN)

In [304]:
knn = KNeighborsClassifier(n_neighbors=100,weights='distance',algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn.fit(X_train, y_train)

In [305]:
knn_predict= knn.predict(X_test)

In [307]:
print("Train accuracy of KNN: ", knn.score(X_train, y_train)*100)
print("Accuracy (Test) score of KNN: ", accuracy_score(y_test, knn_predict)*100)
print("Misclassification rate of KNN: ", 100-(accuracy_score(y_test, knn_predict)*100))

Train accuracy of KNN:  100.0
Accuracy (Test) score of KNN:  78.69822485207101
Misclassification rate of KNN:  21.301775147928993
