In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [2]:
data = pd.read_csv('Pima Indians Diabetes Database.csv')
data.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## We'll refine the data first and take only important columns but here everything looks reasonable

## We'll now check for missing values

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Number of times pregnant                                                    768 non-null int64
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    768 non-null int64
Diastolic blood pressure (mm Hg)                                            768 non-null int64
Triceps skin fold thickness (mm)                                            768 non-null int64
2-Hour serum insulin (mu U/ml)                                              768 non-null int64
Body mass index (weight in kg/(height in m)^2)                              768 non-null float64
Diabetes pedigree function                                                  768 non-null float64
Age (years)                                                                 768 non-null int64
Class variable (0 or 1)                                                     768 non-null int64
dtypes: float64(2), int64(7)
memory u

In [4]:
data.isnull().sum()

Number of times pregnant                                                    0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    0
Diastolic blood pressure (mm Hg)                                            0
Triceps skin fold thickness (mm)                                            0
2-Hour serum insulin (mu U/ml)                                              0
Body mass index (weight in kg/(height in m)^2)                              0
Diabetes pedigree function                                                  0
Age (years)                                                                 0
Class variable (0 or 1)                                                     0
dtype: int64

In [5]:
data.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Although data didn't show any missing value but we can see from descriptive statistics that some values are 0 so we will convert them to NaN and then proceed further

### We'll convert 0 to NaN values where required

In [6]:
cols = ['Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', 
     '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 
     'Age (years)']
data[cols] = data[cols].replace(0,np.nan)
data.isnull().sum()

Number of times pregnant                                                      0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test      5
Diastolic blood pressure (mm Hg)                                             35
Triceps skin fold thickness (mm)                                            227
2-Hour serum insulin (mu U/ml)                                              374
Body mass index (weight in kg/(height in m)^2)                               11
Diabetes pedigree function                                                    0
Age (years)                                                                   0
Class variable (0 or 1)                                                       0
dtype: int64

### Some algoritms do not work with missing values so we have to either impute or drop these values
### Let's take example of LDA, it doesn't work with missing values

In [7]:
values = data.values
X = values[:,0:8]
y = values[:,8]

In [8]:
X.shape

(768, 8)

In [9]:
y.shape

(768,)

## Performing LDA and KFold

In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [11]:
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## As you can see it throws an error 'Input contains NaN, infinity or a value too large for dtype('float64').' hence we have to either eliminate missing values or impute them with some other value



## We'll see how to remove missing values now

In [12]:
data_no_missing = data.copy()
data_no_missing.dropna(inplace=True)
data_no_missing.isnull().sum()

Number of times pregnant                                                    0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test    0
Diastolic blood pressure (mm Hg)                                            0
Triceps skin fold thickness (mm)                                            0
2-Hour serum insulin (mu U/ml)                                              0
Body mass index (weight in kg/(height in m)^2)                              0
Diabetes pedigree function                                                  0
Age (years)                                                                 0
Class variable (0 or 1)                                                     0
dtype: int64

In [13]:
data_no_missing.shape

(392, 9)

### As you can see the size of data reduced a lot after dropping all the missing values which is not the ideal scenario as it is too much data loss, still we'll try to predict the acuracy of model after using LDA

In [14]:
values = data_no_missing.values
X = values[:,0:8]
y = values[:,8]
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7858289293403797

## The model is now 78.5 % accurate, let's try and impute the values and check the accuracy of model again

## Imputing with mean

In [15]:
data_imputed = data.copy()

In [16]:
## using pandas we can impute as 
## data_imputed.fillna(data_imputed.mean(),inplace=True)

## using sklearn
from sklearn.preprocessing import Imputer

values = data_imputed.values
X = values[:,0:8]
y = values[:,8]

imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis=0)
transformed_values = imputer.fit_transform(values)

## for strategy as mode we give 'most_frequent'

np.isnan(transformed_values).sum()





0

### As you can see all NaN values have been replaced by mean

In [17]:
X = transformed_values[:,0:8]
y = transformed_values[:,8]

model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7669270833333334

### The accuracy of model decreased to 76.6 % after replacing Nan with mean, rather than 78.5 % when those missing values were dropped. We will try another strategy and se if we can increase the accuracy of model

## Trying a different library for imputing now, and we'll impute the values by 0 first and check the accuracy

In [18]:
data_imputed = data.copy()

from sklearn.impute import SimpleImputer

values = data_imputed.values
X = values[:,0:8]
y = values[:,8]

imputer = SimpleImputer(missing_values = np.nan, strategy = 'constant', fill_value = 0)
transformed_values = imputer.fit_transform(values)

## for strategy as mode we give 'most_frequent'

np.isnan(transformed_values).sum()

X = transformed_values[:,0:8]
y = transformed_values[:,8]

model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()


0.7734375

### As you can see the accuracy increased to 77.3 %, hence 0 is better than mean

### Imputing with mode

In [19]:
data_imputed = data.copy()

values = data_imputed.values
X = values[:,0:8]
y = values[:,8]

imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
transformed_values = imputer.fit_transform(values)

np.isnan(transformed_values).sum()

X = transformed_values[:,0:8]
y = transformed_values[:,8]

model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7643229166666666

### Accuracy decreased to 76.4 %

### Imputing with median

In [20]:
data_imputed = data.copy()

values = data_imputed.values
X = values[:,0:8]
y = values[:,8]

imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
transformed_values = imputer.fit_transform(values)

np.isnan(transformed_values).sum()

X = transformed_values[:,0:8]
y = transformed_values[:,8]

model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7682291666666666

### Accuracy is 76.8 %

## Below are the accuracies for different imputes

### Accuracy after dropping all missing values                                      = 78.5 %
### Accuracy after replacing missing values with Mean                        = 76.6 %
### Accuracy after replacing missing values with Median                     = 76.8 %
### Accuracy after replacing missing values with Mode                        = 76.4 %
### Accuracy after replacing missing values with 0                               = 77.3 %

## For this particular dataset dropping the missing values has highest accuracy but the margin is not too big so maybe missing values maybe be decisive

## One approach is to use assert to check if there are no unnecessary 0 values

In [21]:
assert pd.notnull(data).all().all()

AssertionError: 

## Assert will return nothing if the Assert statement is True otherwise it will return False and an Assertion Error

## KNN imputer for handling missing values

In [22]:
cols = ['Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', 
     '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 
     'Age (years)']
data_with_nan = data.copy()
data_with_nan[cols] = data[cols].replace(0,np.nan)
data_with_nan.isnull().sum()

Number of times pregnant                                                      0
Plasma glucose concentration a 2 hours in an oral glucose tolerance test      5
Diastolic blood pressure (mm Hg)                                             35
Triceps skin fold thickness (mm)                                            227
2-Hour serum insulin (mu U/ml)                                              374
Body mass index (weight in kg/(height in m)^2)                               11
Diabetes pedigree function                                                    0
Age (years)                                                                   0
Class variable (0 or 1)                                                       0
dtype: int64

In [23]:
from missingpy import KNNImputer
values = data_with_nan.values
X = values[:,0:8]
y = values[:,8]

imputer = KNNImputer(n_neighbors=3)
transformed_data = imputer.fit_transform(X)

In [24]:
transformed_data

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [25]:
X = transformed_values[:,0:8]
y = transformed_values[:,8]


model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7682291666666666

## Even after predicting the model using KNN imputer the accuracy is 76.8% 
### Let's try some variation in KNN prediction and se if the accuracy increases

In [26]:
values = data_with_nan.values
X = values[:,0:8]
y = values[:,8]

imputer = KNNImputer(missing_values='NaN', n_neighbors=2, weights='uniform', metric='masked_euclidean')
transformed_data = imputer.fit_transform(X)

X = transformed_values[:,0:8]
y = transformed_values[:,8]


model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring = 'accuracy')
result.mean()

0.7682291666666666