# cleaning data and dealing with missing values 

#### the dataset contains medical details of Pima Indian patients, used to predict which of them will become diabetic within the next five years

import the required packages

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math 
import scipy as sp

save the dataset in a dataframe "med" and add column headings for clarity

key for column headings: 
- p = no. of times pregnant
- pgc = plasma glucose concentration
- bp = blood pressure
- tst = triceps skinfold thickness
- s = serum insulin
- bmi = body mass index
- d = diabetes pedigree function
- age = age
- class = class

In [48]:
# Load the raw CSV straight from GitHub and label its columns with the short names from the key above.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
column_names = ["p", "pgc", "bp", "tst", "s", "bmi", "d", "age", "class"]
med = pd.read_csv(url, names=column_names)
med.head()

Unnamed: 0,p,pgc,bp,tst,s,bmi,d,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
# check for missing values: the summary statistics expose columns whose minimum
# is 0 even though a value of 0 is not plausible for that measurement

med.describe()

Unnamed: 0,p,pgc,bp,tst,s,bmi,d,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


in some cases it makes sense for a column to have a minimum value of 0, but for the following columns it doesn't:
- plasma glucose concentration 
- blood pressure
- triceps skinfold thickness
- serum insulin
- body mass index

hence we can safely assume that the instances of 0s in these columns imply missing values

In [51]:
# Count, per column, how many entries are exactly 0 in the measurements where 0 is not a valid reading.
zero_as_missing_cols = ["pgc", "bp", "tst", "s", "bmi"]
print(med[zero_as_missing_cols].eq(0).sum())

pgc      5
bp      35
tst    227
s      374
bmi     11
dtype: int64


so, pgc, bp and bmi have very few missing values as compared to tst and s. we should mark these values as NaN so that they are left out of operations performed on the data. 

### marking the missing values

In [53]:
# Replace the placeholder zeros with NaN so pandas treats them as missing.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
zero_as_missing_cols = ["pgc", "bp", "tst", "s", "bmi"]
med[zero_as_missing_cols] = med[zero_as_missing_cols].replace(0, np.nan)

# Per-column count of missing values after the replacement.
print(med.isnull().sum())

p          0
pgc        5
bp        35
tst      227
s        374
bmi       11
d          0
age        0
class      0
dtype: int64


In [54]:
# Show the first 20 rows to eyeball where the NaN markers landed.
med.iloc[:20]

Unnamed: 0,p,pgc,bp,tst,s,bmi,d,age,class
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


there are two ways to deal with missing values:
- remove the rows with missing values from the dataset
- impute missing values

### removing rows

In [55]:
# Report the frame's shape before and after discarding every row that contains at least one NaN.
print("original shape: ", med.shape, sep="\n")
med.dropna(inplace = True)
print("after removing rows: ", med.shape, sep="\n")

original shape: 
(768, 9)
after removing rows: 
(392, 9)


now we can use algorithms that cannot handle NaN values, such as LDA

import the required functions

In [59]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score as cvs

In [58]:
# Separate the predictors from the label:
#   x -> the first eight columns as a NumPy array
#   y -> the "class" column as a pandas Series

values = med.values
x = values[:, :8]
y = med["class"]


In [60]:
# evaluate an LDA model using k-fold cross validation

model = lda()
# `random_state` is only meaningful together with `shuffle=True`; recent scikit-learn
# releases raise a ValueError when it is passed without shuffling. The folds here are
# contiguous and unshuffled (as before), so the seed is simply dropped and the split
# stays deterministic.
kfold = KFold(n_splits = 3)
result = cvs(model, x, y, cv = kfold, scoring = 'accuracy')

# mean accuracy across the three folds
print(result.mean())

0.78582892934


### imputing missing values 

there are different values to impute missing values with: 
- 0 or any other value present in the data
- the mean, median or mode value of the column
- a value estimated by another predictive model 

#### important - any imputation performed on the training data must also be applied to new data when predictions are required


In [61]:
# Reload the raw data (the earlier cell dropped rows in place) and re-mark the zeros as missing.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
med = pd.read_csv(url, names = ["p", "pgc", "bp", "tst", "s", "bmi", "d", "age", "class"])

# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
zero_as_missing_cols = ["pgc", "bp", "tst", "s", "bmi"]
med[zero_as_missing_cols] = med[zero_as_missing_cols].replace(0, np.nan)

# Fill every missing entry with the mean of its own column.
med = med.fillna(med.mean())

#count the number of NaN values now
print(med.isnull().sum())

p        0
pgc      0
bp       0
tst      0
s        0
bmi      0
d        0
age      0
class    0
dtype: int64


In [62]:
med.head(20)

Unnamed: 0,p,pgc,bp,tst,s,bmi,d,age,class
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


In [63]:
# Reload the raw data and re-mark the placeholder zeros as missing before imputing with scikit-learn.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
med = pd.read_csv(url, names = ["p", "pgc", "bp", "tst", "s", "bmi", "d", "age", "class"])

# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
zero_as_missing_cols = ["pgc", "bp", "tst", "s", "bmi"]
med[zero_as_missing_cols] = med[zero_as_missing_cols].replace(0, np.nan)

# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn; its replacement is
# `sklearn.impute.SimpleImputer`, whose defaults (NaN markers, column mean) match the old class.
# It is imported under the old name so that later cells calling `Imputer()` keep working.
from sklearn.impute import SimpleImputer as Imputer

values = med.values
imp = Imputer()
trans_values = imp.fit_transform(values)

# total number of NaN entries left after imputation
print(np.isnan(trans_values).sum())

0


In [67]:
# Predictors are the first eight columns; the label is the "class" column.
values = med.values
x = values[:, :8]
y = med["class"]

# Learn the per-column fill values from the predictors, then apply them.
# NOTE(review): the imputer is fitted on all rows before cross-validation, so fold
# statistics are shared between train and validation splits - confirm this is acceptable.
imp = Imputer()
trans_x = imp.fit(x).transform(x)

In [73]:
# evaluate an LDA model using k-fold cross validation

model = lda()
# `random_state` is only meaningful together with `shuffle=True`; recent scikit-learn
# releases raise a ValueError when it is passed without shuffling. The folds here are
# contiguous and unshuffled (as before), so the seed is simply dropped and the split
# stays deterministic.
kfold = KFold(n_splits = 4)
result = cvs(model, trans_x, y, cv = kfold, scoring = 'accuracy')

# per-fold accuracies, followed by their mean
print(result)
print(result.mean())

[ 0.765625    0.70833333  0.80729167  0.79166667]
0.768229166667


In [74]:
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [78]:
# Hold out part of the imputed data for testing (default split proportions).
# The split is seeded so that re-running the notebook yields the same partition and
# therefore the same downstream predictions; 7 matches the seed used elsewhere in this notebook.
x_train, x_test, y_train, y_test = model_selection.train_test_split(trans_x, y, random_state = 7)

In [79]:
# Train a logistic-regression classifier (library defaults) on the training split.
# The fitted estimator is the cell's last expression so its parameters are displayed.
lr = LogisticRegression()
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
# Predict the class label for every held-out row and show the raw label array.
pred = lr.predict(X=x_test)
print(pred)

[0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0
 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 1]


In [81]:
# Print the true labels of the held-out rows for comparison with the predictions printed above.
print(y_test)

534    0
700    0
462    0
370    1
184    0
72     1
432    0
27     0
679    0
74     0
249    0
401    0
283    1
315    0
755    1
733    0
148    0
38     1
398    0
695    1
617    0
282    0
638    1
464    0
709    1
657    0
429    1
403    0
124    1
73     0
      ..
665    0
236    1
454    0
688    0
565    0
701    1
598    1
634    0
763    0
600    0
567    0
364    0
478    0
395    0
557    0
150    0
187    1
42     0
741    0
95     0
545    1
439    0
596    0
127    0
81     0
437    0
644    0
742    0
34     0
125    1
Name: class, Length: 192, dtype: int64
