From `https://machinelearningmastery.com/handle-missing-data-python/`

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The CSV has no header line, so its first row must be read as data.
df = pd.read_csv(filepath_or_buffer='./data/pima-indians-diabetes.csv', header=None)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Make the dataset a little more readable ##

In [3]:
# Attribute list for the dataset (attribute N lives in column index N-1):
#    1. Number of times pregnant
#    2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
#    3. Diastolic blood pressure (mm Hg)
#    4. Triceps skin fold thickness (mm)
#    5. 2-Hour serum insulin (mu U/ml)
#    6. Body mass index (weight in kg/(height in m)^2)
#    7. Diabetes pedigree function
#    8. Age (years)
#    9. Class variable (0 or 1)
#
# rename() returns a new frame; rebinding df avoids inplace mutation.
# Re-running the cell is harmless: keys that no longer exist are ignored.
df = df.rename(columns={
    0: 'times_preg',
    1: 'plasma_glucose',
    2: 'blood_pressure',
    3: 'skin_thickness',
    4: 'serum_insulin',
    5: 'bmi',
    6: 'pedigree_fn',
    7: 'age',
    8: 'result',
})
df.head()

Unnamed: 0,times_preg,plasma_glucose,blood_pressure,skin_thickness,serum_insulin,bmi,pedigree_fn,age,result
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Explore the dataset a little bit #

In [4]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   times_preg      768 non-null    int64  
 1   plasma_glucose  768 non-null    int64  
 2   blood_pressure  768 non-null    int64  
 3   skin_thickness  768 non-null    int64  
 4   serum_insulin   768 non-null    int64  
 5   bmi             768 non-null    float64
 6   pedigree_fn     768 non-null    float64
 7   age             768 non-null    int64  
 8   result          768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Per-column count of NaN entries (isna is the same check as isnull).
df.isna().sum()

times_preg        0
plasma_glucose    0
blood_pressure    0
skin_thickness    0
serum_insulin     0
bmi               0
pedigree_fn       0
age               0
result            0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,times_preg,plasma_glucose,blood_pressure,skin_thickness,serum_insulin,bmi,pedigree_fn,age,result
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Show the first five rows again as a reference before cleaning starts.
df.head()

Unnamed: 0,times_preg,plasma_glucose,blood_pressure,skin_thickness,serum_insulin,bmi,pedigree_fn,age,result
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Start cleaning data #

## Zero does not make sense in these columns/features ##

-plasma_glucose <br/>
-blood_pressure <br/>
-skin_thickness <br/>
-serum_insulin <br/>
-bmi

In [8]:
# Features for which a value of zero is treated as a missing measurement.
selected_columns = ['plasma_glucose', 'blood_pressure', 'skin_thickness', 'serum_insulin', 'bmi']
# Comparing to 0 gives a boolean frame; summing counts the zeros per column.
num_missing = (df[selected_columns] == 0).sum()
print(num_missing)
# len(df) is the number of rows (records), so label it as such.
print(f'Total Rows: {len(df)}')

plasma_glucose      5
blood_pressure     35
skin_thickness    227
serum_insulin     374
bmi                11
dtype: int64
Total Columns: 768


Mark the zeros (0) as NaN so they're not treated as actual integer values

In [9]:
# Swap every zero in the selected feature columns for NaN so pandas counts it as missing.
df[selected_columns] = df[selected_columns].replace(to_replace=0, value=np.nan)

In [10]:
# Number of NaN entries in each column of df.
print(df.isna().sum())

times_preg          0
plasma_glucose      5
blood_pressure     35
skin_thickness    227
serum_insulin     374
bmi                11
pedigree_fn         0
age                 0
result              0
dtype: int64


In [11]:
# (rows, columns) of the frame; marking zeros as NaN does not remove any rows.
df.shape

(768, 9)

### Try the crude method of dropping all rows with null data and check the results ###

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [13]:
# Keep only the rows with no NaN in any column. dropna() returns a new
# frame, so df is left untouched and no explicit copy is needed.
df_drop = df.dropna()
# Plain NumPy array of the remaining rows, used for positional slicing below.
df_values = df_drop.to_numpy()

df_drop.shape

(392, 9)

In [14]:
# Feature matrix: the first eight columns (everything before the class label).
X = df_values[:, :8]
X.shape

(392, 8)

In [15]:
# Target vector: column index 8, the 'result' class label.
y = df_values[:, 8]
y.shape

(392,)

In [17]:
# define the model
model = LinearDiscriminantAnalysis()

# define the model evaluation procedure:
# 10 folds, shuffled with a fixed seed so the split is repeatable
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# evaluate the model: one accuracy score per fold
drop_null_result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
drop_null_accuracy = round(drop_null_result.mean(), 3)
# report the mean performance; the '%' format spec scales by 100 itself, which
# avoids float artefacts from multiplying manually (e.g. 56.99999999999999%)
print(f'drop_null_accuracy: {drop_null_accuracy:.1%}')

drop_null_accuracy: 78.3%
