In [1]:
import pandas as pd
from numpy import nan
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
dataset = pd.read_csv("diabetes.csv")

In [3]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


*In the above dataset it can be noted that there are many values which are '0'. These value's can adversely affect the accuracy of our model. Hence it is necessary to replace these with some meaningful value's.

### Let's count the number of missing value in each row 

In [4]:
num_missing_val = (dataset[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]==0).sum()

# Have excluded the Outcome row as in that row 1 denotes True and 0 denotes False

In [5]:
print(num_missing_val)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64


* These are the total number of missing values present in the dataset.

### We will convert these 0 to nan

In [6]:
dataset[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] = dataset[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']].replace(0,nan)

In [7]:
dataset.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,0
2,8.0,183.0,64.0,,,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5.0,116.0,74.0,,,25.6,0.201,30,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10.0,115.0,,,,35.3,0.134,29,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8.0,125.0,96.0,,,,0.232,54,1


### Techniques of dealing with missing data

There are a few techniques which can help you deal with missing values in your data set —

* Drop missing values/columns/rows

* Imputation

      - A constant value that belongs to the set of possible values of that variable
      
      - A mean, median or mode value for the column
      
      - A value estimated by another predictive model
      
      - Multiple Imputation

In [8]:
# Have used one of the imputation technique to fill the the missing values with the mean of the dataset.

dataset.fillna(dataset.mean(), inplace=True) 

In [9]:
dataset.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,4.494673,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5.0,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10.0,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8.0,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1
