na_values : scalar, str, list-like, or dict, default None
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values.  By default the following values are interpreted as
    NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
    'null'.

In [5]:
import numpy as np 
import pandas as pd

df = pd.read_csv('diabetes_null.csv',na_values = ['#NAME?'])

In [6]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,5,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,4.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.210,3,0
6,3,78.0,5.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,7.0,45.0,543.0,3.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


In [8]:
df.shape

(768, 9)

In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     763 non-null float64
BloodPressure               733 non-null float64
SkinThickness               541 non-null float64
Insulin                     394 non-null float64
BMI                         757 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(6), int64(3)
memory usage: 54.1 KB


In [11]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,99.310616,55.706685,25.876155,105.659898,29.865654,0.490439,30.990885,0.348958
std,3.369578,55.115381,29.878852,12.967816,116.862508,10.686049,0.34605,13.281475,0.476951
min,0.0,1.0,1.0,1.0,1.0,2.0,0.1,3.0,0.0
25%,1.0,72.5,16.0,18.0,21.0,25.4,0.25375,23.0,0.0
50%,3.0,113.0,66.0,27.0,71.0,32.0,0.3825,28.0,0.0
75%,6.0,138.0,76.0,35.0,151.0,36.1,0.6475,38.25,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [12]:
df[df.duplicated()].sum()

Pregnancies                 0.0
Glucose                     0.0
BloodPressure               0.0
SkinThickness               0.0
Insulin                     0.0
BMI                         0.0
DiabetesPedigreeFunction    0.0
Age                         0.0
Outcome                     0.0
dtype: float64

In [13]:
df.drop_duplicates(inplace = True)

In [14]:
df.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [16]:
df.dropna?

subset : array-like, optional
    Labels along other axis to consider, e.g. if you are dropping rows
    these would be a list of columns to include.

In [17]:
df = df.dropna(subset = ['Glucose'])

In [18]:
df.isna().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness               227
Insulin                     370
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

## Handling missing data

In [19]:
# Deleting 
df1 = pd.read_csv('diabetes_null.csv',na_values = ['#NAME?'])
#code to see the missing values in descending
df1.isnull().sum().sort_values(ascending = False)
df_no_missing = df1.dropna(axis = 0)
print(df_no_missing.head())

    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
3             1     89.0           66.0           23.0     94.0  28.1   
4             0    137.0            4.0           35.0    168.0  43.1   
6             3     78.0            5.0           32.0     88.0  31.0   
8             2    197.0            7.0           45.0    543.0   3.5   
13            1    189.0            6.0           23.0    846.0   3.1   

    DiabetesPedigreeFunction  Age  Outcome  
3                      0.167   21        0  
4                      2.288   33        1  
6                      0.248   26        1  
8                      0.158   53        1  
13                     0.398   59        1  


In [20]:
df1.isnull().sum().sort_values(ascending = False)

Insulin                     374
SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Outcome                       0
Age                           0
DiabetesPedigreeFunction      0
Pregnancies                   0
dtype: int64

## Fill with mean

In [22]:
from sklearn.preprocessing import Imputer

In [24]:
df1 = pd.read_csv('diabetes_null.csv',na_values = ['#NAME?'])

#imputer to impute missing values with mean
imp = Imputer(missing_values = 'NaN',strategy = 'mean',axis = 0)

imp.fit(df1)
df_nonull = pd.DataFrame(data = imp.transform(df1),columns = df1.columns)



In [25]:
df_nonull.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,105.659898,33.6,0.627,5.0,1.0
1,1.0,85.0,66.0,29.0,105.659898,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,25.876155,105.659898,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,4.0,35.0,168.0,43.1,2.288,33.0,1.0


In [27]:
df_nonull.isnull().sum().sort_values(ascending = False)

Outcome                     0
Age                         0
DiabetesPedigreeFunction    0
BMI                         0
Insulin                     0
SkinThickness               0
BloodPressure               0
Glucose                     0
Pregnancies                 0
dtype: int64

## Fill with median

In [28]:
df1.isnull().sum().sort_values(ascending = False)

Insulin                     374
SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Outcome                       0
Age                           0
DiabetesPedigreeFunction      0
Pregnancies                   0
dtype: int64

In [47]:
#imputer to impute missing values with mean
imp = Imputer(missing_values = 'NaN',strategy = 'median',axis = 0)

imp.fit(df1)
df1 = pd.DataFrame(data = imp.transform(df1),columns = df1.columns)

df1.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,71.0,33.6,0.627,5.0,1.0
1,1.0,85.0,66.0,29.0,71.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,27.0,71.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,4.0,35.0,168.0,43.1,2.288,33.0,1.0


## Handling Data impurities

##### Function to find outliers
Order of Execution
PEDMAS - Paranthesis, Exponential, Division, Mul, Add, Sub

In [42]:
def find_outliers(x):
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    iqr = q3-q1 
    lw = q1 - 1.5*iqr
    uw = q3 + 1.5*iqr
    outlier_indices = list(x.index[(x < lw) | (x > uw)])
    outlier_values = list(x[outlier_indices])
    return outlier_indices, outlier_values

## find outliers in each columns

In [44]:
glucose_index, glucose_value = find_outliers(df1['Glucose'])
print(glucose_index)
print(glucose_value)

[]
[]


In [45]:
Pregnancies_index, Pregnancies_value = find_outliers(df1['Pregnancies'])
print(Pregnancies_index)
print(Pregnancies_value)

[88, 159, 298, 455]
[15, 17, 14, 14]


In [48]:
SkinThickness_index, SkinThickness_value = find_outliers(df1['SkinThickness'])
print(SkinThickness_index)
print(SkinThickness_value)

[19, 55, 57, 66, 70, 73, 74, 75, 86, 92, 98, 99, 120, 122, 126, 135, 141, 150, 161, 182, 194, 202, 211, 215, 217, 273, 275, 279, 291, 310, 313, 321, 322, 356, 357, 358, 370, 373, 380, 397, 409, 411, 427, 445, 455, 458, 462, 463, 466, 470, 480, 491, 532, 534, 539, 540, 551, 556, 558, 566, 569, 574, 576, 579, 590, 591, 621, 631, 644, 652, 657, 669, 671, 693, 704, 713, 723, 725, 741, 763]
[3.0, 1.0, 6.0, 3.0, 2.0, 2.0, 3.0, 2.0, 54.0, 4.0, 3.0, 51.0, 56.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0, 2.0, 2.0, 2.0, 54.0, 4.0, 3.0, 5.0, 52.0, 1.0, 3.0, 3.0, 1.0, 3.0, 2.0, 4.0, 3.0, 4.0, 48.0, 4.0, 3.0, 4.0, 49.0, 3.0, 3.0, 63.0, 3.0, 48.0, 4.0, 3.0, 1.0, 4.0, 3.0, 3.0, 52.0, 3.0, 49.0, 4.0, 3.0, 4.0, 4.0, 3.0, 3.0, 3.0, 2.0, 99.0, 4.0, 5.0, 2.0, 4.0, 3.0, 4.0, 48.0, 3.0, 1.0, 49.0, 2.0, 2.0, 3.0, 4.0, 2.0, 48.0]


In [49]:
Insulin_index, Insulin_value = find_outliers(df1['Insulin'])
print(Insulin_index)
print(Insulin_value)

[3, 4, 6, 8, 13, 14, 16, 19, 20, 24, 25, 27, 28, 31, 32, 35, 39, 40, 43, 51, 52, 53, 54, 56, 57, 59, 63, 68, 69, 70, 71, 73, 85, 88, 91, 92, 95, 99, 103, 105, 107, 108, 109, 110, 111, 112, 114, 119, 120, 122, 125, 126, 127, 128, 130, 132, 134, 135, 136, 137, 139, 144, 147, 150, 152, 153, 156, 157, 158, 159, 161, 162, 165, 171, 173, 174, 175, 177, 181, 182, 186, 188, 189, 191, 195, 197, 198, 199, 203, 204, 206, 208, 213, 214, 215, 216, 217, 220, 223, 224, 225, 228, 229, 231, 232, 234, 236, 241, 243, 244, 248, 252, 254, 258, 259, 260, 271, 273, 277, 279, 281, 282, 285, 286, 287, 288, 290, 292, 293, 295, 296, 297, 298, 301, 302, 305, 306, 307, 308, 309, 311, 312, 313, 315, 318, 320, 323, 325, 326, 328, 331, 335, 338, 340, 345, 353, 356, 358, 359, 360, 364, 369, 370, 371, 373, 374, 375, 376, 382, 384, 385, 388, 390, 392, 393, 395, 396, 402, 405, 409, 411, 412, 414, 415, 419, 420, 424, 425, 427, 428, 429, 431, 432, 441, 442, 445, 446, 447, 450, 452, 454, 458, 459, 460, 462, 465, 466, 467, 4

In [50]:
## VIF - Variation Inflation Factor

In [52]:
# Dropping the outlier values
del_df = df1.drop(Pregnancies_index)
print(del_df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0     71.0  33.6   
1          1.0     85.0           66.0           29.0     71.0  26.6   
2          8.0    183.0           64.0           27.0     71.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0            4.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627   5.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  


#### Replace with min

In [None]:
min_in = np.min(del_df)