# Handle missing values in the training set
# To do that, let's load a small sample of CSV data and look at the different options for handling the missing values in it

In [2]:
import pandas as pd
from io import StringIO
# Small in-memory CSV with deliberate gaps: row 1 is missing C,
# row 2 is missing both B and C.
# The u prefix makes this a text (unicode) string on Python 2 as well as
# Python 3, which io.StringIO requires; it replaces the Python-2-only
# unicode(...) call.
csv_data = u'''A,B,C,D
1,2,3,4
5,6,,8
9,,,12
13,14,15,16'''

# Empty CSV fields are parsed as NaN, so columns B and C become float columns.
df = pd.read_csv(StringIO(csv_data))
# Parenthesised form prints the same text on Python 2 and Python 3.
print(df)

    A     B     C   D
0   1   2.0   3.0   4
1   5   6.0   NaN   8
2   9   NaN   NaN  12
3  13  14.0  15.0  16


# Different options to drop missing values

In [3]:
# Count the missing (NaN) entries in each column (axis=0 sums down the rows)
df.isnull().sum(axis=0)

A    0
B    1
C    2
D    0
dtype: int64

In [4]:
# Drop every row that contains at least one NaN (the dropna defaults, spelled out)
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,4
3,13,14.0,15.0,16


In [5]:
# Drop only the rows in which every single column is NaN
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,4
1,5,6.0,,8
2,9,,,12
3,13,14.0,15.0,16


In [6]:
# Drop a row only when it has a NaN in one of the listed columns (here just 'C')
required_columns = ['C']
df.dropna(subset=required_columns)

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,4
3,13,14.0,15.0,16


In [73]:
# Setting axis=1 switches dropna from rows to columns:
# every column that contains at least one NaN is removed
df.dropna(axis=1, how='any')

Unnamed: 0,A,D
0,1,4
1,5,8
2,9,12
3,13,16


# Imputing missing values
Replace each missing value with the mean, the median, or the most frequent value of its feature column.

In [78]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer (scikit-learn >= 0.20) is its replacement.
from sklearn.impute import SimpleImputer

# SimpleImputer always imputes column-wise (the old axis=0 behaviour) and
# treats NaN as the missing-value marker by default.
# Fit and transform on the same NumPy array so the two steps see identical
# input (the original fitted on the DataFrame but transformed df.values).
feature_matrix = df.values

# Mean IMR: replace each NaN with the mean of its column
mean_imr = SimpleImputer(strategy='mean')
mean_imr = mean_imr.fit(feature_matrix)
mean_imputed_df = mean_imr.transform(feature_matrix)
# str.format keeps the printed text identical on Python 2 and Python 3
print("Mean imputed DF:\n{}".format(mean_imputed_df))

# MEDIAN IMR: replace each NaN with the median of its column
median_imr = SimpleImputer(strategy='median')
median_imr = median_imr.fit(feature_matrix)
median_imputed_df = median_imr.transform(feature_matrix)
print("Median imputed DF:\n{}".format(median_imputed_df))

# MOST_FREQUENT IMR: replace each NaN with the most frequent value of its
# column (when several values tie, the smallest one is used)
most_frequent_imr = SimpleImputer(strategy='most_frequent')
most_frequent_imr = most_frequent_imr.fit(feature_matrix)
most_frequent_imputed_df = most_frequent_imr.transform(feature_matrix)
print("Most Frequent imputed DF:\n{}".format(most_frequent_imputed_df))

Mean imputed DF:
[[  1.           2.           3.           4.        ]
 [  5.           6.           9.           8.        ]
 [  9.           7.33333333   9.          12.        ]
 [ 13.          14.          15.          16.        ]]
Median imputed DF:
[[  1.   2.   3.   4.]
 [  5.   6.   9.   8.]
 [  9.   6.   9.  12.]
 [ 13.  14.  15.  16.]]
Most Frequent imputed DF:
[[  1.   2.   3.   4.]
 [  5.   6.   3.   8.]
 [  9.   2.   3.  12.]
 [ 13.  14.  15.  16.]]
