In [21]:
# Dealing with missing data
# create simple example data frame from a csv

import pandas as pd
from io import StringIO
# Raw CSV text: one header row followed by three records. The empty fields
# (column C of the second record, column D of the third) are read as NaN.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''
# read_csv wants a path or a file-like object, so wrap the string in a buffer
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [22]:
df.isnull().sum(axis=0)  # summing the boolean NaN mask down each column counts its missing values

A    0
B    0
C    1
D    1
dtype: int64

In [23]:
# Remove samples or features that contain missing values

df.dropna(axis=0, how='any')  # axis=0: discard every sample (row) holding at least one NaN

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [24]:
# discard a row only when every one of its columns is NaN
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [25]:
# keep only the rows that have at least 4 non-NaN values; drop the rest
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [26]:
# discard a row only when NaN appears in one of the listed columns (here: 'C')
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


In [28]:
# Imputing missing values

# Mean imputation - replace each missing value with the mean of its entire feature column

# NOTE: sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

# SimpleImputer has no `axis` argument: statistics are always computed per
# column (the old axis=0 behaviour); row-wise imputation is no longer offered.
# missing_values is left at its default (np.nan), which is the marker pandas
# uses for the empty CSV fields.
imr = SimpleImputer(strategy='mean')
# Fit and transform on the same NumPy array so the estimator never sees a mix
# of DataFrame input at fit time and ndarray input at transform time.
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])

In [None]:
# Understanding the scikit-learn estimator API