This notebook is a first tutorial in series of experiments on preprocessing

In [1]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)

In [2]:
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)


Score with the entire dataset = 0.56


In [3]:
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))

In [4]:
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool)))
missing_samples

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,

In [5]:
rng.shuffle(missing_samples)
missing_features = rng.randint(0, n_features, n_missing_samples)

In [6]:
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)


Score without the samples containing missing values = 0.48


In [7]:
# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)

Score after imputation of the missing values = 0.57


In [8]:
%pylab inline
import pandas as pd
from io import StringIO

Populating the interactive namespace from numpy and matplotlib


Initial, we use StringIO function to read a string and convert into pandas data frame

In [9]:
csv_data ='''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,8.0
10.0,11.0,12.0,'''
csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [10]:
# display in array
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   8.,  nan],
       [ 10.,  11.,  12.,  nan]])

Except when a number of missing values is small, dropping missing values' drawback include: a risk of losing value information 

In [11]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy ='mean',axis=0 ) 
imputer.fit(df)
imputed_data =imputer.transform(df.values)
imputed_data

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,   8.,   4.],
       [ 10.,  11.,  12.,   4.]])

Other strategies are median, most_frequent

In [12]:
# Row mean will be computed as axis=0
imputer = Imputer(missing_values='NaN', strategy ='mean',axis=1 ) 
imputer.fit(df)
imputed_data =imputer.transform(df.values)
imputed_data

array([[  1.        ,   2.        ,   3.        ,   4.        ],
       [  5.        ,   6.        ,   8.        ,   6.33333333],
       [ 10.        ,  11.        ,  12.        ,  11.        ]])

## Categorical data
Categorical data may be further divided into nominal and ordinal. In additional, ordinal can be sorted 


In [13]:
import pandas as pd
df = pd.DataFrame([
    ['green','M',10.1,'class1'],
    ['red','L',13.2,'class2'],
    ['blue','XL',15.2,'class1']
])
df.columns=['color','size','price','label']
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.2,class2
2,blue,XL,15.2,class1


### Transforming categorical features
E.g: Ordinal features such as cloth sizes may define as X = M+1 = S +2. In other word, X > M > S, will convert into number

In [14]:
size_mapping = {'XL':3,'L':2 ,'M':1}
df['size']= df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.2,class2
2,blue,3,15.2,class1


In [15]:
## we may transform back

In [16]:
inv_size_mapping = {v:k for k,v in size_mapping.items()}
inv_size_mapping

{1: 'M', 2: 'L', 3: 'XL'}

In [17]:
df['size']=df['size'].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.2,class2
2,blue,XL,15.2,class1


### Label encoding
Target class can be transformed to numeric with mapping

In [18]:
class_mapping = {label:idx for idx, label in enumerate(np.unique(df['label']))}
class_mapping

{'class1': 0, 'class2': 1}

In [19]:
df['label']=df['label'].map(class_mapping)
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,0
1,red,L,13.2,1
2,blue,XL,15.2,0


### Label mapping with sklearn

In [42]:
df = pd.DataFrame([
    ['green','M',10.1,'class1'],
    ['red','L',13.2,'class2'],
    ['blue','XL',15.2,'class1']
])
df.columns=['color','size','price','label']
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.2,class2
2,blue,XL,15.2,class1


In [43]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df.label)

array([0, 1, 0], dtype=int64)

In [44]:
le.inverse_transform([1, 1, 0])

array(['class2', 'class2', 'class1'], dtype=object)

### Using LabelEncoding for nominal features

In [39]:
X=df[['color','size','price']].values
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.2],
       ['blue', 'XL', 15.2]], dtype=object)

In [46]:
le_color= LabelEncoder()
X[:,0]= le_color.fit_transform(X[:,0])
X

array([[1, 'M', 10.1],
       [2, 'L', 13.2],
       [0, 'XL', 15.2]], dtype=object)

In [49]:
X[:,1]= le_color.fit_transform(X[:,1])
X

array([[1, 1, 10.1],
       [2, 0, 13.2],
       [0, 2, 15.2]], dtype=object)

### One hot encoder sklearn
However, sklearn provides OneHotEncoder to transform rm nominal features into nunmeric (similar to dummy in pandas data frame)

In [51]:
from sklearn.preprocessing import OneHotEncoder
# OnehotEnder for color feature 
ohe = OneHotEncoder(categorical_features=[0])
ohe.fit_transform(X).toarray()
# Note that first origin column transforms to 2 new columns, so does second origin column

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   0. ,  13.2],
       [  1. ,   0. ,   0. ,   2. ,  15.2]])

Note that: OneHotEncoder return sparse matrix by default so we use toarraty() to convert it back to a regular (dense) numpy array. We can avoid by using parameter in OnehotEncoder...,(**sparse=False**)

### One hot encoding with get_dummy pandas

In [53]:
# Create a dataframe
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 
        'sex': ['male', 'female', 'male', 'female', 'female']}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'sex'])
df

Unnamed: 0,first_name,last_name,sex
0,Jason,Miller,male
1,Molly,Jacobson,female
2,Tina,Ali,male
3,Jake,Milner,female
4,Amy,Cooze,female


In [54]:
df_sex = pd.get_dummies(df['sex'])
df = pd.concat([df, df_sex], axis=1) # or simply  df.join(df_sex)
df

Unnamed: 0,first_name,last_name,sex,female,male
0,Jason,Miller,male,0,1
1,Molly,Jacobson,female,1,0
2,Tina,Ali,male,0,1
3,Jake,Milner,female,1,0
4,Amy,Cooze,female,1,0
