In [33]:
import pandas as pd
from io import StringIO


In [34]:
# Small in-memory CSV with two missing cells: column C in the second data row
# and column D in the third. The lines carry no leading whitespace, so the
# header parses as 'A' rather than ' A'; the trailing comma on the last row
# makes its empty D field explicit.
csv_data = (
    "A,B,C,D\n"
    "1.2,2.0,3.0,4.0\n"
    "5.0,6.0,,8.0\n"
    "10.0,11.0,12.0,\n"
)

# StringIO lets read_csv consume the string as if it were a file on disk.
df = pd.read_csv(StringIO(csv_data))
df


Unnamed: 0,A,B,C,D
0,1.2,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [35]:
# Count the missing entries in each column.
df.isna().sum()


 A    0
B     0
C     1
D     1
dtype: int64

# Eliminate Training Examples or Features with Missing Values


In [36]:
# Drop every row that contains at least one NaN
# (acceptable when the training example isn't strictly necessary).
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.2,2.0,3.0,4.0


In [37]:
# Drop every column that contains at least one NaN
# (acceptable when the feature isn't strictly necessary).
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.2,2.0
1,5.0,6.0
2,10.0,11.0


In [38]:
# Drop only those rows in which every column is NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.2,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [39]:
# Keep a row only if it has at least four non-NaN values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.2,2.0,3.0,4.0


In [40]:
# Drop a row only when NaN appears in one of the listed columns (here: 'C').
df.dropna(axis=0, subset=['C'])



Unnamed: 0,A,B,C,D
0,1.2,2.0,3.0,4.0
2,10.0,11.0,12.0,


# Imputing: estimate the missing values from the other training examples


In [41]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: every NaN is replaced by the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column means and fills the gaps in one call.
imputed_data = imr.fit_transform(df.values)
imputed_data


array([[ 1.2,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [42]:
from sklearn.impute import KNNImputer
import numpy as np

# k-nearest-neighbours imputation: a NaN is filled from the values of the
# most similar rows. n_neighbors is left at the library default here.
# NOTE(review): on this three-row frame the output below is identical to the
# mean-imputation result, presumably because there are fewer rows than
# neighbours requested; confirm before relying on it as a KNN demo.
imr = KNNImputer(missing_values=np.nan)
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1.2,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

# Handling Categorical Data

In [None]:
import pandas as pd

# Toy data set for the categorical-encoding examples:
# color is a nominal feature, size is an ordinal feature.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'Class2'],
        ['red', 'L', 13.5, 'Class1'],
        ['blue', 'XL', 15.3, 'Class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,Class2
1,red,L,13.5,Class1
2,blue,XL,15.3,Class2


In [44]:
# Ordinal encoding: translate each size label into an integer rank
# (M < L < XL). Labels absent from the mapping become NaN.
size_mapping = dict(XL=3, L=2, M=1)

df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,Class2
1,red,2,13.5,Class1
2,blue,3,15.3,Class2


In [45]:
# Swap keys and values so the integer ranks can be turned back into labels.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)


0     M
1     L
2    XL
Name: size, dtype: object

# Encoding Class Labels

In [46]:
import numpy as np

# Number the distinct class labels 0, 1, ... in the sorted order that
# np.unique returns them.
class_mapping = dict(
    (label, idx) for idx, label in enumerate(np.unique(df['classlabel']))
)
class_mapping

{'Class1': 0, 'Class2': 1}

In [47]:
# Replace the string class labels with their integer codes.
df["classlabel"] = df["classlabel"].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [48]:
# Invert the label -> integer dictionary and restore the original strings.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df["classlabel"] = df["classlabel"].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,Class2
1,red,2,13.5,Class1
2,blue,3,15.3,Class2


In [49]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder performs the same label -> integer encoding as the manual
# dictionary approach.
class_le = LabelEncoder()
class_labels = df['classlabel'].values
y = class_le.fit_transform(class_labels)
y

array([1, 0, 1])

In [50]:
# Map the integer codes back to the original string labels.
class_le.inverse_transform(y)

array(['Class2', 'Class1', 'Class2'], dtype=object)

# One-Hot Encoding Nominal Features

In [None]:
# Integer-code the nominal color column with LabelEncoder.
# Caveat: this encoding is inappropriate for a nominal feature, because a
# classifier will assume the integer codes carry an ordering.
color_le = LabelEncoder()
X = df[['color', 'size', 'price']].values
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [53]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the color column only. The encoder expects a 2-D input, so
# the column is selected with a list index to keep it as shape (n_samples, 1).
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, [0]]).toarray()


array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [None]:
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (color) and pass columns 1 and 2 (size, price)
# through unchanged, all in a single transformer.
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [55]:
# pandas shortcut: get_dummies one-hot encodes the string column (color) and
# leaves the numeric columns as they are.
pd.get_dummies(df.loc[:, ['color', 'size', 'price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,False,True,False
1,2,13.5,False,False,True
2,3,15.3,True,False,False


In [56]:
# Same pipeline as before, but drop the first one-hot column: the dropped
# category is implied when all remaining indicator columns are 0.
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(categories='auto', drop='first'), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

c_transf.fit_transform(X).astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

# If we are unsure of the numerical distance between the levels of an ordinal feature, we can threshold-encode the feature into binary indicator columns instead


In [57]:
# Rebuild the toy data set with the original string-valued size column.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'Class2'],
        ['red', 'L', 13.5, 'Class1'],
        ['blue', 'XL', 15.3, 'Class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

df


Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,Class2
1,red,L,13.5,Class1
2,blue,XL,15.3,Class2


In [58]:
# Threshold-encode the ordinal size feature as two 0/1 indicator columns:
# "larger than M" and "larger than L". The original size column is removed.
# NOTE(review): 'x > m' uses a lowercase m although the size label is 'M';
# the column names are kept as-is so anything keyed on them still works.
df['x > m'] = df['size'].isin(['L', 'XL']).astype('int64')
df['x > L'] = df['size'].eq('XL').astype('int64')
del df['size']
df

Unnamed: 0,color,price,classlabel,x > m,x > L
0,green,10.1,Class2,0,0
1,red,13.5,Class1,1,0
2,blue,15.3,Class2,1,1
