# BUILDING GOOD TRAINING DATASETS - DATA PREPROCESSING

- The quality of the data and the amount of useful information that it contains are key factors that determine how well a machine learning algorithm can learn.
- Data preprocessing techniques:
  - Removing and imputing missing values from the dataset
  - Getting categorical data into shape for machine learning algorithms
  - Selecting relevant features for the model construction

# DEALING WITH MISSING DATA

#### Identifying missing values in tabular data

In [1]:
import pandas as pd
from io import StringIO

In [2]:
# Small in-memory CSV: the second data row has no value for C and the third
# has none for D.
csv_data = (
    "A,B,C,D\n"
    "1.0,2.0,3.0,4.0\n"
    "5.0,6.0,,8.0\n"
    "10.0,11.0,12.0,\n"
)
# StringIO makes the text readable like a file; the empty fields are parsed
# as NaN.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [3]:
# Count the missing (NaN) cells in each column.
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [4]:
# Underlying NumPy array of the frame; this is what gets passed to the
# scikit-learn imputers further down.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

#### Eliminating training examples or features with missing values

In [5]:
# Drop every row that contains at least one missing value.
df.dropna(axis='rows')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [7]:
# Drop only the rows in which every column is NaN (there are none here, so
# the frame comes back unchanged).
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
df.dropna(thresh=4)  # keep only rows with at least 4 non-NaN values; rows with fewer are dropped

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# Drop a row only when NaN appears in one of the listed columns (here: 'C').
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


#### Imputing missing values

In [11]:
from sklearn.impute import SimpleImputer
import numpy as np

In [12]:
# Mean imputation: fit() learns each column's mean, transform() writes that
# mean into the column's NaN cells and returns a new array.
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [14]:
# fit_transform() does the fit and the transform in one call; the output
# below matches the two-step result.
imputed_data2 = imr.fit_transform(df.values)
imputed_data2

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [15]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [16]:
# pandas-only alternative: fill each column's NaN with that column's mean.
# This returns a new frame; df itself keeps its NaN values.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [17]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


#### Using KNNImputer

In [18]:
from sklearn.impute import KNNImputer

In [23]:
# Impute each NaN from the values of the nearest rows.
# NOTE(review): the frame has only 3 rows, so n_neighbors=5 is more than the
# number of rows that can act as neighbours. The output below is identical to
# the mean-imputed array, presumably because every available row is averaged.
# Try a smaller n_neighbors to see a real nearest-neighbour effect.
knn = KNNImputer(missing_values=np.nan,n_neighbors=5,weights='uniform') # weights can be 'uniform' or 'distance'
imputed_data3 = knn.fit_transform(df.values)
imputed_data3

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

- The two essential methods of those estimators are fit and transform.
- The fit method is used to learn the parameters from the training data.
- The transform method uses those parameters to transform the data.

![image](./images/04_02.png)

![image](./images/04_03.png)

# HANDLING CATEGORICAL DATA

- Ordinal - categories that can be sorted or ordered; t-shirt size is an ordinal feature (XL > L > M).
- Nominal - categories that don't imply any order; t-shirt color is a nominal feature.

#### Categorical data encoding with pandas

In [52]:
import pandas as pd

In [53]:
# Toy data set: a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and the class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [54]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [55]:
# Work on a copy so the original df stays untouched for the later sections.
df_copy = df.copy()

In [56]:
df_copy

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


#### Mapping ordinal features

In [57]:
# Integer codes that preserve the size order M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)

# Replace every size string in the copy by its integer code.
df_copy['size'] = df_copy['size'].map(size_mapping)

In [58]:
df_copy

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [59]:
# Reverse lookup (integer code -> size string) for undoing the mapping.
inv_size_mapping = {
    code: size_name
    for size_name, code in size_mapping.items()
}
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

In [60]:
# Translate the integer codes back into size strings. The result is a new
# Series; df_copy itself is not modified by this call.
df_copy['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [45]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


#### Encoding class labels

In [61]:
import numpy as np

In [62]:
# Give every distinct class label an integer code, numbered in the order
# np.unique returns the labels.
class_mapping = {
    class_name: code
    for code, class_name in enumerate(np.unique(df['classlabel']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [63]:
# Replace the class-label strings in the copy by their integer codes.
df_copy['classlabel'] = df_copy['classlabel'].map(class_mapping)

In [64]:
df_copy

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [65]:
# Reverse lookup (integer code -> class-label string).
inv_class_mapping = {
    code: class_name
    for class_name, code in class_mapping.items()
}
inv_class_mapping


{0: 'class1', 1: 'class2'}

In [67]:
# Undo the encoding: map the integer codes back to the label strings.
df_copy['classlabel']=df_copy['classlabel'].map(inv_class_mapping)

In [68]:
df_copy

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


#### Using LabelEncoder

In [70]:
# Fresh copy of the original frame for the LabelEncoder walkthrough.
df_copy_1 = df.copy()

In [69]:
from sklearn.preprocessing import LabelEncoder

In [71]:
# LabelEncoder does the same job as the hand-written class_mapping: each
# distinct label becomes an integer.
le = LabelEncoder()
# The encoded labels are stored in y; df_copy_1 itself is not changed by
# this call.
y = le.fit_transform(df_copy_1['classlabel'].values)
y

array([1, 0, 1])

In [72]:
df_copy_1

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [73]:
# Overwrite the label strings in the copy with their integer codes.
df_copy_1['classlabel'] = le.fit_transform(df_copy_1['classlabel'].values)

In [74]:
df_copy_1

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1


In [75]:
# inverse_transform maps the integer codes back to the original label strings.
df_copy_1['classlabel'] = le.inverse_transform(df_copy_1['classlabel'])

In [76]:
df_copy_1

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


#### Performing one-hot encoding on nominal features

In [78]:
# Feature matrix as a NumPy object array (strings and floats mixed).
X = df[['color','size','price']].values
color_le = LabelEncoder()
# Integer-encode the color column in place; in the output below
# blue=0, green=1, red=2.
# NOTE(review): color is nominal, so these integers suggest an order that
# does not exist. The one-hot encoding in the following cells avoids that.
X[:,0] = color_le.fit_transform(X[:,0])
X

array([[1, 'M', 10.1],
       [2, 'L', 13.5],
       [0, 'XL', 15.3]], dtype=object)

In [79]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [80]:
from sklearn.preprocessing import OneHotEncoder

In [81]:
# Rebuild X with the original color strings, then one-hot encode only the
# color column.
X = df[['color','size','price']].values
color_ohe = OneHotEncoder()
# reshape(-1, 1) turns the 1-D color column into a single-column 2-D array;
# toarray() converts the encoder's (presumably sparse) result into a dense
# NumPy array for display.
color_ohe.fit_transform(X[:,0].reshape(-1,1)).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [82]:
from sklearn.compose import ColumnTransformer # (name,transformer,columns)

In [87]:
# Rebuild the raw feature matrix (color and size still as strings).
X = df[['color','size','price']].values

In [88]:
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.5],
       ['blue', 'XL', 15.3]], dtype=object)

In [90]:
# pandas one-hot encoding: the string columns (color, size) are expanded into
# one dummy column per category, the numeric price column is left as is.
pd.get_dummies(df[['color','size','price']],dtype=float)

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0.0,1.0,0.0,0.0,1.0,0.0
1,13.5,0.0,0.0,1.0,1.0,0.0,0.0
2,15.3,1.0,0.0,0.0,0.0,0.0,1.0


In [91]:
# Same toy data set as before, built column by column.
df_new = pd.DataFrame({
    'color': ['green', 'red', 'blue'],
    'size': ['M', 'L', 'XL'],
    'price': [10.1, 13.5, 15.3],
    'classlabel': ['class2', 'class1', 'class2'],
})

In [92]:
df_new

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


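Use an integer (ordinal) encoding when the categories of a feature have a clear inherent order (like size - small, medium, large), and use one-hot encoding when the order of the categories doesn't matter (like colors - red, blue, green) and you want to represent each category as a separate binary feature. Note that scikit-learn's `LabelEncoder` is meant for the target labels and numbers the classes in sorted (alphabetical) order, so for ordered features use an explicit mapping or `OrdinalEncoder` with the category order spelled out.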

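- `LabelEncoder` cannot be used inside a `ColumnTransformer` (it is designed for the target `y`, not for feature columns); use `OrdinalEncoder` for feature columns instead.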

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


# Encode the whole frame in one pass:
#   * columns 0 (color) and 3 (classlabel) are one-hot encoded,
#   * column 1 (size) is ordinal-encoded,
#   * the remaining column (price) is passed through unchanged.
# Without an explicit category list OrdinalEncoder numbers the sizes
# alphabetically (L=0, M=1, XL=2); passing the categories keeps the real
# order M < L < XL, i.e. M=0, L=1, XL=2.
# NOTE(review): column 3 is the class label. Confirm it is really meant to
# be one-hot encoded into X rather than kept separate as the target.
ct = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(), [0, 3]),
        ('ordinal_encoder', OrdinalEncoder(categories=[['M', 'L', 'XL']]), [1]),
    ],
    remainder='passthrough',
)

X = ct.fit_transform(df_new.values)
X

array([[0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 10.1],
       [0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 13.5],
       [1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 15.3]], dtype=object)

- When we one-hot encode a dataset, we have to keep in mind that this introduces multicollinearity, which can be an issue for certain methods. Dropping one dummy column per encoded feature removes the redundancy without losing information.

In [101]:
# Separate copy of df for the drop-first (one dummy column fewer) variant.
df_pd_dummies = df.copy()

In [102]:
df_pd_dummies

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [106]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# color is deliberately left as strings: it is one-hot encoded in the cells
# that follow.
# NOTE(review): LabelEncoder numbers the sizes in sorted order, so size
# becomes L=0, M=1, XL=2 (see the outputs below), which does not match the
# real order M < L < XL. Consider the explicit size_mapping used earlier in
# this notebook instead.
df_pd_dummies['size'] = le.fit_transform(df_pd_dummies['size'])
# The same encoder object is fitted again here, so afterwards it should only
# retain the classlabel classes.
df_pd_dummies['classlabel'] = le.fit_transform(df_pd_dummies['classlabel'])

In [107]:
# drop_first=True omits the first dummy column of the encoded feature
# (color_blue here), so only color_green and color_red remain.
pd.get_dummies(df_pd_dummies[['color','size','price']],dtype=float,drop_first=True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1.0,0.0
1,0,13.5,0.0,1.0
2,2,15.3,0.0,0.0


In [108]:
df_pd_dummies

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,0,13.5,0
2,blue,2,15.3,1


In [111]:
# scikit-learn version of the drop-first encoding: drop='first' leaves out
# the first color category, so two color columns remain in the output below.
color_ohe_new = OneHotEncoder(categories='auto',drop='first')
c_transf_new = ColumnTransformer([
    ('onehot',color_ohe_new,[0]),  # column 0 = color
    ('nothing','passthrough',[1,2,3])  # size, price, classlabel are copied through unchanged
])

# Cast the result to float so it prints as a plain numeric array.
c_transf_new.fit_transform(df_pd_dummies.values).astype(float)

array([[ 1. ,  0. ,  1. , 10.1,  1. ],
       [ 0. ,  1. ,  0. , 13.5,  0. ],
       [ 0. ,  0. ,  2. , 15.3,  1. ]])

#### Additional encoding schemes for nominal data

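- One-hot encoding - use for unordered (nominal) categorical data.
- Binary encoding - produces fewer columns than one-hot encoding (about log2(K) instead of K for K categories), which keeps the number of added feature columns small.
- Treat the choice of encoding scheme as a hyperparameter: try the candidates and keep the one that works best for the model.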

#### Optional : encoding ordinal features

In [112]:
# Toy data set again, this time for the value-threshold encoding of size.
df_opt = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df_opt

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


- Use the `apply` method of a pandas column to write custom lambda expressions that encode the ordinal variable using the value-threshold approach.

In [113]:
# Value-threshold encoding of the ordinal size feature:
#   'X>M' is 1 for sizes above M (L and XL), otherwise 0;
#   'X>L' is 1 for sizes above L (XL only), otherwise 0.
df_opt['X>M'] = df_opt['size'].apply(lambda size: int(size in {'L', 'XL'}))
df_opt['X>L'] = df_opt['size'].apply(lambda size: int(size == 'XL'))

In [114]:
df_opt

Unnamed: 0,color,size,price,classlabel,X>M,X>L
0,green,M,10.1,class2,0,0
1,red,L,13.5,class1,1,0
2,blue,XL,15.3,class2,1,1
