# Keep-it-dry! Preprocessing Part 2: Data Scaling and Encoding

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Read the four imputed datasets (train/test for both the IterativeImputer and
# the KNNImputer variants); the first CSV column becomes the row index.
(
    df_iterimp,
    df_knnimp,
    df_iterimp_submission,
    df_knnimp_submission,
) = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'kid_train_iterimp.csv',
        'kid_train_knnimp.csv',
        'kid_test_iterimp.csv',
        'kid_test_knnimp.csv',
    )
)

In [16]:
# Detach the 'failure' target from both training frames so the encoding and
# scaling steps only ever see feature columns. It is re-attached before export.
y_iterimp, y_knnimp = df_iterimp.pop('failure'), df_knnimp.pop('failure')

## IterativeImputer Data (iterimp)

### Encoding (both iterimp and knnimp)
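Since each product category can be described using the four attributes, we'll go with:
1. "one-hot encoding" for "product_code", and
2. "label encoding" for "attribute_0" and "attribute_1" (a manual mapping that replaces each `material_N` label with its number `N` as a float, e.g. `material_7` → `7.0`)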

In [17]:
# measurement_0 .. measurement_2 are the special case: they are cast to float
# here so the float64-based column selection in the next cell treats them as
# continuous features like the other measurement columns.
# The submission frames receive the same cast so that train and test frames
# carry identical dtypes for these columns (previously only train was cast).

measure_int = ['measurement_0', 'measurement_1', 'measurement_2']
for frame in (df_iterimp, df_knnimp, df_iterimp_submission, df_knnimp_submission):
    # Column assignment mutates the frame object itself, so the notebook-level
    # names keep pointing at the updated frames.
    frame[measure_int] = frame[measure_int].astype('float')
df_iterimp.dtypes

id                  int64
product_code       object
loading           float64
attribute_0        object
attribute_1        object
attribute_2         int64
attribute_3         int64
measurement_0     float64
measurement_1     float64
measurement_2     float64
measurement_3     float64
measurement_4     float64
measurement_5     float64
measurement_6     float64
measurement_7     float64
measurement_8     float64
measurement_9     float64
measurement_10    float64
measurement_11    float64
measurement_12    float64
measurement_13    float64
measurement_14    float64
measurement_15    float64
measurement_16    float64
measurement_17    float64
dtype: object

In [18]:
# Partition the training columns by dtype: float64 columns are the continuous
# features to be scaled, object columns are the categorical ones to be encoded.
continuous_features = df_iterimp.select_dtypes(include=['float64']).columns
cat_features = df_iterimp.select_dtypes(include=['object']).columns

for feature_group in (cat_features, continuous_features):
    print(feature_group)

Index(['product_code', 'attribute_0', 'attribute_1'], dtype='object')
Index(['loading', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17'],
      dtype='object')


In [19]:
# One-hot encode 'product_code'. Train and submission data are encoded with
# two independently fitted encoders, so each dummy frame only contains columns
# for the product codes present in its own split.
# NOTE(review): the displayed outputs show codes A-E in train and F-I in the
# submission data, so the dummy columns of the two splits do not line up --
# confirm that downstream models do not depend on them.
onehot = OneHotEncoder(sparse_output=False)
onehot1 = OneHotEncoder(sparse_output=False)

# The dummy frames re-use the index of the frame they were built from: the
# later column-wise pd.concat aligns rows on index, and a default RangeIndex
# would silently misalign rows if the CSV index were not exactly 0..n-1.
df_oh_dummies = pd.DataFrame(
    onehot.fit_transform(df_iterimp[['product_code']].to_numpy()),
    columns=onehot.get_feature_names_out(['product_code']),
    index=df_iterimp.index,
)

df_submission_oh_dummies = pd.DataFrame(
    onehot1.fit_transform(df_iterimp_submission[['product_code']].to_numpy()),
    columns=onehot1.get_feature_names_out(['product_code']),
    index=df_iterimp_submission.index,
)

In [20]:
# Tally how often each material label occurs in attribute_1 of the training frame.
df_iterimp.loc[:, 'attribute_1'].value_counts()

attribute_1
material_8    10865
material_5    10362
material_6     5343
Name: count, dtype: int64

In [21]:
# Encode each material label by its numeric suffix ('material_7' -> 7.0).
attribute_mapping = {'material_7':7.0, 'material_8':8.0, 'material_5':5.0, 'material_6':6.0}
attribute_cols = ['attribute_0', 'attribute_1']

# Apply the same encoding to all four frames. The explicit float64 cast pins
# the resulting dtype instead of relying on DataFrame.replace to silently
# downcast the object columns (that implicit downcast is deprecated as of
# pandas 2.2). As a side effect, a label missing from the mapping now raises
# instead of being carried through as a string.
for frame in (df_iterimp, df_iterimp_submission, df_knnimp, df_knnimp_submission):
    frame[attribute_cols] = frame[attribute_cols].replace(attribute_mapping).astype('float64')


In [22]:
# Peek at the first five submission rows to check the encoded attribute columns.
df_knnimp_submission.head(n=5)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,5.0,6.0,6,4,6,9,6,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,5.0,6.0,6,4,11,8,0,...,19.368,12.032,13.998,18.34,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,5.0,6.0,6,4,8,12,4,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,5.0,6.0,6,4,8,11,10,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,5.0,6.0,6,4,14,16,8,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [23]:
# Append the one-hot product_code columns to every frame. Both imputation
# variants share the same dummies: the train dummies go onto the two training
# frames, the submission dummies onto the two submission frames.

df_iterimp, df_knnimp = (
    pd.concat([train_frame, df_oh_dummies], axis='columns')
    for train_frame in (df_iterimp, df_knnimp)
)

df_iterimp_submission, df_knnimp_submission = (
    pd.concat([test_frame, df_submission_oh_dummies], axis='columns')
    for test_frame in (df_iterimp_submission, df_knnimp_submission)
)

In [24]:
# Display the submission frame after the product_code dummy columns were appended.
df_knnimp_submission

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,product_code_F,product_code_G,product_code_H,product_code_I
0,26570,F,119.57,5.0,6.0,6,4,6,9,6,...,13.772,13.659,16.825,13.742,17.7100,634.612,1.0,0.0,0.0,0.0
1,26571,F,113.51,5.0,6.0,6,4,11,8,0,...,12.473,17.468,16.708,14.776,14.1020,537.037,1.0,0.0,0.0,0.0
2,26572,F,112.16,5.0,6.0,6,4,8,12,4,...,10.907,13.363,15.737,17.065,16.0210,658.995,1.0,0.0,0.0,0.0
3,26573,F,112.72,5.0,6.0,6,4,8,11,10,...,10.933,15.501,15.667,12.620,16.1110,594.301,1.0,0.0,0.0,0.0
4,26574,F,208.00,5.0,6.0,6,4,14,16,8,...,11.941,16.070,16.183,13.324,17.1500,801.044,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20770,47340,I,144.74,7.0,5.0,9,5,0,4,9,...,11.562,17.246,15.131,15.209,17.2516,696.466,0.0,0.0,0.0,1.0
20771,47341,I,74.53,7.0,5.0,9,5,4,8,7,...,13.564,15.494,15.296,13.812,16.5010,613.249,0.0,0.0,0.0,1.0
20772,47342,I,67.73,7.0,5.0,9,5,10,11,2,...,11.134,16.519,15.525,14.175,17.7280,783.349,0.0,0.0,0.0,1.0
20773,47343,I,126.15,7.0,5.0,9,5,8,16,11,...,9.319,15.817,17.403,16.437,15.1790,745.210,0.0,0.0,0.0,1.0


### Scaling

In [25]:
# Recap of the columns that the scalers in the following cells operate on.
print(continuous_features)

Index(['loading', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17'],
      dtype='object')


In [26]:
# Fit one scaler of each kind on the continuous columns of the
# IterativeImputer training frame. Each scaled variant is written into its own
# copy so the unscaled frame stays available.
minmax, standard, robust = MinMaxScaler(), StandardScaler(), RobustScaler()
iterimp_continuous = df_iterimp[continuous_features]

df_iterimp_minmax = df_iterimp.copy()
df_iterimp_minmax[continuous_features] = minmax.fit_transform(iterimp_continuous)

df_iterimp_standard = df_iterimp.copy()
df_iterimp_standard[continuous_features] = standard.fit_transform(iterimp_continuous)

df_iterimp_robust = df_iterimp.copy()
df_iterimp_robust[continuous_features] = robust.fit_transform(iterimp_continuous)

In [27]:
# Scale the IterativeImputer submission frame with the scalers already fitted
# on the training data (transform only -- nothing is re-fitted here).
iterimp_submission_continuous = df_iterimp_submission[continuous_features]

df_iterimp_submission_minmax = df_iterimp_submission.copy()
df_iterimp_submission_minmax[continuous_features] = minmax.transform(iterimp_submission_continuous)

df_iterimp_submission_standard = df_iterimp_submission.copy()
df_iterimp_submission_standard[continuous_features] = standard.transform(iterimp_submission_continuous)

df_iterimp_submission_robust = df_iterimp_submission.copy()
df_iterimp_submission_robust[continuous_features] = robust.transform(iterimp_submission_continuous)



In [28]:
# Spot-check the first five rows of the robust-scaled training frame.
df_iterimp_robust.head(n=5)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,product_code_A,product_code_B,product_code_C,product_code_D,product_code_E
0,0,A,-0.879646,7.0,8.0,9,5,0.0,0.0,-0.5,...,-0.414278,-0.319912,-1.010927,-0.793236,0.384251,1.0,0.0,0.0,0.0,0.0
1,1,A,-0.781142,7.0,8.0,9,5,1.166667,-0.833333,-0.75,...,-0.629651,-0.338426,-0.311094,-0.366421,-0.119937,1.0,0.0,0.0,0.0,0.0
2,2,A,-0.831731,7.0,8.0,9,5,0.833333,-1.166667,-0.25,...,0.805447,1.353395,-0.46587,0.676955,-0.234739,1.0,0.0,0.0,0.0,0.0
3,3,A,-0.448409,7.0,8.0,9,5,1.0,-1.0,0.0,...,-0.254016,-0.26613,0.593392,0.328111,0.766384,1.0,0.0,0.0,0.0,0.0
4,4,A,1.340497,7.0,8.0,9,5,0.333333,-1.0,0.5,...,0.421836,-1.744758,-0.949736,-0.014422,-0.747825,1.0,0.0,0.0,0.0,0.0


### Iterimp Export

In [29]:
# Put the 'failure' target back onto every scaled IterativeImputer training frame.
for scaled_train in (df_iterimp_minmax, df_iterimp_standard, df_iterimp_robust):
    scaled_train['failure'] = y_iterimp

In [30]:
# Write every scaled IterativeImputer variant to disk: the three training
# files first, then the three matching submission files.
iterimp_exports = {
    'kid_train_ii_mm.csv': df_iterimp_minmax,
    'kid_train_ii_st.csv': df_iterimp_standard,
    'kid_train_ii_ro.csv': df_iterimp_robust,
    'kid_test_ii_mm.csv': df_iterimp_submission_minmax,
    'kid_test_ii_st.csv': df_iterimp_submission_standard,
    'kid_test_ii_ro.csv': df_iterimp_submission_robust,
}
for csv_name, scaled_frame in iterimp_exports.items():
    scaled_frame.to_csv(csv_name)

## KNNImputer Data (knnimp)

### Scaling

In [31]:
# Same scaling recipe for the KNNImputer training frame. Fresh scaler
# instances are bound to minmax/standard/robust here, replacing the ones that
# were fitted on the IterativeImputer data.
minmax, standard, robust = MinMaxScaler(), StandardScaler(), RobustScaler()
knnimp_continuous = df_knnimp[continuous_features]

df_knnimp_minmax = df_knnimp.copy()
df_knnimp_minmax[continuous_features] = minmax.fit_transform(knnimp_continuous)

df_knnimp_standard = df_knnimp.copy()
df_knnimp_standard[continuous_features] = standard.fit_transform(knnimp_continuous)

df_knnimp_robust = df_knnimp.copy()
df_knnimp_robust[continuous_features] = robust.fit_transform(knnimp_continuous)

In [32]:
# Scale the KNNImputer submission frame with the scalers fitted on the
# KNNImputer training data (transform only -- nothing is re-fitted here).
knnimp_submission_continuous = df_knnimp_submission[continuous_features]

df_knnimp_submission_minmax = df_knnimp_submission.copy()
df_knnimp_submission_minmax[continuous_features] = minmax.transform(knnimp_submission_continuous)

df_knnimp_submission_standard = df_knnimp_submission.copy()
df_knnimp_submission_standard[continuous_features] = standard.transform(knnimp_submission_continuous)

df_knnimp_submission_robust = df_knnimp_submission.copy()
df_knnimp_submission_robust[continuous_features] = robust.transform(knnimp_submission_continuous)

### Knnimp Export

In [33]:
# Put the 'failure' target back onto every scaled KNNImputer training frame.
for scaled_train in (df_knnimp_minmax, df_knnimp_standard, df_knnimp_robust):
    scaled_train['failure'] = y_knnimp

In [34]:
# Write every scaled KNNImputer variant to disk: the three training files
# first, then the three matching submission files.
knnimp_exports = {
    'kid_train_ki_mm.csv': df_knnimp_minmax,
    'kid_train_ki_st.csv': df_knnimp_standard,
    'kid_train_ki_ro.csv': df_knnimp_robust,
    'kid_test_ki_mm.csv': df_knnimp_submission_minmax,
    'kid_test_ki_st.csv': df_knnimp_submission_standard,
    'kid_test_ki_ro.csv': df_knnimp_submission_robust,
}
for csv_name, scaled_frame in knnimp_exports.items():
    scaled_frame.to_csv(csv_name)