# Feature Engineering

### Categorical Feature Engineering

In [None]:
# import needed packages
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the cumulative, train and test CSVs from the data folder.
# Loading from disk keeps this notebook independent of any other notebook's kernel state.
full_data, train_data, test_data = map(
    pd.read_csv,
    (
        'data/cumulative_data_fe.csv',
        'data/train_data_fe.csv',
        'data/test_data_fe.csv',
    ),
)

In [None]:
# Print the (rows, columns) of each frame before encoding, in the order full / test / train.
# Feature engineering should leave the row counts unchanged and add exactly one column.
print(full_data.shape, test_data.shape, train_data.shape, sep='\n')

(78612, 46)
(15722, 46)
(62890, 46)


In [None]:
# Create the scikit-learn label encoder and fit it on the full dataset.
# LabelEncoder assigns codes to the sorted classes, so the expected labels map as soil -> 0, stone -> 1.
# The column is passed as a 1-D Series (single brackets): LabelEncoder expects 1-D labels,
# and a one-column DataFrame triggers a DataConversionWarning before being flattened.
enc = LabelEncoder()
full_data["stone_soil_enc"] = enc.fit_transform(full_data["stone_soil"])

  y = column_or_1d(y, warn=True)


In [None]:
# Encode the train labels with the encoder that was already fitted in an earlier cell.
# transform() is used instead of fit_transform() so the label -> code mapping is not re-learned
# from this split; a label the encoder has not seen raises a ValueError instead of being
# silently assigned a shifted code. The column is passed as a 1-D Series, as LabelEncoder expects.
train_data["stone_soil_enc"] = enc.transform(train_data["stone_soil"])

In [None]:
# Encode the test labels with the encoder that was already fitted in an earlier cell.
# transform() is used instead of fit_transform() so the test split never defines its own mapping:
# if it held only one of the labels, a re-fit would code that label as 0 whatever it was.
# A label the encoder has not seen raises a ValueError. The column is passed as a 1-D Series.
test_data["stone_soil_enc"] = enc.transform(test_data["stone_soil"])

In [None]:
# Eyeball the first and last rows of the full dataset.
# Expect the new stone_soil_enc column, holding 0 for soil rows and 1 for stone rows.
print(full_data.head(), full_data.tail(), sep='\n')

      id  img_id        da        dp    fwidth   flength  fthickness  \
0  25611   10977  0.430173  0.415076  0.342711  0.297421    0.687812   
1  48302   15470  0.406686  0.422433  0.351257  0.281709    0.675758   
2  32915   12616  0.405261  0.440657  0.321034  0.289897    0.573542   
3  22866   10293  0.395802  0.439217  0.344475  0.310602    0.653172   
4  10277    7209  0.355956  0.345535  0.224694  0.264181    0.504201   

    elength  ethickness    ewidth  ...  ellipticity  fiber_length  \
0  0.312368    0.696663  0.363869  ...     0.017830      0.329471   
1  0.286009    0.683217  0.377781  ...     0.013708      0.467853   
2  0.323973    0.590267  0.320041  ...     0.031852      0.444296   
3  0.309595    0.628076  0.348678  ...     0.024057      0.566623   
4  0.298762    0.504102  0.237380  ...     0.037989      0.267104   

   fiber_width  krumbein_rnd  stone_soil  thick_vol_prod  thick_perm_prod  \
0     0.380450      0.028846        soil        0.154961         0.462472  

In [None]:
# Eyeball the first and last rows of the test dataset.
# Soil is the more common label, so these rows may only show 0s in the new column;
# the stone rows are checked separately in a later cell.
for preview in (test_data.head(), test_data.tail()):
    print(preview)

      id  img_id        da        dp    fwidth   flength  fthickness  \
0  30066   12106  0.003760  0.003116  0.002659  0.003465    0.008584   
1   3678    5479  0.009108  0.009347  0.005209  0.010781    0.016819   
2  59061   20704  0.003857  0.003168  0.002469  0.003145    0.007971   
3  35039   12826  0.003663  0.002932  0.002469  0.002952    0.007971   
4   1615    4906  0.005024  0.004242  0.003147  0.004471    0.010161   

    elength  ethickness    ewidth  ...  ellipticity  fiber_length  \
0  0.003529    0.009118  0.002933  ...     0.039171      0.000000   
1  0.011242    0.015799  0.005083  ...     0.098580      0.008609   
2  0.003635    0.009479  0.003050  ...     0.038350      0.000000   
3  0.003339    0.009299  0.002992  ...     0.033713      0.000000   
4  0.004945    0.011104  0.003572  ...     0.048686      0.004639   

   fiber_width  krumbein_rnd  stone_soil  thick_vol_prod  thick_perm_prod  \
0     0.000000           1.0        soil    2.540643e-09         0.000051  

In [None]:
# Eyeball the first and last five rows of the train dataset.
# Expect the new stone_soil_enc column, holding 0 for soil rows and 1 for stone rows.
print(train_data.head(n=5))
print(train_data.tail(n=5))

      id  img_id        da        dp    fwidth   flength  fthickness  \
0  11680    7936  0.005536  0.003348  0.003611  0.002566    0.005662   
1  45478   15130  0.007464  0.004704  0.005979  0.003631    0.009375   
2  16623    8959  0.005823  0.003265  0.004310  0.002339    0.006758   
3  40689   13711  0.007587  0.004815  0.006018  0.003771    0.009436   
4   5106    5866  0.005536  0.003348  0.003611  0.002566    0.005662   

    elength  ethickness    ewidth  ...  ellipticity  fiber_length  \
0  0.003406    0.006415  0.003943  ...     0.035570      0.000000   
1  0.004436    0.008944  0.005497  ...     0.029433      0.000000   
2  0.003208    0.007217  0.004436  ...     0.024371      0.000000   
3  0.004694    0.008821  0.005422  ...     0.033913      0.004139   
4  0.003406    0.006415  0.003943  ...     0.035570      0.000000   

   fiber_width  krumbein_rnd  stone_soil  thick_vol_prod  thick_perm_prod  \
0     0.000000           1.0        soil    1.845767e-09         0.000052  

In [None]:
# The head() preview of test_data did not show any stone rows, so select them explicitly
# and confirm by eye that stone_soil_enc is 1 for each of them.
stone_rows = test_data['stone_soil'].eq('stone')
test_data.loc[stone_rows]

Unnamed: 0,id,img_id,da,dp,fwidth,flength,fthickness,elength,ethickness,ewidth,...,ellipticity,fiber_length,fiber_width,krumbein_rnd,stone_soil,thick_vol_prod,thick_perm_prod,thick_trans_prod,rnd_ell_prod,stone_soil_enc
18,4643,17064,0.005542,0.005420,0.002822,0.006246,0.009110,0.006741,0.009750,0.003137,...,0.094233,0.005956,0.004907,1.000000,stone,4.003114e-09,0.000082,0.034768,0.291088,1
59,1433,16398,0.002334,0.001728,0.001818,0.001818,0.005869,0.002134,0.006681,0.002149,...,0.029221,0.007120,0.005039,1.000000,stone,9.393975e-10,0.000025,0.000000,0.235778,1
61,4738,17110,0.004440,0.004032,0.002985,0.004129,0.009636,0.004226,0.010292,0.003311,...,0.042987,0.000000,0.000000,1.000000,stone,2.796843e-09,0.000068,0.047891,0.242652,1
118,2932,16647,0.004278,0.005184,0.001736,0.006524,0.005606,0.006635,0.006049,0.001946,...,0.162094,0.006013,0.003242,1.000000,stone,9.073726e-10,0.000053,0.023966,0.265139,1
121,3588,16761,0.005802,0.005577,0.003826,0.006289,0.012351,0.006043,0.012368,0.003979,...,0.056076,0.005097,0.006222,1.000000,stone,8.646727e-09,0.000110,0.053784,0.224142,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15663,2229,16525,0.008686,0.007410,0.006457,0.007081,0.020848,0.007586,0.020764,0.006680,...,0.031105,0.000000,0.000000,0.515815,stone,3.870745e-08,0.000224,0.088942,0.281798,1
15665,578,16237,0.002917,0.002985,0.001818,0.003465,0.005869,0.004015,0.005326,0.001714,...,0.105487,0.003894,0.002848,1.000000,stone,9.393975e-10,0.000036,0.032729,0.284365,1
15679,5254,18568,0.004019,0.008902,0.002062,0.010589,0.006657,0.007924,0.005146,0.001656,...,0.235075,0.011244,0.001577,1.000000,stone,3.106417e-09,0.000096,0.034724,0.135902,1
15688,2683,16616,0.011863,0.012018,0.006349,0.012942,0.020498,0.014052,0.020764,0.006680,...,0.091770,0.010862,0.009990,1.000000,stone,6.260871e-08,0.000335,0.063980,0.310414,1


In [None]:
# Print the shapes again after encoding, in the order full / test / train.
# Row counts should match the earlier check and each frame should have one more column.
print(*(frame.shape for frame in (full_data, test_data, train_data)), sep='\n')

(78612, 47)
(15722, 47)
(62890, 47)


In [None]:
# Write each frame back to the data folder without the index column.
# These are the same paths the notebook loaded from, so the input files are overwritten.
for frame, path in (
    (full_data, 'data/cumulative_data_fe.csv'),
    (train_data, 'data/train_data_fe.csv'),
    (test_data, 'data/test_data_fe.csv'),
):
    frame.to_csv(path, index = False)