# Feature Engineering II,
### a.k.a.
* ### Advanced Feature Engineering 
* ### Feature Engineering with _scikit-learn_

(The concepts are the same as in the intro to feature engineering; what changes is how we transform the data.)

In [2]:
# stuff you know already
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# new stuff !!
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### 1. Get Data

In [40]:
# Load the penguin data; na_values='.' makes pandas read '.' entries as missing values (NaN).
df = pd.read_csv('../data/all_penguins_clean.csv', na_values='.')

In [41]:
# Count the missing values in each column.
df.isna().sum()

studyName               0
Sample Number           0
Species                 0
Region                  0
Island                  0
Stage                   0
Individual ID           0
Clutch Completion       0
Date Egg                0
Culmen Length (mm)      2
Culmen Depth (mm)       2
Flipper Length (mm)     2
Body Mass (g)           2
Real ID                 0
Sex                    11
dtype: int64

In [42]:
# Display the table (the middle rows are elided in the rendered output).
df

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Real ID,Sex
0,PAL0708,1,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,A_0,MALE
1,PAL0708,2,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,A_1,FEMALE
2,PAL0708,3,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,A_2,FEMALE
3,PAL0708,4,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,A_3,
4,PAL0708,5,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,A_4,FEMALE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,PAL0910,120,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N38A2,No,12/1/09,,,,,G_339,
340,PAL0910,121,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N39A1,Yes,11/22/09,46.8,14.3,215.0,4850.0,G_340,FEMALE
341,PAL0910,122,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N39A2,Yes,11/22/09,50.4,15.7,222.0,5750.0,G_341,MALE
342,PAL0910,123,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N43A1,Yes,11/22/09,45.2,14.8,212.0,5200.0,G_342,FEMALE


In [43]:
# Inspect the two rows (Real ID 'A_3' and 'G_339') whose four measurements and Sex are all missing.
df[df['Real ID'].isin(['A_3', 'G_339'])]

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Real ID,Sex
3,PAL0708,4,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,A_3,
339,PAL0910,120,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N38A2,No,12/1/09,,,,,G_339,


In [44]:
df = df[~df['Real ID'].isin(['A_3', 'G_339'])] # `~` inverts the boolean mask: keep only rows whose Real ID is NOT in the list

In [45]:
# Re-count missing values after removing the two rows: only Sex still has gaps.
df.isna().sum()

studyName              0
Sample Number          0
Species                0
Region                 0
Island                 0
Stage                  0
Individual ID          0
Clutch Completion      0
Date Egg               0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Body Mass (g)          0
Real ID                0
Sex                    9
dtype: int64

In [46]:
# Target: the species label of each penguin.
y = df['Species']

# Features: two categorical columns followed by two numerical columns.
X = df[[
    'Island',
    'Sex',
    'Flipper Length (mm)',
    'Body Mass (g)',
]]

### 2. Train-Test Split

In [47]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

### 3. Explore the Data

In [48]:
# Missing values per feature in the training split.
X_train.isna().sum()

Island                 0
Sex                    7
Flipper Length (mm)    0
Body Mass (g)          0
dtype: int64

In [49]:
# Frequency of each Sex category in the training split (rows with missing Sex are not counted).
X_train['Sex'].value_counts() 

FEMALE    135
MALE      131
Name: Sex, dtype: int64

## 🔧 4. Feature Engineer 🔨

In [50]:
# Look at the raw training features before transforming them.
X_train

Unnamed: 0,Island,Sex,Flipper Length (mm),Body Mass (g)
40,Dream,FEMALE,182.0,3150.0
322,Biscoe,FEMALE,215.0,4975.0
243,Biscoe,MALE,215.0,5050.0
279,Biscoe,MALE,224.0,5550.0
166,Dream,FEMALE,190.0,3575.0
...,...,...,...,...
88,Dream,MALE,189.0,3950.0
64,Biscoe,FEMALE,184.0,2850.0
327,Biscoe,MALE,219.0,5500.0
338,Biscoe,FEMALE,214.0,4925.0


In [51]:
# Summary statistics of the numerical columns; note how different the two value ranges are.
X_train.describe()

Unnamed: 0,Flipper Length (mm),Body Mass (g)
count,273.0,273.0
mean,200.534799,4188.278388
std,14.24186,809.740769
min,172.0,2850.0
25%,189.0,3550.0
50%,196.0,4000.0
75%,214.0,4775.0
max,231.0,6300.0


Q: How do we want to feature engineer our columns?

A: 
* Impute missing values in the sex column
* One-hot-encode sex and island columns
* Scale numerical columns (flipper length and body mass)

--------------

Introducing `ColumnTransformer`: 

https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
        
Takes as parameters:
* list of tuples of the format `(name, transformer, columns)`
* what to do with columns not listed in any tuple: `remainder='drop'` (the default) discards them, `remainder='passthrough'` keeps them unchanged

`ColumnTransformer` helps us to do all our feature engineering in one go.

---------------

In [52]:
# Group the feature names by type so that each group can be given its own transformer.
numerical_columns = ['Flipper Length (mm)', 'Body Mass (g)']
categorical_columns = ['Island', 'Sex']

In [80]:
# First attempt: exactly one transformer per group of columns.
#   * Sex     -> missing values filled with the most frequent category (no encoding yet)
#   * Island  -> one-hot encoded, returned as a dense array
#   * numbers -> rescaled with MinMaxScaler
column_transformer = ColumnTransformer(
    transformers=[
        ('sex_imputer', SimpleImputer(strategy='most_frequent'), ['Sex']),
        (
            'island_ohe',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['Island'],
        ),
        ('num_scaler', MinMaxScaler(), numerical_columns),
    ],
)

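Here lies the beauty of `ColumnTransformer`: fitting it once on the training data learns every transformation, and the same fitted object then transforms both the training and the test set: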

In [81]:
# Learn all transformation parameters from the training data and transform it in one call.
X_train_fe = column_transformer.fit_transform(X_train)

# The test set is only transformed with the already-fitted transformer.
X_test_fe = column_transformer.transform(X_test)  # DO NOT FIT ON TEST SET

In [82]:
# Rows x columns of the transformed training data
# (columns: 1 imputed Sex + 3 one-hot Island + 2 scaled numerical).
X_train_fe.shape

(273, 6)

In [83]:
# First transformed row: Sex is still a string, which is why the array has dtype=object.
X_train_fe[0]

array(['FEMALE', 0.0, 1.0, 0.0, 0.1694915254237288, 0.08695652173913038],
      dtype=object)

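Notice that `'FEMALE'` is still a string: each `ColumnTransformer` entry applies a single transformer to the original columns, so `Sex` was imputed but never one-hot-encoded.

What happens when we want to transform the same column twice?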

---------------------

Introducing `Pipeline`: 

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

Takes as parameters: 
* list of tuples of the format `(name, transformer)`

`Pipeline` allows us to apply sequential transformations to the same data.

----------------------

In [114]:
# Two sequential steps for a categorical column:
#   1. fill missing values with the most frequent category,
#   2. one-hot encode the imputed values.
# drop='if_binary' keeps a single 0/1 column when the feature has exactly two categories.
categorical_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('cat_ohe', OneHotEncoder(handle_unknown='error', drop='if_binary')),
    ],
)

In [115]:
# Sex goes through the impute-then-encode pipeline, Island is one-hot encoded
# with its first category dropped, and the numerical columns are min-max scaled.
column_transformer = ColumnTransformer(
    transformers=[
        ('sex_pipeline', categorical_pipeline, ['Sex']),
        (
            'island_ohe',
            OneHotEncoder(handle_unknown='error', drop='first'),
            ['Island'],
        ),
        ('num_scaler', MinMaxScaler(), numerical_columns),
    ],
)

In [116]:
# Fit on the training data only and transform it in the same call.
X_train_fe = column_transformer.fit_transform(X_train)

# Reuse the fitted transformer for the test set.
X_test_fe = column_transformer.transform(X_test)  # DO NOT FIT ON TEST SET

In [117]:
X_train_fe[0] # one Sex column (drop='if_binary'), two Island columns (drop='first'), two scaled numerical columns

array([0.        , 1.        , 0.        , 0.16949153, 0.08695652])

In [118]:
# First test row, transformed with the transformer that was fitted on the training data.
X_test_fe[0]

array([1.        , 1.        , 0.        , 0.55932203, 0.47826087])

In [119]:
# The transformed training matrix: every column is now numeric.
X_train_fe

array([[0.        , 1.        , 0.        , 0.16949153, 0.08695652],
       [0.        , 0.        , 0.        , 0.72881356, 0.61594203],
       [1.        , 0.        , 0.        , 0.72881356, 0.63768116],
       ...,
       [1.        , 0.        , 0.        , 0.79661017, 0.76811594],
       [0.        , 0.        , 0.        , 0.71186441, 0.60144928],
       [0.        , 0.        , 1.        , 0.16949153, 0.10144928]])

### 5. Train Model

In [104]:
# Logistic regression classifier, created with its default hyperparameters.
m = LogisticRegression()

In [105]:
# Train the model on the feature-engineered training data.
m.fit(X_train_fe, y_train)

### 6. Optimize

Skip for now — more next week

### 7. Calculate Test Score

In [106]:
# Score on the training data, for comparison with the test score.
m.score(X_train_fe, y_train)

0.8937728937728938

In [107]:
# Score on the held-out test data, which the model has not seen during training.
m.score(X_test_fe, y_test)

0.8840579710144928