# Feature Engineering Exercises

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pydataset import data

import wrangle
import modeling
import evaluate

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

Load the tips dataset.

In [57]:
swiss = data('swiss')

In [2]:
tips = data('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
tips['price_per_person'] = tips['total_bill'] / tips['size']

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


- I think the most important features will be number of people, total bill, and meal time. 

Use select k best to select the top 2 features for predicting tip amount. What are they?


In [6]:
train, validate, test = wrangle.subset_df(tips)
train.shape, validate.shape, test.shape

((146, 8), (49, 8), (49, 8))

In [7]:
X_train, y_train, X_validate, y_validate, X_test, y_test = modeling.xy_subsets(train, validate, test, 'tip')

In [8]:
X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
195,16.58,Male,Yes,Thur,Lunch,2,8.29
77,17.92,Male,Yes,Sat,Dinner,2,8.96
42,17.46,Male,No,Sun,Dinner,2,8.73
109,18.24,Male,No,Sat,Dinner,2,9.12
224,15.98,Female,No,Fri,Lunch,3,5.326667


In [9]:
mms = MinMaxScaler()
X_train[['total_bill', 'price_per_person']] = mms.fit_transform(train[['total_bill', 'price_per_person']])
X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
195,0.240346,Male,Yes,Thur,Lunch,2,0.311207
77,0.270084,Male,Yes,Sat,Dinner,2,0.349713
42,0.259876,Male,No,Sun,Dinner,2,0.336494
109,0.277186,Male,No,Sat,Dinner,2,0.358908
224,0.227031,Female,No,Fri,Lunch,3,0.1409


In [11]:
X_train = pd.get_dummies(X_train, columns=['sex', 'smoker', 'day', 'time', 'size'])

In [12]:
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(X_train, y_train)

In [14]:
f_select_mask = f_selector.get_support()

In [16]:
X_train.iloc[:,f_select_mask]

Unnamed: 0,total_bill,size_2
195,0.240346,1
77,0.270084,1
42,0.259876,1
109,0.277186,1
224,0.227031,0
79,0.377497,1
145,0.237017,1
27,0.169108,1
236,0.095872,1
214,0.166889,1


- The best two features using SelectKBest were total bill and whether or not the table size was 2 people

Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [17]:
X_train.head()

Unnamed: 0,total_bill,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
195,0.240346,0.311207,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,0
77,0.270084,0.349713,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0
42,0.259876,0.336494,0,1,1,0,0,0,1,0,1,0,0,1,0,0,0,0
109,0.277186,0.358908,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0
224,0.227031,0.1409,1,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0


In [18]:
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=2)

rfe.fit(X_train, y_train)

In [19]:
ranks = rfe.ranking_
columns = X_train.columns.tolist()

feature_ranks = pd.DataFrame({'ranking': ranks,
                              'feature': columns})
feature_ranks.sort_values('ranking')

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
16,2,size_5
12,3,size_1
13,4,size_2
17,5,size_6
2,6,sex_Female
14,7,size_3
11,8,time_Lunch
4,9,smoker_No


- According to the RSE function, the two best features were price per person and total bill

Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


- I think the reason that the answers were different was because the SelectKBest function's answers were correlated to eachother, while the RSe functions were not. I'm unsure if this will change significantly as we change the number of features, but I would say that yes it probably does

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [84]:
def select_kbest(X_train_scaled, y_train, k):
    
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X_train_scaled, y_train)
    
    f_select_mask = f_selector.get_support()
    best_features = X_train_scaled.iloc[:,f_select_mask]
    
    return best_features.head(k)

In [37]:
select_kbest(X_train, y_train, 3)

Unnamed: 0,total_bill,size_2,size_4
195,0.240346,1,0
77,0.270084,1,0
42,0.259876,1,0
109,0.277186,1,0
224,0.227031,0,0
79,0.377497,1,0
145,0.237017,1,0
27,0.169108,1,0
236,0.095872,1,0
214,0.166889,1,0


Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [82]:
def rfe(X_train_scaled, y_train, n_features):
    
    lm = LinearRegression()
    rfe = RFE(lm, n_features_to_select=n_features)
    
    rfe.fit(X_train_scaled, y_train)
    
    ranks = rfe.ranking_
    columns = X_train_scaled.columns.tolist()
        
    feature_ranks = pd.DataFrame({'ranking': ranks,
                                  'feature': columns})
    feature_ranks = feature_ranks.sort_values('ranking')
    
    return feature_ranks.head(n_features)

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).



In [59]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [61]:
train, validate, test = wrangle.subset_df(swiss)
train.shape, validate.shape, test.shape

((28, 6), (9, 6), (10, 6))

In [65]:
cols_to_scale = ['Agriculture', 'Catholic', 'Infant.Mortality']

In [66]:
train_scaled, validate_scaled, test_scaled = wrangle.scale_data(train, validate, test, MinMaxScaler(), cols_to_scale)

In [67]:
train_scaled.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Sarine,82.9,0.402685,16,13,0.911906,0.860759
Grandson,71.7,0.252349,17,8,0.011753,0.582278
Yverdon,65.4,0.460403,15,8,0.040368,0.740506
Herens,77.3,1.0,5,2,1.0,0.474684
Rive Droite,44.7,0.421477,16,29,0.493408,0.468354


In [68]:
X_train, y_train, X_validate, y_validate, X_test, y_test = modeling.xy_subsets(train_scaled, validate_scaled, test_scaled, 'Fertility')

In [69]:
X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Sarine,0.402685,16,13,0.911906,0.860759
Grandson,0.252349,17,8,0.011753,0.582278
Yverdon,0.460403,15,8,0.040368,0.740506
Herens,1.0,5,2,1.0,0.474684
Rive Droite,0.421477,16,29,0.493408,0.468354


In [71]:
X_train = pd.get_dummies(X_train, columns=['Examination', 'Education'])

In [87]:
select_kbest(X_train, y_train, 3)

Unnamed: 0,Catholic,Infant.Mortality,Education_29
Sarine,0.911906,0.860759,0
Grandson,0.011753,0.582278,0
Yverdon,0.040368,0.740506,0


In [83]:
rfe(X_train, y_train, 3)

Unnamed: 0,ranking,feature
32,1,Education_29
1,1,Catholic
10,1,Examination_15
