# Feature Engineering

## Exercises
Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

## Imports

In [1]:
# Standard imports
import numpy as np
import pandas as pd

# My imports
import wrangle as w
import explore as e

from pydataset import data

# Stats
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

### 1. Load the tips dataset.

    a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [2]:
# data('tips', show_doc=True) # view the documentation for the dataset
df = data('tips') # load the dataset and store it in a variable

In [3]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
df = df.rename(columns={'size':'number_of_people'})
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   number_of_people  244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [6]:
def get_object_cols(df):
    '''
    Return the names of the categorical-like columns of a dataframe.

    A column qualifies when its dtype is 'object' or 'category'. The names
    come back as a plain list, in the same order as they appear in df.
    '''
    categorical_dtypes = ['object', 'category']
    selected = df.select_dtypes(include=categorical_dtypes)

    return list(selected.columns)


def get_numeric_cols(df):
    '''
    This function takes in a dataframe and identifies the columns that are NOT
    object or category types (for the datasets in this notebook, the numeric
    columns) and returns a list of those column names.
    '''
    # Selection is exclude-based: every column whose dtype is not object/category
    # is kept, so bool or datetime columns would be returned as well.
    num_cols = df.select_dtypes(exclude=['object', 'category']).columns.tolist()

    return num_cols

In [7]:
get_object_cols(df)

['sex', 'smoker', 'day', 'time']

In [8]:
get_numeric_cols(df)

['total_bill', 'tip', 'number_of_people']

In [9]:
# Added a new column named price_per_person
df['price_per_person'] = round(df['total_bill']/df['number_of_people'],2)

In [10]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.50,Male,No,Sun,Dinner,3,7.00
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,9.68
241,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
242,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
243,17.82,1.75,Male,No,Sat,Dinner,2,8.91


#### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    
        Party size and total_bill

#### c. Use select k best to select the top 2 features for predicting tip amount. What are they?
    
        - total bill
        - size

In [11]:
# One-hot encode the categorical columns. drop_first is a single boolean flag
# (the original passed a list, which only worked because a non-empty list is truthy);
# True drops the first level of each column to avoid redundant dummies.
dummy_tips = pd.get_dummies(df[['sex','smoker','day','time']], dummy_na=False, drop_first=True)
df = pd.concat([df, dummy_tips], axis=1)

In [12]:
df = df.drop(columns =['sex', 'smoker', 'day','time'])

In [13]:
df

Unnamed: 0,total_bill,tip,number_of_people,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,8.49,0,0,0,1,0,0
2,10.34,1.66,3,3.45,1,0,0,1,0,0
3,21.01,3.50,3,7.00,1,0,0,1,0,0
4,23.68,3.31,2,11.84,1,0,0,1,0,0
5,24.59,3.61,4,6.15,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,3,9.68,1,0,1,0,0,0
241,27.18,2.00,2,13.59,0,1,1,0,0,0
242,22.67,2.00,2,11.34,1,1,1,0,0,0
243,17.82,1.75,2,8.91,1,0,1,0,0,0


In [14]:
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

In [15]:
# Scale the data to the [0, 1] range.
# The scaler is fit on train only; validate and test are transformed with
# train's min/max so nothing from those splits leaks into the scaling.
scaler = MinMaxScaler()
scaler.fit(train)

# scaler.transform returns a bare array, so restore the column names and the
# original row labels directly in the DataFrame constructor.
train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
validate_scaled = pd.DataFrame(scaler.transform(validate), columns=validate.columns, index=validate.index)
test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
train_scaled.head()

Unnamed: 0,total_bill,tip,number_of_people,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0.307114,0.3125,0.4,0.150581,0.0,0.0,0.0,1.0,0.0,0.0
173,0.092355,0.51875,0.2,0.031977,1.0,1.0,0.0,1.0,0.0,0.0
119,0.206805,0.1,0.2,0.18314,0.0,0.0,0.0,0.0,1.0,1.0
29,0.411622,0.4125,0.2,0.452326,1.0,0.0,1.0,0.0,0.0,0.0
238,0.657534,0.02125,0.2,0.776163,1.0,1.0,1.0,0.0,0.0,0.0


In [16]:
X_train, y_train = train_scaled.drop(columns=['tip']), train.tip
X_validate, y_validate = validate_scaled.drop(columns=['tip']), validate.tip
X_test, y_test = test_scaled.drop(columns=['tip']), test.tip

In [17]:
# MAKE the thing
kbest = SelectKBest(f_regression, k=2)

# FIT the thing
# fit() returns the selector itself, so there is nothing new to store;
# the trailing semicolon keeps the cell from echoing the selector's repr.
kbest.fit(X_train, y_train);

In [18]:
kbest.scores_

array([1.15984909e+02, 6.12590886e+01, 1.07835019e+01, 1.15479204e+00,
       3.44908618e-01, 3.18953072e-03, 2.40040378e+00, 1.07637964e+00,
       1.79864746e+00])

In [19]:
# p value: 
kbest.pvalues_

array([7.18647033e-20, 1.34164190e-12, 1.30659413e-03, 2.84479443e-01,
       5.57997755e-01, 9.55046793e-01, 1.23662519e-01, 3.01377404e-01,
       1.82144884e-01])

In [20]:
kbest.feature_names_in_

array(['total_bill', 'number_of_people', 'price_per_person', 'sex_Male',
       'smoker_Yes', 'day_Sat', 'day_Sun', 'day_Thur', 'time_Lunch'],
      dtype=object)

In [21]:
kbest_results = pd.DataFrame(
                dict(pvalues=kbest.pvalues_, feature_scores=kbest.scores_),
                index = X_train.columns)

In [22]:
kbest_results

Unnamed: 0,pvalues,feature_scores
total_bill,7.18647e-20,115.984909
number_of_people,1.341642e-12,61.259089
price_per_person,0.001306594,10.783502
sex_Male,0.2844794,1.154792
smoker_Yes,0.5579978,0.344909
day_Sat,0.9550468,0.00319
day_Sun,0.1236625,2.400404
day_Thur,0.3013774,1.07638
time_Lunch,0.1821449,1.798647


In [23]:
# get_support() will output a boolean mask to tell me which features were selected
kbest.get_support()

array([ True,  True, False, False, False, False, False, False, False])

In [24]:
# we can apply this mask to the columns in our original dataframe
X_train.columns[kbest.get_support()]

Index(['total_bill', 'number_of_people'], dtype='object')

####    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?
    
        - total_bill
        - price_per_person

In [25]:
# make a model object to use in RFE process.
# The model is here to give us metrics on feature importance and model score
# allowing us to recursively reduce the number of features to reach our desired space

model = LinearRegression()

In [26]:
# MAKE the thing
rfe = RFE(model, n_features_to_select=2)

# FIT the thing
rfe.fit(X_train, y_train)

In [27]:
# Get feature ranking
# Selected features are assigned a rank 1

rfe.ranking_

array([1, 2, 1, 3, 7, 5, 6, 4, 8])

In [28]:
rfe_ranking = pd.DataFrame({'rfe_ranking':rfe.ranking_},index=X_train.columns)
rfe_ranking.sort_values(by=['rfe_ranking'], ascending=True)

Unnamed: 0,rfe_ranking
total_bill,1
price_per_person,1
number_of_people,2
sex_Male,3
day_Thur,4
day_Sat,5
day_Sun,6
smoker_Yes,7
time_Lunch,8


In [29]:
X_train_RFEtransformed = pd.DataFrame(
    rfe.transform(X_train),
    index=X_train.index,
    columns = X_train.columns[rfe.support_])

In [30]:
rfe.get_support()

array([ True, False,  True, False, False, False, False, False, False])

In [31]:
top_k_rfe = X_train.columns[rfe.get_support()]

In [32]:
top_k_rfe

Index(['total_bill', 'price_per_person'], dtype='object')

In [33]:
top_k_rfe = rfe.get_feature_names_out()
top_k_rfe

array(['total_bill', 'price_per_person'], dtype=object)

In [34]:
# X_train_RFEtransformed

    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
    
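    - Select k best scores each feature on its own (a univariate F-test against tip), while recursive feature elimination fits a linear regression on all of the features together and repeatedly drops the weakest one, so how a feature ranks depends on which other features are in the model.
    - With 2 features, select k best chose total_bill and number_of_people, while RFE chose total_bill and price_per_person.
    - Yes, this changes with the number of features: with 3 features both methods select the same set (total_bill, number_of_people, price_per_person).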

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [35]:
def select_kbest(X,y,k):
    '''
    Select the top k features for predicting y using SelectKBest with the
    f_regression (univariate F-test) scoring function.

    X: dataframe of predictors (e.g. X_train)
    y: the target (e.g. y_train)
    k: the number of features to select

    Returns a pandas Index holding the names of the k selected columns,
    in the same order as they appear in X.
    '''
    # MAKE the thing
    kbest = SelectKBest(f_regression, k=k)

    # FIT the thing
    kbest.fit(X, y)

    # get_support() is a boolean mask over X's columns; apply it to get the names
    top_k = X.columns[kbest.get_support()]

    return top_k

In [36]:
select_kbest(X_train,y_train,4)

Index(['total_bill', 'number_of_people', 'price_per_person', 'day_Sun'], dtype='object')

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [37]:
def rfe(X,y,k):
    '''
    Select the top k features for predicting y using recursive feature
    elimination (RFE) driven by a LinearRegression model.

    X: dataframe of predictors (e.g. X_train)
    y: the target (e.g. y_train)
    k: the number of features to select

    Returns a pandas Index holding the names of the k selected columns,
    in the same order as they appear in X.
    '''
    # make a model object to use in RFE process.
    # The model is here to give us metrics on feature importance
    # allowing us to recursively reduce the number of features to reach our desired space
    model = LinearRegression()

    # MAKE the thing
    # (named `selector` so the local does not shadow this function's own name)
    selector = RFE(model, n_features_to_select=k)

    # FIT the thing
    selector.fit(X, y)

    # get_support() is a boolean mask over X's columns; apply it to get the names
    top_k_rfe = X.columns[selector.get_support()]

    return top_k_rfe

In [38]:
rfe(X_train,y_train,2)

Index(['total_bill', 'price_per_person'], dtype='object')

### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [39]:
# data('swiss', show_doc=True) # view the documentation for the dataset
df = data('swiss') # load the dataset and store it in a variable

In [40]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [41]:
train, validate, test = w.get_split(df)

In [42]:
train.shape, validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [43]:
X_train, y_train = train.drop(columns=['Fertility']), train.Fertility
X_validate, y_validate = validate.drop(columns=['Fertility']), validate.Fertility
X_test, y_test = test.drop(columns=['Fertility']), test.Fertility

In [44]:
to_scale = X_train.columns.tolist()

In [45]:
X_validate.shape

(12, 5)

In [46]:
X_train_scaled, X_validate_scaled, X_test_scaled = w.scale_data(X_train,X_validate,X_test,to_scale)
X_train_scaled

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,0.647561,0.40625,0.290323,0.054508,0.122449
Lavaux,0.796341,0.5,0.258065,0.004508,0.5
Nyone,0.526829,0.59375,0.354839,0.130533,0.163265
Conthey,0.953659,0.0,0.032258,0.997029,0.0
Yverdon,0.509756,0.375,0.225806,0.03791,0.755102
Oron,0.77439,0.28125,0.0,0.0,0.602041
Cossonay,0.75122,0.59375,0.129032,0.004303,0.367347
St Maurice,0.831707,0.1875,0.258065,0.990369,0.27551
Franches-Mnt,0.390244,0.0625,0.129032,0.932377,0.520408
Orbe,0.565854,0.53125,0.16129,0.018443,0.020408


In [48]:
select_kbest(X_train,y_train,3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [49]:
rfe(X_train,y_train,3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')

In [51]:
select_kbest(X_train_scaled,y_train,3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [52]:
rfe(X_train_scaled,y_train,3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')