# Feature-Engineering

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

In [17]:
from pydataset import data
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

## 1.

### Load the tips dataset.

In [2]:
# Load the tips dataset through pydataset's data() helper and preview the first rows
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.shape

(244, 7)

In [4]:
# Rename the 'size' column to 'group_size': `size` is also a pandas DataFrame
# attribute, so attribute-style access (df.size) would not return this column
df = df.rename(columns={'size': 'group_size'})
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,group_size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. 
- Create a column named price_per_person. 
- This should be the total bill divided by the party size.

In [5]:
# Per-person cost: the total bill divided by the number of people in the party
df['price_per_person'] = df['total_bill'] / df['group_size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,group_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


### b. 
- Before using any of the methods discussed in the lesson, 
- which features do you think would be most important for predicting the tip amount?

To predict tips in the future, I think the best method would be to look at the average amount tipped in comparison to the total bill (otherwise known as the tip percentage). To do this, we will need to add a tip percentage column to the dataframe.

In [6]:
# Tip expressed as a fraction of the total bill
# NOTE(review): this feature is computed from `tip`, which is later used as the
# prediction target while tip_percentage stays among the predictors — confirm
# that this target-derived feature is intended
df['tip_percentage'] = df['tip'] / df['total_bill']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,group_size,price_per_person,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,0.146808


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   group_size        244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   tip_percentage    244 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 19.1+ KB


-- I will only need the numeric values for this.

In [8]:
# Keep only the numeric columns; the object-dtype columns (sex, smoker, day, time) are dropped
df = df.loc[:, ['total_bill', 'tip', 'group_size', 'price_per_person', 'tip_percentage']]
df.head()

Unnamed: 0,total_bill,tip,group_size,price_per_person,tip_percentage
1,16.99,1.01,2,8.495,0.059447
2,10.34,1.66,3,3.446667,0.160542
3,21.01,3.5,3,7.003333,0.166587
4,23.68,3.31,2,11.84,0.13978
5,24.59,3.61,4,6.1475,0.146808


### c. 
- Use select k best to select the top 2 features for predicting tip amount. What are they?

In [9]:
# split the data before feature engineering
def split(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    The split happens in two steps: the test set is carved off first, then
    the remaining rows are divided into train and validate.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : proportion of df held out as the test set (default .2).
    validate_size : proportion of the rows remaining after the test set is
        removed that is held out as the validate set (default .3).
    random_state : seed passed to both train_test_split calls so the split
        is reproducible (default 123).

    Returns
    -------
    train, validate, test : the three DataFrames, in that order.
    '''
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state)
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state)
    return train, validate, test

In [10]:
train, validate, test = split(df)

In [12]:
train.shape, validate.shape, test.shape

((136, 5), (59, 5), (49, 5))

In [14]:
# Name of the column being predicted
target = "tip"

# For every split, X is all columns except the target (DataFrame)
# and y is the target column on its own (Series)
X_train, y_train = train.drop(columns=[target]), train[target]
X_validate, y_validate = validate.drop(columns=[target]), validate[target]
X_test, y_test = test.drop(columns=[target]), test[target]

X_train.head()

Unnamed: 0,total_bill,group_size,price_per_person,tip_percentage
19,16.97,3,5.656667,0.206246
173,7.25,2,3.625,0.710345
119,12.43,2,6.215,0.144811
29,21.7,2,10.85,0.198157
238,32.83,2,16.415,0.035638


In [32]:
# Build a MinMaxScaler and fit it on the training features only
scaler = sklearn.preprocessing.MinMaxScaler().fit(X_train)

# Transform all three splits with the train-fitted scaler
# NOTE(review): the feature-selection cells that follow fit on the unscaled
# X_train rather than on these scaled arrays
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_validate),
    scaler.transform(X_test),
)

In [34]:
# Apply Select K Best filter method

from sklearn.feature_selection import SelectKBest, f_regression

In [39]:
# Build the filter (f_regression scoring, keep 2 features) and fit it on the training split
kbest = SelectKBest(f_regression, k=2).fit(X_train, y_train)

# Tabulate the p-value and F-score of every feature, one row per feature
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
group_size,1.341642e-12,61.259089
price_per_person,0.001310327,10.777792
tip_percentage,7.63645e-06,21.686627


In [36]:
# Obtain list of top 2 features to predict tip amount
X_train.columns[kbest.get_support()]

Index(['total_bill', 'group_size'], dtype='object')

In [72]:
# Training features reduced to the columns SelectKBest kept, with the original row index
X_train_transformed = pd.DataFrame(
    data=kbest.transform(X_train),
    columns=X_train.columns[kbest.get_support()],
    index=X_train.index,
)
X_train_transformed.head().sort_index()

Unnamed: 0,total_bill,group_size
19,16.97,3.0
29,21.7,2.0
119,12.43,2.0
173,7.25,2.0
238,32.83,2.0


### d. 
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [68]:
# Estimator that RFE refits while eliminating features
model = LinearRegression()
# Build the eliminator (keep 2 features) and fit it on the training split
# NOTE(review): X_train is unscaled here; if RFE ranks features by coefficient
# magnitude, feature scale can influence the ranking — consider confirming the
# result on the scaled features
rfe = RFE(model, n_features_to_select=2).fit(X_train, y_train)
# One row per feature with its RFE rank, lowest rank first
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=X_train.columns).sort_values('rfe_ranking')

Unnamed: 0,rfe_ranking
group_size,1
tip_percentage,1
total_bill,2
price_per_person,3


In [53]:
X_train.columns[rfe.get_support()]

Index(['group_size', 'tip_percentage'], dtype='object')

In [54]:
# Training features reduced to the columns RFE kept, with the original row index
X_train_transformed = pd.DataFrame(
    data=rfe.transform(X_train),
    columns=X_train.columns[rfe.get_support()],
    index=X_train.index,
)
X_train_transformed.head()

Unnamed: 0,group_size,tip_percentage
19,3.0,0.206246
173,2.0,0.710345
119,2.0,0.144811
29,2.0,0.198157
238,2.0,0.035638


### e. 
- Why do you think select k best and recursive feature elimination might give different answers for the top features? 
    - Select k best looks at each feature in isolation.
    - RFE looks at multiple features together, which lets it disentangle correlated independent variables.

    - Example: predicting an exam score from coffee consumption and hours of sleep.




- Does this change as you change the number of features you are selecting?

## 2. 

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [78]:
def select_kbest(X, y, k):
    '''
    Return the names of the top k features chosen by SelectKBest.

    Parameters
    ----------
    X : DataFrame of predictors; its column labels are the feature names.
    y : target values aligned with the rows of X.
    k : number of features to select.

    Returns
    -------
    pandas Index holding the names of the k selected columns of X.
    '''
    # Make the filter, scoring each feature with f_regression
    kbest = SelectKBest(f_regression, k=k)

    # Fit the filter on the data that was passed in
    kbest.fit(X, y)

    # Mask the columns of X itself (not a notebook-level X_train) so the
    # function works for any dataset
    return X.columns[kbest.get_support()]


In [79]:
select_kbest(X_train, y_train, 2)

Index(['total_bill', 'group_size'], dtype='object')

## 3.

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [80]:
def rfe(X, y, n):
    '''
    Return the names of the top n features chosen by recursive feature
    elimination with a LinearRegression estimator.

    Parameters
    ----------
    X : DataFrame of predictors; its column labels are the feature names.
    y : target values aligned with the rows of X.
    n : number of features to select.

    Returns
    -------
    pandas Index holding the names of the n selected columns of X.
    '''
    model = LinearRegression()

    # Make the eliminator; the local is named `selector` so it does not
    # shadow this function's own name
    selector = RFE(model, n_features_to_select=n)
    # Fit it on the data that was passed in
    selector.fit(X, y)
    # Mask the columns of X itself (not a notebook-level X_train) so the
    # function works for any dataset
    return X.columns[selector.get_support()]

In [81]:
rfe(X_train, y_train, 2)

Index(['group_size', 'tip_percentage'], dtype='object')

## 4. 

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).