# Imports

In [1]:
import feature_engineering

from pydataset import data

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.linear_model import LinearRegression

# Aqcuire the Data

In [2]:
tips = data('tips')

In [3]:
tips.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2


# Prepare the Data

In [4]:
tips.rename(columns={'size': 'party_size'}, inplace=True)

In [5]:
tips['tip_percentage'] = tips.tip / tips.total_bill

In [6]:
tips['price_per_person'] = tips.total_bill / tips.party_size

In [7]:
tips.head(1) 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495


Hypothesis: Size of party will likely be the most attributing factor (the driver) for the tip. Tip percentage is likely the tip amount.

# Split data to test for tips

In [8]:
x = tips[['total_bill', 'tip_percentage', 'party_size', 'price_per_person']]

In [9]:
y = tips.tip

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, train_size=.8)

# Use SelectKBest and Recursive Feature Elimination (RFE) to get best features

## Using SelectKBest to identify our two best features to be used

In [11]:
f_selector = SelectKBest(f_regression, k=2)

In [12]:
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x125b710e0>)

In [13]:
X_reduced = f_selector.transform(X_train)

In [14]:
print(X_train.shape, X_reduced.shape)

(195, 4) (195, 2)


In [15]:
f_support = f_selector.get_support()

In [16]:
print(f_support)

[ True False  True False]


In [17]:
f_features = X_train.loc[:,f_support].columns.tolist()

In [18]:
print(str(len(f_features)), 'selected features')

2 selected features


In [19]:
print(f_features)

['total_bill', 'party_size']


## Using RFE to find best features for predicting the tip amount

In [20]:
# Initialize our linear regression object
lm = LinearRegression()

In [21]:
# Initialize the RFE object setting our hyperparamaters equal to the number of features we want to keep
rfe = RFE(lm, 2)

In [22]:
# Fit the RFE to our data
X_rfe = rfe.fit_transform(X_train, y_train)

In [23]:
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
# Get the column names from our rfe
mask = rfe.support_

In [25]:
rfe_features = X_train.loc[:,mask].columns.tolist()

In [26]:
print(str(len(rfe_features)), 'selected features')

2 selected features


In [27]:
print(rfe_features)

['total_bill', 'tip_percentage']


### Results of running SelectKBest and RFE

In [28]:
print(f'SelectKBest: {f_features}, RFE: {rfe_features}')

SelectKBest: ['total_bill', 'party_size'], RFE: ['total_bill', 'tip_percentage']


# Split data for identifying features to predict tip percentage

In [29]:
x = tips[['total_bill', 'tip', 'party_size', 'price_per_person']]

In [30]:
y = tips.tip

In [31]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, train_size=.8)

## Use SelectKBest

In [32]:
f_selector = SelectKBest(f_regression, k=2)

In [33]:
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x125b710e0>)

In [34]:
# Transform our data
X_reduced = f_selector.transform(X_train)

In [35]:
print(X_train.shape, X_reduced.shape)

(195, 4) (195, 2)


In [36]:
# Get the features SelectKBest wants us to keep
f_support = f_selector.get_support()

In [37]:
f_features = X_train.loc[:,f_support].columns.tolist()

In [38]:
print(f_features)

['total_bill', 'tip']


## Use RFE

In [39]:
lm = LinearRegression()

In [40]:
rfe = RFE(lm, 2)

In [41]:
X_rfe = rfe.fit_transform(X_train, y_train)

In [42]:
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
mask = rfe.support_

In [44]:
rfe_features = X_train.loc[:,mask].columns.tolist()

In [45]:
print(rfe_features)

['tip', 'party_size']


### Results of using RFE and SelectKBest for predicting tip percentage

In [46]:
print(f'SelectKBest: {f_features}, RFE: {rfe_features}')

SelectKBest: ['total_bill', 'tip'], RFE: ['tip', 'party_size']


### Reason why I'm getting different results is because the percentage of tips is heavily correlated with tip and total bill

In [47]:
def select_kbest(x, y, k):
    '''
    Takes:
        k - int: number of features
        x - df: driver features
        y - df: target feature
    Returns:
        X_reduced - df: transformed data
        f_features - list: name of features kept
    '''
    
    # split the data
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, train_size=.8)

    f_selector = SelectKBest(f_regression, k)
    f_selector.fit(X_train, y_train)
    
    X_reduced = f_selector.transform(X_train)
    
    f_support = f_selector.get_support()
    f_features = X_train.loc[:,f_support].columns.tolist()
    
    return X_reduced, f_features

In [48]:
x = tips[['total_bill', 'tip_percentage', 'party_size', 'price_per_person']]

In [49]:
y = tips.tip

In [50]:
X_reduced, f_features = select_kbest(X_train, y_train, 2)

  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


In [51]:
print(f_features)

['total_bill', 'tip']


In [52]:
def rfe(x, y, k):
    '''
    Takes:
        k - int: number of features
        x - df: driver features
        y - df: target feature
    Returns:
        X_rfe - df: transformed data
        rfe_features - list: name of features kept
    '''
    
    # split the data
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, train_size=.8)
    
    lm = LinearRegression()
    rfe = RFE(lm, k)
    X_rfe = rfe.fit_transform(X_train, y_train)
    lm.fit(X_rfe, y_train)
    
    mask = rfe.support_
    rfe_features = X_train.loc[:,mask].columns.tolist()
    
    return X_rfe, rfe_features

In [53]:
X_rfe, rfe_features = rfe(x, y, 2)

In [54]:
rfe_features

['total_bill', 'tip_percentage']

# Time to get Swiss up in here

In [55]:
swiss = data('swiss')

In [56]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [57]:
swiss.corr()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Fertility,1.0,0.353079,-0.645883,-0.663789,0.463685,0.416556
Agriculture,0.353079,1.0,-0.686542,-0.639523,0.401095,-0.060859
Examination,-0.645883,-0.686542,1.0,0.698415,-0.572742,-0.114022
Education,-0.663789,-0.639523,0.698415,1.0,-0.153859,-0.099322
Catholic,0.463685,0.401095,-0.572742,-0.153859,1.0,0.175496
Infant.Mortality,0.416556,-0.060859,-0.114022,-0.099322,0.175496,1.0


In [58]:
x = swiss[['Agriculture', 'Education', 'Catholic', 'Infant.Mortality']]

In [59]:
y = swiss.Fertility

In [60]:
X_reduced, f_features = select_kbest(x, y, 3)

In [61]:
print(f_features)

['Education', 'Catholic', 'Infant.Mortality']


In [62]:
x_rfe, rfe_features = rfe(x, y, 3)

In [63]:
print(rfe_features)

['Agriculture', 'Education', 'Infant.Mortality']


In [64]:
feature_engineering.select_kbest(x, y, 3)

Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')