In [1]:
#standard imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pydataset import data
import wrangle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

#### 1. Load the tips dataset.

In [2]:
#load seaborn's example "tips" dataset into a DataFrame and preview the first rows
tips = sns.load_dataset("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
#check the column dtypes and non-null counts before any preprocessing
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [4]:
#sex, smoker, day and time are categorical (see tips.info() above), so one-hot encode them
#dtype is pinned to uint8 so the 0/1 columns shown below are reproduced on pandas >= 2.0,
#where get_dummies defaults to bool (True/False) columns instead
get_dummies = pd.get_dummies(tips[["sex","smoker","day","time"]], dtype="uint8")
get_dummies.head(6)

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1
5,1,0,0,1,0,0,0,1,0,1


In [5]:
#attach the dummy columns alongside the original ones (side by side, matched on the row index)
tips = pd.concat(objs=[tips, get_dummies], axis="columns")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,Female,No,Sun,Dinner,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0,1,0,1,0,0,0,1,0,1


In [6]:
#the dummy columns now carry this information, so remove the original categorical columns
tips = tips.drop(["sex","smoker","day","time"], axis="columns")


##### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [7]:
#use bracket indexing for "size": tips.size would return the DataFrame's built-in
#size attribute (total number of cells) rather than the party-size column
tips["price_per_person"] = tips["total_bill"] / tips["size"]
tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner,price_per_person
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1,8.495
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1,3.446667
2,21.01,3.5,3,1,0,0,1,0,0,0,1,0,1,7.003333
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1,11.84
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1,6.1475


#### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [8]:
#re-check the columns and dtypes after encoding the categoricals and adding price_per_person
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   size              244 non-null    int64  
 3   sex_Male          244 non-null    uint8  
 4   sex_Female        244 non-null    uint8  
 5   smoker_Yes        244 non-null    uint8  
 6   smoker_No         244 non-null    uint8  
 7   day_Thur          244 non-null    uint8  
 8   day_Fri           244 non-null    uint8  
 9   day_Sat           244 non-null    uint8  
 10  day_Sun           244 non-null    uint8  
 11  time_Lunch        244 non-null    uint8  
 12  time_Dinner       244 non-null    uint8  
 13  price_per_person  244 non-null    float64
dtypes: float64(3), int64(1), uint8(10)
memory usage: 10.1 KB


In [9]:
#correlation between every pair of columns; the "tip" row shows how each feature relates to the target
tips.corr()

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner,price_per_person
total_bill,1.0,0.675734,0.598315,0.144877,-0.144877,0.085721,-0.085721,-0.138174,-0.086168,0.054919,0.122953,-0.183118,0.183118,0.647497
tip,0.675734,1.0,0.489299,0.088862,-0.088862,0.005929,-0.005929,-0.095879,-0.055463,-0.00279,0.125114,-0.121629,0.121629,0.347393
size,0.598315,0.489299,1.0,0.086195,-0.086195,-0.133178,0.133178,-0.072598,-0.142184,-0.041121,0.193054,-0.103411,0.103411,-0.175412
sex_Male,0.144877,0.088862,0.086195,1.0,-1.0,0.002816,-0.002816,-0.194445,-0.07106,0.053957,0.168106,-0.205231,0.205231,0.108604
sex_Female,-0.144877,-0.088862,-0.086195,-1.0,1.0,-0.002816,0.002816,0.194445,0.07106,-0.053957,-0.168106,0.205231,-0.205231,-0.108604
smoker_Yes,0.085721,0.005929,-0.133178,0.002816,-0.002816,1.0,-1.0,-0.128534,0.244316,0.155744,-0.181624,-0.054921,0.054921,0.229938
smoker_No,-0.085721,-0.005929,0.133178,-0.002816,0.002816,-1.0,1.0,0.128534,-0.244316,-0.155744,0.181624,0.054921,-0.054921,-0.229938
day_Thur,-0.138174,-0.095879,-0.072598,-0.194445,0.194445,-0.128534,0.128534,1.0,-0.169608,-0.43448,-0.392566,0.917996,-0.917996,-0.09329
day_Fri,-0.086168,-0.055463,-0.142184,-0.07106,0.07106,0.244316,-0.244316,-0.169608,1.0,-0.216319,-0.195451,0.058159,-0.058159,0.024442
day_Sat,0.054919,-0.00279,-0.041121,0.053957,-0.053957,0.155744,-0.155744,-0.43448,-0.216319,1.0,-0.500682,-0.462709,0.462709,0.07653


In [10]:
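#My guess before running any feature-selection method: total_bill and size, since they have the strongest correlations with tip in the table above (about 0.68 and 0.49).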

#### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
#hold out 20% of the rows as the test set
train_validate, test = train_test_split(tips, random_state=123, test_size=0.2)
#split the remaining rows again: 70% to train, 30% to validate
train, validate = train_test_split(train_validate, random_state=123, test_size=0.3)

In [12]:
#(rows, columns) of each split, to confirm the split sizes
train.shape,validate.shape, test.shape

((136, 14), (59, 14), (49, 14))

In [13]:
#predictors: every column of the training split except the target
X_train = train.drop(columns="tip")
#target: the tip amount
Y_train = train["tip"]

In [14]:
#SelectKBest: score the features against the target with f_regression and keep the best k
#make the thing (k=2 -> keep the top two features)
kbest = SelectKBest(score_func=f_regression, k=2)
#fit the thing on the training split only
kbest.fit(X_train, Y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fe85f3b5160>)

In [15]:
# get_support() outputs a boolean mask telling us which features were selected
# we can apply this mask to the columns of our original dataframe to get their names
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

SelectKBest picks `total_bill` and `size` as the top 2 features for predicting the tip amount.

#### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [16]:
#make a model object to use in the RFE process.
#The model gives RFE a measure of feature importance,
#allowing it to recursively reduce the number of features.
#LinearRegression is already imported in the first cell, so it is not re-imported here.
model = LinearRegression()

In [17]:
# wrap the LinearRegression model in RFE and ask it to keep 2 features
# NOTE(review): with a linear model RFE presumably ranks features by coefficient size, and
# X_train is unscaled here, so the ranking may reflect feature units -- confirm / consider scaling
rfe = RFE(estimator=model, n_features_to_select=2)
# fit on the training split; fit() returns the fitted RFE object, which the cell displays
rfe.fit(X_train, Y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [18]:
#get_support() gives a boolean mask with one entry per column of X_train (True = feature kept)
rfe.get_support()

array([False,  True, False, False, False, False,  True, False, False,
       False, False, False, False])

In [19]:
#apply the boolean mask to the column index to get the names of the selected features
X_train.columns[rfe.get_support()]

Index(['size', 'day_Thur'], dtype='object')

RFE picks `size` and `day_Thur` (Thursday) as the top 2 features.

#### e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [20]:
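#SelectKBest scores each feature on its own with a univariate statistical test (f_regression here), so it favors features that are individually related to the target.
#RFE instead fits an actual regression model on all remaining features together and repeatedly drops the weakest one, so its ranking depends on the other features in the model (and, for a linear model, on the scale of each feature).
#The two selections can therefore differ for any number of features below the total; they necessarily agree once the number selected equals the number of features.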

#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [21]:
def select_kbest(X, y, k=3):
    """
    Return the names of the k features that SelectKBest keeps when
    scoring with f_regression.

    X: DataFrame of predictors (its .columns supply the feature names)
    y: target values to score the predictors against
    k: number of features to keep (defaults to 3)

    returns: the entries of X.columns picked by the fitted selector
    """
    #build the selector and fit it in one step; fit() hands back the selector itself
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    #boolean mask over the columns of X: True where the feature was kept
    chosen = selector.get_support()
    return X.columns[chosen]
    
    

In [22]:
#test the function on the tips training split; it should match the manual SelectKBest result above
select_kbest(X_train,Y_train, 2)

Index(['total_bill', 'size'], dtype='object')

#### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [23]:
def select_rfe(X, y, n_features_to_select=2):
    """
    Return the names of the features kept by recursive feature
    elimination (RFE) wrapped around a LinearRegression estimator.

    X: DataFrame of predictors (its .columns supply the feature names)
    y: target values the estimator is fit against
    n_features_to_select: number of features to keep (defaults to 2)

    returns: the entries of X.columns picked by the fitted RFE object
    """
    #a fresh LinearRegression is created on every call, so no outside model is needed
    eliminator = RFE(estimator=LinearRegression(), n_features_to_select=n_features_to_select)
    eliminator.fit(X, y)
    #boolean mask over the columns of X: True where the feature was kept
    keep_mask = eliminator.get_support()
    return X.columns[keep_mask]

In [24]:
#test the function on the tips training split; it should match the manual RFE result above
select_rfe(X_train,Y_train, n_features_to_select = 2)

Index(['size', 'day_Thur'], dtype='object')

#### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [25]:
#load the swiss dataset with pydataset's data() helper and preview the first rows
swiss = data("swiss")
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [26]:
#same split recipe as for tips; this rebinds train / validate / test to the swiss data
#hold out 20% of the rows as the test set
train_validate, test = train_test_split(swiss, random_state=123, test_size=0.2)
#split the remaining rows again: 70% to train, 30% to validate
train, validate = train_test_split(train_validate, random_state=123, test_size=0.3)

In [27]:
#check the split: (rows, columns) of each piece
train.shape, validate.shape, test.shape

((25, 6), (12, 6), (10, 6))

In [31]:
#predictors: every swiss column except the target (this rebinds X_train from the tips data)
X_train = train.drop(columns="Fertility")
#target: Fertility (note the lowercase y_train here, unlike Y_train used for tips)
y_train = train.Fertility

In [32]:
#use the select_kbest function defined above to find the top 3 features for Fertility
select_kbest(X_train,y_train, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [33]:
#use the select_rfe function defined above to find the top 3 features for Fertility
select_rfe(X_train,y_train, n_features_to_select = 3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')

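The two methods did not select exactly the same features: both keep Examination and Infant.Mortality, but SelectKBest picks Catholic as the third feature while RFE picks Agriculture.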