In [40]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

df = pydataset.data('tips')
df['smoker'] = (tips.smoker == 'Yes').astype(int)
df['dinner'] = (tips.time == 'Dinner').astype(int)

In [9]:
def split(df, stratify_by=""):
    '''
    take in a DataFrame and return train, validate, and test DataFrames.
    return train, validate, test DataFrames.
    '''
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123)
    return train, validate, test

## 1. Load the tips dataset.

a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [10]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner
1,16.99,1.01,Female,0,Sun,Dinner,2,1
2,10.34,1.66,Male,0,Sun,Dinner,3,1
3,21.01,3.5,Male,0,Sun,Dinner,3,1
4,23.68,3.31,Male,0,Sun,Dinner,2,1
5,24.59,3.61,Female,0,Sun,Dinner,4,1


In [11]:
# Rename the size column because .size is a built-in Pandas attribute
df = df.rename(columns={'size': 'number_of_people'})

In [12]:
df['tip_percentage'] = round(df['tip'] / df['total_bill'],2)

In [13]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,dinner,tip_percentage
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.06
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.16


### 1b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [16]:
#create new column price_per_person, use brackets since size is a keyword
df['price_per_person'] = (df['total_bill']) / (df['number_of_people'])

In [17]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,dinner,tip_percentage,price_per_person
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.06,8.495
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.16,3.446667


In [18]:
# For this specific exercise, we're only focusing on the numeric features
df = df[["total_bill", "tip", "number_of_people", "tip_percentage", "price_per_person"]]

In [19]:
# Split the data
# note: statify_by will not always work with continuous targets
train, validate, test = split(df, stratify_by="tip")

In [20]:
target = "tip"

# split train into X (dataframe, drop target) & y (series, keep target only)
X_train = train.drop(columns=[target, 'tip_percentage'])
y_train = train[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
X_validate = validate.drop(columns=[target, 'tip_percentage'])
y_validate = validate[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
X_test = test.drop(columns=[target, 'tip_percentage'])
y_test = test[target]

X_train.head()

Unnamed: 0,total_bill,number_of_people,price_per_person
19,16.97,3,5.656667
173,7.25,2,3.625
119,12.43,2,6.215
29,21.7,2,10.85
238,32.83,2,16.415


In [23]:
# Scale (Make the thing)
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler, (fit the thing)
scaler.fit(X_train)

# Use the scaler to transform train, validate, test (use the thing)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [24]:
pd.DataFrame(X_train_scaled).head(2)

Unnamed: 0,0,1,2
0,0.307114,0.4,0.150344
1,0.092355,0.2,0.032258


### 1c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [7]:
# I think the total bill, number of people and time will be most important to get these amounts.


### 1d.  Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [25]:
k = 2

# Let's start with Select K Best:

# Make the thing
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

# fit the thing
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7f9552e6ca60>)

In [26]:
kbest.get_support()

array([ True,  True, False])

In [27]:
type(X_train.columns)

pandas.core.indexes.base.Index

In [30]:
# use the thing. 
kbest_features = list(X_train.columns[kbest.get_support()])

print("KBest's 2 best features are", kbest_features)

KBest's 2 best features are ['total_bill', 'number_of_people']


In [31]:
# RFE time

# Make the thing(s)
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(X_train, y_train)

# use the thing
rfe_columns = X_train.columns[rfe.support_].tolist()
rfe_columns

['total_bill', 'price_per_person']

### 1e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [32]:
# use the thing. 
kbest_features = list(y_train.columns[kbest.get_support()])

print("KBest's 2 best features are", kbest_features)

AttributeError: 'Series' object has no attribute 'columns'

In [33]:
# RFE time

# Make the thing(s)
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(X_train, y_train)

# use the thing
rfe_columns = X_train.columns[rfe.support_].tolist()
rfe_columns

['total_bill', 'price_per_person']

### 1f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [None]:
# they are looking at different methods to reach the conclusion. Yes it does change.

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [34]:
def select_kbest(X, y, k):
    # make the object
    kbest = sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.f_regression,
        k=k)

    # fit the object
    kbest.fit(X, y)
    
    # use the object (.get_support() is that array of booleans to filter the list of column names)
    return X.columns[kbest.get_support()].tolist()

kbest_feats = select_kbest(X_train, y_train, 2)

In [35]:
kbest_feats

['total_bill', 'number_of_people']

In [36]:
def show_features_rankings(X_train, rfe):
    """
    Takes in a dataframe and a fit RFE object in order to output the rank of all features
    """
    # rfe here is reference rfe from cell 15
    var_ranks = rfe.ranking_
    var_names = X_train.columns.tolist()
    ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    ranks = ranks.sort_values(by="Rank", ascending=True)
    return ranks

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [37]:
def select_rfe(X, y, k):
    # make the thing
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    # Fit the thing
    rfe.fit(X, y)
    
    # use the thing
    features_to_use = X.columns[rfe.support_].tolist()
    
    # we need to send show_feature_rankings a trained/fit RFE object
    all_rankings = show_features_rankings(X, rfe)
    
    return features_to_use, all_rankings

In [38]:
# (Bonus)
# Use RFE to produce a dataframe of the ranked features

# rfe here is reference rfe from cell 15
var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()
ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
ranks.sort_values(by="Rank", ascending=True)

Unnamed: 0,Var,Rank
0,total_bill,1
2,price_per_person,1
1,number_of_people,2


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [41]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [42]:
# Split the data
train, validate, test = split(swiss, stratify_by="Fertility")

# Setup X and y
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

X_validate = validate.drop(columns='Fertility')
y_validate = validate.Fertility

X_test = test.drop(columns='Fertility')
y_test = test.Fertility

In [43]:
# Scale the data
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler
scaler.fit(X_train)

# Use the scaler to transform train, validate, test
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)


# Turn everything into a dataframe
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_validate_scaled = pd.DataFrame(X_validate_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns)

In [44]:
# Find the top 3 features using kbest
select_kbest(X_train_scaled, y_train, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [45]:
# Find the top 3 features using RFE
selected_features, all_rankings = select_rfe(X_train, y_train, 3)
print(selected_features)
all_rankings

['Agriculture', 'Examination', 'Infant.Mortality']


Unnamed: 0,Var,Rank
0,Agriculture,1
1,Examination,1
4,Infant.Mortality,1
2,Education,2
3,Catholic,3
