# Feature Engineering

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from pydataset import data
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
def train_validate_test(df, target):
    '''
    this function takes in a dataframe and splits it into 3 samples, 
    a test, which is 20% of the entire dataframe, 
    a validate, which is 24% of the entire dataframe,
    and a train, which is 56% of the entire dataframe. 
    It then splits each of the 3 samples into a dataframe with independent variables
    and a series with the dependent, or target variable. 
    The function returns 3 dataframes and 3 series:
    X_train (df) & y_train (series), X_validate & y_validate, X_test & y_test. 
    '''
    # split df into test (20%) and train_validate (80%)
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)

    # split train_validate off into train (70% of 80% = 56%) and validate (30% of 80% = 24%)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

        
    # split train into X (dataframe, drop target) & y (series, keep target only)
    X_train = train.drop(columns=[target])
    y_train = train[target]
    
    # split validate into X (dataframe, drop target) & y (series, keep target only)
    X_validate = validate.drop(columns=[target])
    y_validate = validate[target]
    
    # split test into X (dataframe, drop target) & y (series, keep target only)
    X_test = test.drop(columns=[target])
    y_test = test[target]
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

### 1. Load the tips dataset.

In [3]:
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


#### A. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [4]:
df['tip_percentage'] = round(df.tip / df.total_bill , 3)

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059
2,10.34,1.66,Male,No,Sun,Dinner,3,0.161
3,21.01,3.5,Male,No,Sun,Dinner,3,0.167
4,23.68,3.31,Male,No,Sun,Dinner,2,0.14
5,24.59,3.61,Female,No,Sun,Dinner,4,0.147


In [6]:
df.info()
#categorical: sex, smoker,day, time 
#quantitative: totalbill, tip, tip percentage, price per person

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total_bill      244 non-null    float64
 1   tip             244 non-null    float64
 2   sex             244 non-null    object 
 3   smoker          244 non-null    object 
 4   day             244 non-null    object 
 5   time            244 non-null    object 
 6   size            244 non-null    int64  
 7   tip_percentage  244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


#### B. Create a column named price_per_person. This should be the total bill divided by the party size.

In [7]:
#could not do df.size bc it was dividing by size of dataframe, not the size colume
#put size in brackets so it knew which size I was refering to
df['price_per_person'] = round(df.total_bill/df['size'], 2)

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.161,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.167,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.14,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.147,6.15


In [9]:
16.99/2

8.495

#### C. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- the amount of the bill
- the size of the party
- time of day

#### D. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [10]:
X_train, y_train, X_validate, y_validate, X_test, y_test = train_validate_test(df, 'tip')

In [11]:
def get_numeric_cols(df):
    '''
    This function takes in a dataframe and identifies the columns that are numeric types
    and returns a list of those column names. 
    '''
    # create a mask of columns whether they are object type or not
    mask = np.array(df.dtypes != "object")

        
    # get a list of the column names that are objects (from the mask)
    numeric_cols = df.iloc[:, mask].columns.tolist()
    
    return numeric_cols

In [12]:
def get_object_cols(df):
    '''
    This function takes in a dataframe and identifies the columns that are object types
    and returns a list of those column names. 
    '''
    # create a mask of columns whether they are object type or not
    mask = np.array(df.dtypes == "object")

        
    # get a list of the column names that are objects (from the mask)
    object_cols = df.iloc[:, mask].columns.tolist()
    
    return object_cols

In [13]:
get_numeric_cols(df)

['total_bill', 'tip', 'size', 'tip_percentage', 'price_per_person']

###### K Best

In [14]:
X_train.shape

(136, 8)

In [15]:
X_train.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip_percentage,price_per_person
19,16.97,Female,No,Sun,Dinner,3,0.206,5.66
173,7.25,Male,Yes,Sun,Dinner,2,0.71,3.62
119,12.43,Female,No,Thur,Lunch,2,0.145,6.22
29,21.7,Male,No,Sat,Dinner,2,0.198,10.85
238,32.83,Male,Yes,Sat,Dinner,2,0.036,16.42


In [16]:
X_train.columns.tolist()

['total_bill',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'tip_percentage',
 'price_per_person']

In [17]:
numeric_df = X_train.drop(['sex', 'smoker', 'time', 'day'], axis=1)
numeric_df.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
19,16.97,3,0.206,5.66
173,7.25,2,0.71,3.62
119,12.43,2,0.145,6.22
29,21.7,2,0.198,10.85
238,32.83,2,0.036,16.42


In [18]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, give me 2  features
f_selector = SelectKBest(f_regression, k=2)

# find the top 2 X's correlated with y
f_selector.fit(numeric_df, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

In [19]:
# boolean mask is a name for an array of booleans
feature_mask

array([ True,  True, False, False])

In [20]:
# get list of top K features. 
f_feature = numeric_df.iloc[:,feature_mask].columns.tolist()

In [21]:
f_feature

['total_bill', 'size']

In [22]:
#Total bill and size are the best two

#### E. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [23]:
object_cols = get_object_cols(df)

In [24]:

def create_dummies(df, object_cols):
    '''
    This function takes in a dataframe and list of object column names,
    and creates dummy variables of each of those columns. 
    It then appends the dummy variables to the original dataframe. 
    It returns the original df with the appended dummy variables. 
    '''
    
    # run pd.get_dummies() to create dummy vars for the object columns. 
    # we will drop the column representing the first unique value of each variable
    # we will opt to not create na columns for each variable with missing values 
    # (all missing values have been removed.)
    dummy_df = pd.get_dummies(df[object_cols], dummy_na=False, drop_first=True)
    
    # concatenate the dataframe with dummies to our original dataframe
    # via column (axis=1)
    df = pd.concat([df, dummy_df], axis=1)

    return df

In [25]:
# create dummy vars
object_df = create_dummies(X_train, object_cols)

In [26]:
object_df.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip_percentage,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,16.97,Female,No,Sun,Dinner,3,0.206,5.66,0,0,0,1,0,0
173,7.25,Male,Yes,Sun,Dinner,2,0.71,3.62,1,1,0,1,0,0
119,12.43,Female,No,Thur,Lunch,2,0.145,6.22,0,0,0,0,1,1
29,21.7,Male,No,Sat,Dinner,2,0.198,10.85,1,0,1,0,0,0
238,32.83,Male,Yes,Sat,Dinner,2,0.036,16.42,1,1,1,0,0,0


In [27]:
object_cols

['sex', 'smoker', 'day', 'time']

In [28]:
object_df = object_df.drop(['sex', 'smoker', 'day', 'time', ], axis=1)

In [29]:
object_df = object_df.drop(['total_bill', 'size', 'tip_percentage', 'price_per_person'], axis=1)

In [30]:
object_df.head()

Unnamed: 0,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
19,0,0,0,1,0,0
173,1,1,0,1,0,0
119,0,0,0,0,1,1
29,1,0,1,0,0,0
238,1,1,1,0,0,0


In [31]:
object_df.dtypes

sex_Male      uint8
smoker_Yes    uint8
day_Sat       uint8
day_Sun       uint8
day_Thur      uint8
time_Lunch    uint8
dtype: object

In [32]:
scaler = MinMaxScaler(copy=True).fit(object_df)


X_train_scaled = scaler.transform(object_df)

In [33]:
X_train_scaled.shape

(136, 6)

In [34]:
y_train.shape

(136,)

In [35]:
f_selector = SelectKBest(f_regression, k=2)

In [36]:
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fcee453c940>)

In [37]:
# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

In [38]:
# boolean mask is a name for an array of booleans
feature_mask

array([False, False, False,  True, False,  True])

In [39]:
# get list of top K features. 
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

#### F. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [40]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [None]:
train, validte, test 