## Feature Engineering

In this file, I will use feature engineering to select the best predictors for the outcome on whether the customer is "great" or not. I will use a variety of criteria and choose the the features with top votes. 

### Load Libraries

In [4]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [18]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Load Data

In [9]:
df = pd.read_csv("cleaned.csv")
df.head()

Unnamed: 0,workclass,marital-status,occupation,race,sex,user_id,age,salary,education_rank,mins_beerdrinking_year,mins_exercising_year,works_hours,tea_per_year,coffee_per_year,great_customer_class
0,private,Never-married,sales,not_caucasian,Male,1004889.0,14.0,70773.0,9.0,0.0,0.0,40.0,399.0,359.708169,0.0
1,private,Divorced,sales,caucasian,Female,1012811.0,25.0,76597.0,9.0,0.0,0.0,30.0,256.0,359.708169,0.0
2,private,Never-married,clerical,caucasian,Female,1006870.0,21.0,47947.25,10.0,0.0,0.0,10.0,442.0,276.0,0.0
3,private,Divorced,sales,caucasian,Female,1022149.0,23.0,41740.25,7.0,0.0,0.0,20.0,217.276544,359.708169,0.0
4,private,Married,sales,not_caucasian,Male,1029558.0,26.0,37149.297355,9.0,447.920607,0.0,36.0,217.276544,120.0,0.0


In [10]:
cat_df = df[['workclass', 'marital-status','occupation','race', 'sex']]
num_df = df[['user_id', 'age', 'salary',
       'education_rank', 'mins_beerdrinking_year', 'mins_exercising_year', 
            'works_hours', 'tea_per_year', 'coffee_per_year', 'great_customer_class']]

### Data Processing 

In [11]:
label_encoder = LabelEncoder()

In [12]:
cat_df['workclass']= label_encoder.fit_transform(cat_df['workclass'])
cat_df['marital-status']= label_encoder.fit_transform(cat_df['marital-status'])
cat_df['occupation']= label_encoder.fit_transform(cat_df['occupation']) 
cat_df['race']= label_encoder.fit_transform(cat_df['race'])
cat_df['sex']= label_encoder.fit_transform(cat_df['sex'])
cat_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,workclass,marital-status,occupation,race,sex
0,1,2,9,1,1
1,1,0,9,0,0
2,1,2,1,0,0
3,1,0,9,0,0
4,1,1,9,1,1
...,...,...,...,...,...
13594,1,1,4,0,1
13595,2,1,9,0,1
13596,2,1,4,0,1
13597,1,0,4,0,1


### Remove Useless Columns

In [13]:
num_df = num_df.drop('great_customer_class', axis=1)
num_df

Unnamed: 0,user_id,age,salary,education_rank,mins_beerdrinking_year,mins_exercising_year,works_hours,tea_per_year,coffee_per_year
0,1004889.0,14.0,70773.000000,9.0,0.000000,0.000000,40.0,399.000000,359.708169
1,1012811.0,25.0,76597.000000,9.0,0.000000,0.000000,30.0,256.000000,359.708169
2,1006870.0,21.0,47947.250000,10.0,0.000000,0.000000,10.0,442.000000,276.000000
3,1022149.0,23.0,41740.250000,7.0,0.000000,0.000000,20.0,217.276544,359.708169
4,1029558.0,26.0,37149.297355,9.0,447.920607,0.000000,36.0,217.276544,120.000000
...,...,...,...,...,...,...,...,...,...
13594,1016807.0,42.0,55293.000000,13.0,0.000000,46.034224,40.0,277.000000,268.000000
13595,1038859.0,58.0,25928.250000,14.0,0.000000,0.000000,40.0,337.000000,359.708169
13596,1041214.0,75.0,16590.000000,7.0,447.920607,0.000000,35.0,217.276544,359.708169
13597,1038013.0,45.0,25536.750000,11.0,0.000000,0.000000,40.0,99.000000,79.000000


In [14]:
y = df.great_customer_class
X = pd.concat([cat_df, num_df], axis=1)
X = X.drop("user_id", axis = 1)

### Look at Top 10 Features

In [17]:
feature_name = list(X.columns)
num_feats=10

### Build Selectors

In [20]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

def embedded_rf_selector(X, y, num_feats):
    embedded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators = 100), 
                                          max_features = num_feats)
    embedded_rf_selector.fit(X, y)
    embedded_rf_support = embedded_rf_selector.get_support()
    embedded_rf_feature = X.loc[:, embedded_rf_support].columns.tolist()
    return embedded_rf_support, embedded_rf_feature

def rfe_selector(X, y, num_feats):
    X_norm =  MinMaxScaler().fit_transform(X)
    rfe_selector = RFE(estimator = LogisticRegression(), n_features_to_select = num_feats, 
                      step = 10, 
                      verbose = 5)
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:, rfe_support].columns.tolist()
    return rfe_support, rfe_feature

def chi_squared_selector(X, y, num_feats):
    X_norm =  MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k = num_feats)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    return chi_support, chi_feature

def embedded_log_reg_selector(X, y, num_feats):
    X_norm =  MinMaxScaler().fit_transform(X)
    embedded_lr_selector = SelectFromModel(LogisticRegression(penalty = "l2"), max_features = num_feats)
    embedded_lr_selector.fit(X_norm, y)
    embedded_lr_support = embedded_lr_selector.get_support()
    embedded_lr_feature = X.loc[:, embedded_lr_support].columns.tolist()
    return embedded_lr_support, embedded_lr_feature

In [21]:
cor_support, cor_feature = cor_selector(X, y,num_feats=10)
embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
chi_support, chi_feature = chi_squared_selector(X, y,num_feats)
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)

Fitting estimator with 13 features.


### Rank All Features and Choose the Best Ones

In [22]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embedded_lr_support,
                                    'Random Forest':embedded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,salary,True,True,True,True,True,5
2,mins_beerdrinking_year,True,True,True,True,True,5
3,works_hours,True,True,True,True,False,4
4,mins_exercising_year,True,True,True,True,False,4
5,education_rank,True,False,True,True,True,4
6,coffee_per_year,True,True,True,False,True,4
7,workclass,True,True,True,False,False,3
8,tea_per_year,True,True,False,False,True,3
9,occupation,True,True,True,False,False,3
10,marital-status,True,True,False,False,True,3


In [25]:
X = X[["salary", 'mins_beerdrinking_year', "works_hours", "mins_exercising_year", "education_rank", "coffee_per_year"]]

### Save to csv

In [28]:
df = pd.concat([X, y], axis=1)

In [29]:
df

Unnamed: 0,salary,mins_beerdrinking_year,works_hours,mins_exercising_year,education_rank,coffee_per_year,great_customer_class
0,70773.0,0.0,40.0,0.0,9.0,359.708169,0.0
1,76597.0,0.0,30.0,0.0,9.0,359.708169,0.0
2,47947.25,0.0,10.0,0.0,10.0,276.0,0.0
3,41740.25,0.0,20.0,0.0,7.0,359.708169,0.0
4,37149.297355,447.920607,36.0,0.0,9.0,120.0,0.0
5,59060.5,0.0,30.0,0.0,15.0,377.0,0.0
6,56400.75,0.0,40.0,0.0,5.0,98.0,0.0
7,11329.25,0.0,40.0,0.0,10.0,276.0,0.0
8,21850.5,0.0,40.0,0.0,10.0,359.708169,0.0
9,55331.0,0.0,40.0,0.0,9.0,77.0,0.0


In [30]:
df.to_csv('cleaned_v2.csv', index=False)