## Python - Extensive Feature Selection

In [233]:
import pandas as pd
import numpy as np

data = pd.read_csv("EmployeeChurn.csv")
data.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Travel,Rate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [234]:
my_tab = pd.crosstab(index=data["Is_Attrite"], columns="count")
data.Is_Attrite.value_counts()/len(data)*100

No     83.877551
Yes    16.122449
Name: Is_Attrite, dtype: float64

##### Create categorial responce to numeric type and do some feature engineering

In [235]:
target_map = {"Yes":1,"No":0}
data.Is_Attrite = data.Is_Attrite.map(target_map)

data['bandMonthlyIncome'] = pd.cut(data['MonthlyIncome'], 5)
data[['bandMonthlyIncome', 'Is_Attrite']].groupby(['bandMonthlyIncome'], as_index=False).mean().sort_values(by='bandMonthlyIncome', ascending=True)
data.loc[data['MonthlyIncome'] <= 990.01, 'MonthlyIncome'] = 0
data.loc[(data['MonthlyIncome'] > 990.01) & (data['MonthlyIncome'] <= 4807.0), 'MonthlyIncome'] = 1
data.loc[(data['MonthlyIncome'] > 4807.0) & (data['MonthlyIncome'] <= 8605.0), 'MonthlyIncome'] = 2
data.loc[(data['MonthlyIncome'] > 8605.0) & (data['MonthlyIncome'] <= 12403.0), 'MonthlyIncome'] = 3
data.loc[(data['MonthlyIncome'] > 12403.0) & (data['MonthlyIncome'] <= 16201.0), 'MonthlyIncome'] = 4
data.loc[(data['MonthlyIncome'] > 16201.0) & (data['MonthlyIncome'] <= 19999.0), 'MonthlyIncome'] = 5
data.loc[data['MonthlyIncome'] > 19999, 'MonthlyIncome'] =6
data.drop(['bandMonthlyIncome'],axis = 1,inplace = True)
data.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Travel,Rate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [236]:
import warnings
warnings.filterwarnings('ignore')

data_0 = data.loc[data.Is_Attrite==0].sample(frac = .1, replace = False)
data_1 = data.loc[data.Is_Attrite==1].sample(frac = .1, replace = False)

print("label 0 sample size .1 fraction",str(data_0.shape[0]))
print("label 1 sample size .1 fraction",str(data_1.shape[0]))

label 0 sample size .1 fraction 123
label 1 sample size .1 fraction 24


In [237]:
categorical_list = []
numerical_list = []
for col in data.drop('Is_Attrite',axis = 1).columns.tolist() :
    if data[col].dtype == "O":
        categorical_list.append(col)
    else :
        numerical_list.append(col)
print("Num categorical",len(categorical_list))    
print("Num numerical",len(numerical_list))

Num categorical 8
Num numerical 26


##### As EmployeeNumber is unique so better remove this from dependent variables

In [238]:
data = data.drop(["EmployeeNumber"], axis = 1)
data_final = pd.get_dummies(data, drop_first=True)
data_final.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Rate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1,1102,1,2,1,2,94,3,2,...,0,0,0,0,0,1,0,0,1,1
1,49,0,279,8,1,1,3,61,2,2,...,0,0,0,0,1,0,0,1,0,0
2,37,1,1373,2,2,1,4,92,2,1,...,1,0,0,0,0,0,0,0,1,1
3,33,0,1392,3,4,1,4,56,3,1,...,0,0,0,0,1,0,0,1,0,1
4,27,0,591,2,1,1,1,40,3,1,...,1,0,0,0,0,0,0,1,0,0


#### Create response and prediction set as below

In [239]:
train = data_final.drop(['Is_Attrite'], axis = 1)
test = data_final['Is_Attrite']

In [240]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train, test)
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)
feature_name = train.columns.tolist()

(1102, 46) (1102,) (368, 46) (368,)


##### Lets check the distiribution of the class

#####  Pearson Correlation

In [241]:
def cor_feature(X,Y) :
    all_core = []
    for col in X.columns.tolist() : 
        coef = np.corrcoef(X[col],Y)[0,1]
        all_core.append(coef)
    all_core = [0 if np.isnan(i) else i for i in all_core]
    cor_feature = X.iloc[:,np.argsort(np.abs(all_core))[-40:]].columns.tolist()
    cor_support = [True if i in cor_feature else False for i in feature_name]
    
    return(cor_feature, cor_support)

In [242]:
cor_feature, cor_support  = cor_feature(x_train,y_train)
print(str(len(cor_feature)), 'selected features')

40 selected features


#####  Chi-2

In [243]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_Norm = MinMaxScaler().fit_transform(x_train)
chi_selector = SelectKBest(chi2, k = 40)
chi_selector.fit(X_Norm, y_train)

SelectKBest(k=40, score_func=<function chi2 at 0x000000000B07FBF8>)

In [244]:
chi_support = chi_selector.get_support()
chi_feature = x_train.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

40 selected features


##### Recursive Feature Elimination

In [245]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_selector = RFE(estimator = LogisticRegression(), n_features_to_select = 40, step = 4, verbose = 5)
rfe_selector.fit(x_train,y_train)

Fitting estimator with 46 features.
Fitting estimator with 42 features.


RFE(estimator=LogisticRegression(), n_features_to_select=40, step=4, verbose=5)

In [246]:
rfe_feature_support = rfe_selector.get_support()
rfe_selected_feature = x_train.loc[:,rfe_feature_support].columns.tolist()
print(str(len(rfe_selected_feature)), 'Selected features')

40 Selected features


##### LogisticRegression

In [247]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lr_selector = SelectFromModel(LogisticRegression(penalty = "l1", solver='liblinear'),'1.25*median')
lr_selector.fit(x_train,y_train)

SelectFromModel(estimator=LogisticRegression(penalty='l1', solver='liblinear'),
                threshold='1.25*median')

In [248]:
lr_feature_support = lr_selector.get_support()
lr_selected_feature = x_train.loc[:,lr_feature_support].columns.tolist()
print(str(len(lr_selected_feature)), 'Selected features')

19 Selected features


##### RandomForestClassifier

In [249]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

rf_selector = SelectFromModel(RandomForestClassifier(n_estimators = 100),'1.25*median')
rf_selector.fit(x_train,y_train)

SelectFromModel(estimator=RandomForestClassifier(), threshold='1.25*median')

In [250]:
rf_feature_support = rf_selector.get_support()
rf_selected_feature = x_train.loc[:,rf_feature_support].columns.tolist()
print(str(len(rf_selected_feature)), 'Selected features')

22 Selected features


##### LGBMClassifier

In [251]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

#lgb_model = LGBMClassifier(n_estimator = 500, learning_rate = 0.05, num_leaves = 32, colsample_btree = 0.2,
#                         reg_alpha = 3, reg_lambda = 1, min_split_gain=0.01, min_child_weight=40)

lgb_model = LGBMClassifier(n_estimator = 500, learning_rate = 0.05, num_leaves = 32)
lgb_selector = SelectFromModel(lgb_model,'1.25*median')
lgb_selector.fit(x_train,y_train)

SelectFromModel(estimator=LGBMClassifier(learning_rate=0.05, n_estimator=500,
                                         num_leaves=32),
                threshold='1.25*median')

In [252]:
lg_feature_support = lgb_selector.get_support()
lg_selected_feature = x_train.loc[:,lg_feature_support].columns.tolist()
print(str(len(lg_selected_feature)), 'Selected features')

22 Selected features


In [253]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support,
                                     'RFE':rfe_feature_support, 'Logistics':lr_feature_support,
                                     'Random Forest':rf_feature_support, 'LightGBM':lg_feature_support})

feature_selection_df.shape
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,WorkLifeBalance,True,True,True,True,True,True,6
2,StockOptionLevel,True,True,True,True,True,True,6
3,RelationshipSatisfaction,True,True,True,True,True,True,6
4,OverTime_Yes,True,True,True,True,True,True,6
5,JobSatisfaction,True,True,True,True,True,True,6
6,JobLevel,True,True,True,True,True,True,6
7,JobInvolvement,True,True,True,True,True,True,6
8,EnvironmentSatisfaction,True,True,True,True,True,True,6
9,YearsWithCurrManager,True,True,True,False,True,True,5
10,YearsSinceLastPromotion,True,True,True,False,True,True,5
