# Libraries

In [8]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV, f_classif, chi2
from boruta import BorutaPy
from collections import Counter

# Feature Selection
For feature selection, it will be used a majority voting method applying 3 selection proposals:
 - Statistical test
 - RFECV
 - Boruta

Features maintained by at least 2 algorithms will be used in the modeling process.

In [9]:
df_telco = pd.read_csv('df_telco_clean.csv')
df_telco.drop(['customerID'], axis = 1, inplace = True)
df_telco.head(5)

Unnamed: 0,Churn,customer_gender,customer_SeniorCitizen,customer_Partner,customer_Dependents,customer_tenure,phone_PhoneService,phone_MultipleLines,internet_InternetService,internet_OnlineSecurity,...,internet_DeviceProtection,internet_TechSupport,internet_StreamingTV,internet_StreamingMovies,account_Contract,account_PaperlessBilling,account_PaymentMethod,account_Charges_Monthly,account_Charges_Total,account_Charges_Daily
0,0,Female,< 65 years,Yes,Yes,9,Yes,No,DSL,No,...,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3,2.19
1,0,Male,< 65 years,No,No,9,Yes,Yes,DSL,No,...,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4,2.0
2,1,Male,< 65 years,No,No,4,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85,2.46
3,1,Male,>= 65 years,Yes,No,13,Yes,No,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85,3.27
4,1,Female,>= 65 years,Yes,No,3,Yes,No,Fiber optic,No,...,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4,2.8


In [10]:
for column in df_telco:
    print( column, df_telco[column].unique())

Churn [0 1]
customer_gender ['Female' 'Male']
customer_SeniorCitizen ['< 65 years' '>= 65 years']
customer_Partner ['Yes' 'No']
customer_Dependents ['Yes' 'No']
customer_tenure [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
phone_PhoneService ['Yes' 'No']
phone_MultipleLines ['No' 'Yes' 'No phone service']
internet_InternetService ['DSL' 'Fiber optic' 'No']
internet_OnlineSecurity ['No' 'Yes' 'No internet service']
internet_OnlineBackup ['Yes' 'No' 'No internet service']
internet_DeviceProtection ['No' 'Yes' 'No internet service']
internet_TechSupport ['Yes' 'No' 'No internet service']
internet_StreamingTV ['Yes' 'No' 'No internet service']
internet_StreamingMovies ['No' 'Yes' 'No internet service']
account_Contract ['One year' 'Month-to-month' 'Two year']
account_PaperlessBilling ['Yes' 'No']
account_PaymentMethod 

In [11]:
replace_dict = {'No':0, 'Yes':1,
                'Male':0,'Female':1, 
                '< 65 years' : 0, '>= 65 years' : 1,
                'No phone service': 0,
                'No internet service': 0}

In [12]:
df_telco = df_telco.replace(replace_dict)
df_telco.head()

Unnamed: 0,Churn,customer_gender,customer_SeniorCitizen,customer_Partner,customer_Dependents,customer_tenure,phone_PhoneService,phone_MultipleLines,internet_InternetService,internet_OnlineSecurity,...,internet_DeviceProtection,internet_TechSupport,internet_StreamingTV,internet_StreamingMovies,account_Contract,account_PaperlessBilling,account_PaymentMethod,account_Charges_Monthly,account_Charges_Total,account_Charges_Daily
0,0,1,0,1,1,9,1,0,DSL,0,...,0,1,1,0,One year,1,Mailed check,65.6,593.3,2.19
1,0,0,0,0,0,9,1,1,DSL,0,...,0,0,0,1,Month-to-month,0,Mailed check,59.9,542.4,2.0
2,1,0,0,0,0,4,1,0,Fiber optic,0,...,1,0,0,0,Month-to-month,1,Electronic check,73.9,280.85,2.46
3,1,0,1,1,0,13,1,0,Fiber optic,0,...,1,0,1,1,Month-to-month,1,Electronic check,98.0,1237.85,3.27
4,1,1,1,1,0,3,1,0,Fiber optic,0,...,0,1,1,0,Month-to-month,1,Mailed check,83.9,267.4,2.8


In [13]:
# Getting dummies
df_telco_dummies = df_telco.copy()
df_telco_dummies = pd.get_dummies(df_telco_dummies[['internet_InternetService', 'account_Contract', 'account_PaymentMethod']])

df_telco.drop(['internet_InternetService', 'account_Contract', 'account_PaymentMethod'], axis = 1, inplace = True)

df_telco_dummies = pd.concat([df_telco_dummies, df_telco], axis = 1)

In [14]:
df_telco_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 28 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   internet_InternetService_0                       7032 non-null   uint8  
 1   internet_InternetService_DSL                     7032 non-null   uint8  
 2   internet_InternetService_Fiber optic             7032 non-null   uint8  
 3   account_Contract_Month-to-month                  7032 non-null   uint8  
 4   account_Contract_One year                        7032 non-null   uint8  
 5   account_Contract_Two year                        7032 non-null   uint8  
 6   account_PaymentMethod_Bank transfer (automatic)  7032 non-null   uint8  
 7   account_PaymentMethod_Credit card (automatic)    7032 non-null   uint8  
 8   account_PaymentMethod_Electronic check           7032 non-null   uint8  
 9   account_PaymentMethod_Mailed c

In [118]:
for column in df_telco_dummies:
    print( column, df_telco_dummies[column].unique())

internet_InternetService_0 [0 1]
internet_InternetService_DSL [1 0]
internet_InternetService_Fiber optic [0 1]
account_Contract_Month-to-month [0 1]
account_Contract_One year [1 0]
account_Contract_Two year [0 1]
account_PaymentMethod_Bank transfer (automatic) [0 1]
account_PaymentMethod_Credit card (automatic) [0 1]
account_PaymentMethod_Electronic check [0 1]
account_PaymentMethod_Mailed check [1 0]
Churn [0 1]
customer_gender [1 0]
customer_SeniorCitizen [0 1]
customer_Partner [1 0]
customer_Dependents [1 0]
customer_tenure [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
phone_PhoneService [1 0]
phone_MultipleLines [0 1]
internet_OnlineSecurity [0 1]
internet_OnlineBackup [1 0]
internet_DeviceProtection [0 1]
internet_TechSupport [1 0]
internet_StreamingTV [1 0]
internet_StreamingMovies [0 1]
account_PaperlessBill

In [28]:
# new feature
df_telco_dummies['account_Charges_Monthly_Average'] = df_telco_dummies['account_Charges_Total']/df_telco_dummies['customer_tenure']
df_telco_dummies.drop(['account_Charges_Total', 'account_Charges_Daily'], axis = 1, inplace = True)
df_telco_dummies

Unnamed: 0,internet_InternetService_0,internet_InternetService_DSL,internet_InternetService_Fiber optic,account_Contract_Month-to-month,account_Contract_One year,account_Contract_Two year,account_PaymentMethod_Bank transfer (automatic),account_PaymentMethod_Credit card (automatic),account_PaymentMethod_Electronic check,account_PaymentMethod_Mailed check,...,phone_MultipleLines,internet_OnlineSecurity,internet_OnlineBackup,internet_DeviceProtection,internet_TechSupport,internet_StreamingTV,internet_StreamingMovies,account_PaperlessBilling,account_Charges_Monthly,account_Charges_Monthly_Average
0,0,1,0,0,1,0,0,0,0,1,...,0,0,1,0,1,1,0,1,65.60,65.922222
1,0,1,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,59.90,60.266667
2,0,0,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,73.90,70.212500
3,0,0,1,1,0,0,0,0,1,0,...,0,0,1,1,0,1,1,1,98.00,95.219231
4,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,1,1,0,1,83.90,89.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,1,0,0,1,0,0,0,0,1,...,0,1,0,0,1,0,0,0,55.15,57.146154
7028,0,0,1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,1,1,85.10,85.168182
7029,0,1,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,50.30,46.375000
7030,0,1,0,0,0,1,0,0,0,1,...,0,1,0,1,1,0,1,0,67.85,69.069403


In [29]:
df_telco_dummies.columns

Index(['internet_InternetService_0', 'internet_InternetService_DSL',
       'internet_InternetService_Fiber optic',
       'account_Contract_Month-to-month', 'account_Contract_One year',
       'account_Contract_Two year',
       'account_PaymentMethod_Bank transfer (automatic)',
       'account_PaymentMethod_Credit card (automatic)',
       'account_PaymentMethod_Electronic check',
       'account_PaymentMethod_Mailed check', 'Churn', 'customer_gender',
       'customer_SeniorCitizen', 'customer_Partner', 'customer_Dependents',
       'customer_tenure', 'phone_PhoneService', 'phone_MultipleLines',
       'internet_OnlineSecurity', 'internet_OnlineBackup',
       'internet_DeviceProtection', 'internet_TechSupport',
       'internet_StreamingTV', 'internet_StreamingMovies',
       'account_PaperlessBilling', 'account_Charges_Monthly',
       'account_Charges_Monthly_Average'],
      dtype='object')

In [30]:
# Split target and features
X = df_telco_dummies.drop('Churn',axis=1)
y = df_telco_dummies['Churn']

In [31]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

In [41]:
# Categorical features and numerical features
list_cat = ['internet_InternetService_0', 'internet_InternetService_DSL',
       'internet_InternetService_Fiber optic',
       'account_Contract_Month-to-month', 'account_Contract_One year',
       'account_Contract_Two year',
       'account_PaymentMethod_Bank transfer (automatic)',
       'account_PaymentMethod_Credit card (automatic)',
       'account_PaymentMethod_Electronic check',
       'account_PaymentMethod_Mailed check', 'customer_gender',
       'customer_SeniorCitizen', 'customer_Partner', 'customer_Dependents',
        'phone_PhoneService', 'phone_MultipleLines',
       'internet_OnlineSecurity', 'internet_OnlineBackup',
       'internet_DeviceProtection', 'internet_TechSupport',
       'internet_StreamingTV', 'internet_StreamingMovies',
       'account_PaperlessBilling']
list_num = ['account_Charges_Monthly',
       'account_Charges_Monthly_Average', 'customer_tenure']

In [44]:
#Spliting categorical and numerical features
X_train_num = X_train[list_num]
X_train_cat = X_train[list_cat]

## Statistical test 

### ANOVA

In [75]:
selected_anova = f_classif(X_train_num, y_train)
selected_anova

(array([193.20019175, 190.21318072, 747.28472418]),
 array([4.12033707e-043, 1.74763451e-042, 2.67223519e-153]))

In [76]:
# p values
p_values_num = pd.Series(selected_anova[1])

p_values_num.index = X_train_num.columns 
p_values_num.sort_values(ascending=True, inplace=True)

p_values_num

customer_tenure                    2.672235e-153
account_Charges_Monthly             4.120337e-43
account_Charges_Monthly_Average     1.747635e-42
dtype: float64

In [77]:
# features p_values_num<0.05
p_values_num = p_values_num[p_values_num<0.05]

p_values_num.index

Index(['customer_tenure', 'account_Charges_Monthly',
       'account_Charges_Monthly_Average'],
      dtype='object')

In [78]:
X_train_num_anova = X_train_num.loc[:,p_values_num.index]
X_train_num_anova.head(5)

Unnamed: 0,customer_tenure,account_Charges_Monthly,account_Charges_Monthly_Average
463,53,105.55,107.212264
5817,1,20.25,20.25
1432,1,21.1,21.1
2885,71,106.8,107.369014
3906,46,24.9,25.53913


### Chi2

In [79]:
# Chi2 for categorical features
selected_chi2 = chi2(X_train_cat, y_train)
selected_chi2

(array([1.95144257e+02, 5.41087820e+01, 2.70749006e+02, 3.82928267e+02,
        1.36416157e+02, 3.46822165e+02, 5.86336900e+01, 6.74617013e+01,
        2.91438037e+02, 2.39516980e+01, 9.27203718e-02, 1.17697820e+02,
        6.46853714e+01, 8.88003077e+01, 2.30345524e-01, 6.92255279e+00,
        1.12358374e+02, 2.05336382e+01, 1.35877355e+01, 9.58685151e+01,
        8.46385460e+00, 8.47311603e+00, 7.31707778e+01]),
 array([2.39622465e-44, 1.89691561e-13, 7.79233155e-61, 2.86706696e-85,
        1.61803488e-31, 2.08529264e-77, 1.89936131e-14, 2.14819759e-16,
        2.41670526e-65, 9.87831293e-07, 7.60746840e-01, 2.01922309e-27,
        8.78647669e-16, 4.36747997e-21, 6.31267746e-01, 8.51153866e-03,
        2.98224702e-26, 5.85924731e-06, 2.27668112e-04, 1.22772435e-22,
        3.62273308e-03, 3.60433480e-03, 1.18903234e-17]))

In [80]:
#p values
p_values_cat = pd.Series(selected_chi2[1])

p_values_cat.index = X_train_cat.columns 
p_values_cat.sort_values(ascending=True, inplace=True)

In [81]:
p_values_cat = p_values_cat[p_values_cat<0.05]
p_values_cat

account_Contract_Month-to-month                    2.867067e-85
account_Contract_Two year                          2.085293e-77
account_PaymentMethod_Electronic check             2.416705e-65
internet_InternetService_Fiber optic               7.792332e-61
internet_InternetService_0                         2.396225e-44
account_Contract_One year                          1.618035e-31
customer_SeniorCitizen                             2.019223e-27
internet_OnlineSecurity                            2.982247e-26
internet_TechSupport                               1.227724e-22
customer_Dependents                                4.367480e-21
account_PaperlessBilling                           1.189032e-17
account_PaymentMethod_Credit card (automatic)      2.148198e-16
customer_Partner                                   8.786477e-16
account_PaymentMethod_Bank transfer (automatic)    1.899361e-14
internet_InternetService_DSL                       1.896916e-13
account_PaymentMethod_Mailed check      

In [82]:
X_train_cat_chi2 = X_train_cat[p_values_cat.index]

In [83]:
#X_train filtered statistic
X_train_st = pd.concat([X_train_num_anova, X_train_cat_chi2], axis=1)

cols_keep_st = list(X_train_st.columns)
cols_keep_st

['customer_tenure',
 'account_Charges_Monthly',
 'account_Charges_Monthly_Average',
 'account_Contract_Month-to-month',
 'account_Contract_Two year',
 'account_PaymentMethod_Electronic check',
 'internet_InternetService_Fiber optic',
 'internet_InternetService_0',
 'account_Contract_One year',
 'customer_SeniorCitizen',
 'internet_OnlineSecurity',
 'internet_TechSupport',
 'customer_Dependents',
 'account_PaperlessBilling',
 'account_PaymentMethod_Credit card (automatic)',
 'customer_Partner',
 'account_PaymentMethod_Bank transfer (automatic)',
 'internet_InternetService_DSL',
 'account_PaymentMethod_Mailed check',
 'internet_OnlineBackup',
 'internet_DeviceProtection',
 'internet_StreamingMovies',
 'internet_StreamingTV',
 'phone_MultipleLines']

## Recursive Feature Elimination

In [84]:
# Target imbalanced
y_train.value_counts(normalize = True)

0    0.733442
1    0.266558
Name: Churn, dtype: float64

In [85]:
forest = RandomForestClassifier(n_jobs = -1, max_depth = 5, n_estimators= 10, random_state=123, class_weight= 'balanced')

In [86]:
# Selection (Random Forest)
rfecv_RFC = RFECV(estimator=forest, scoring='precision')
rfecv_RFC.fit(X_train,y_train)

RFECV(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                       n_estimators=10, n_jobs=-1,
                                       random_state=123),
      scoring='precision')

In [87]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [88]:
print(rfecv_RFC.n_features_) # number of most important features
print(rfecv_RFC.support_) # Booleans vector. True if the variable is to be kept and False otherwise
mask_RFC = rfecv_RFC.support_

rfecv_RFC.ranking_

19
[ True  True  True  True  True  True False False  True False  True  True
 False  True  True False False  True  True  True False  True  True  True
  True  True]


array([1, 1, 1, 1, 1, 1, 7, 3, 1, 4, 1, 1, 8, 1, 1, 6, 5, 1, 1, 1, 2, 1,
       1, 1, 1, 1])

In [89]:
cols_drop_RFE= [not x for x in mask_RFC] # inverting boolean vector because is mask
cols_drop_RFE= X_train.loc[:,cols_drop_RFE].columns.tolist()

cols_drop_RFE

['account_PaymentMethod_Bank transfer (automatic)',
 'account_PaymentMethod_Credit card (automatic)',
 'account_PaymentMethod_Mailed check',
 'customer_Partner',
 'phone_PhoneService',
 'phone_MultipleLines',
 'internet_TechSupport']

In [90]:
cols_keep_RFE= mask_RFC 
cols_keep_RFE= X_train.loc[:,cols_keep_RFE].columns.tolist()
cols_keep_RFE

['internet_InternetService_0',
 'internet_InternetService_DSL',
 'internet_InternetService_Fiber optic',
 'account_Contract_Month-to-month',
 'account_Contract_One year',
 'account_Contract_Two year',
 'account_PaymentMethod_Electronic check',
 'customer_gender',
 'customer_SeniorCitizen',
 'customer_Dependents',
 'customer_tenure',
 'internet_OnlineSecurity',
 'internet_OnlineBackup',
 'internet_DeviceProtection',
 'internet_StreamingTV',
 'internet_StreamingMovies',
 'account_PaperlessBilling',
 'account_Charges_Monthly',
 'account_Charges_Monthly_Average']

In [142]:
X_train_RFECV = X_train.loc[:,cols_keep_RFE]

X_train_RFECV

Unnamed: 0,internet_InternetService_0,internet_InternetService_DSL,internet_InternetService_Fiber optic,account_Contract_Month-to-month,account_Contract_One year,account_Contract_Two year,account_PaymentMethod_Electronic check,customer_gender,customer_SeniorCitizen,customer_Dependents,customer_tenure,internet_OnlineSecurity,internet_OnlineBackup,internet_DeviceProtection,internet_StreamingTV,internet_StreamingMovies,account_PaperlessBilling,account_Charges_Monthly,account_Charges_Monthly_Average
463,0,0,1,0,0,1,1,1,0,0,53,0,1,1,1,1,0,105.55,107.212264
5817,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,20.25,20.250000
1432,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,21.10,21.100000
2885,0,0,1,0,0,1,1,0,1,0,71,0,1,0,1,1,1,106.80,107.369014
3906,1,0,0,0,1,0,1,0,0,1,46,0,0,0,0,0,0,24.90,25.539130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5218,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,45.75,45.750000
4060,1,0,0,0,0,1,0,0,0,0,71,0,0,0,0,0,0,25.55,26.733803
1346,0,0,1,1,0,0,1,0,0,0,66,0,1,1,1,1,0,99.50,101.674242
3454,0,0,1,0,0,1,0,0,1,0,72,0,1,1,1,1,0,109.70,109.700694


### Boruta

In [94]:
# fit boruta
boruta_selector = BorutaPy(forest, n_estimators = 50, max_iter=100, random_state = 0)
boruta_selector.fit(np.array(X_train), np.array(y_train))

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=50, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x215339368C8),
         n_estimators=50, random_state=RandomState(MT19937) at 0x215339368C8)

In [95]:
boruta_selector.get_params().keys()

dict_keys(['alpha', 'estimator__bootstrap', 'estimator__ccp_alpha', 'estimator__class_weight', 'estimator__criterion', 'estimator__max_depth', 'estimator__max_features', 'estimator__max_leaf_nodes', 'estimator__max_samples', 'estimator__min_impurity_decrease', 'estimator__min_impurity_split', 'estimator__min_samples_leaf', 'estimator__min_samples_split', 'estimator__min_weight_fraction_leaf', 'estimator__n_estimators', 'estimator__n_jobs', 'estimator__oob_score', 'estimator__random_state', 'estimator__verbose', 'estimator__warm_start', 'estimator', 'max_iter', 'n_estimators', 'perc', 'random_state', 'two_step', 'verbose'])

In [96]:
boruta_selector.support_.tolist()

[True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True]

In [97]:
# features selected by boruta
cols_drop_boruta= [not x for x in boruta_selector.support_.tolist()] # inverting boolean vector
cols_drop_boruta= X_train.loc[:,cols_drop_boruta].columns.tolist()

cols_drop_boruta

['account_PaymentMethod_Bank transfer (automatic)',
 'account_PaymentMethod_Credit card (automatic)',
 'account_PaymentMethod_Mailed check',
 'customer_gender',
 'customer_SeniorCitizen',
 'customer_Partner',
 'customer_Dependents',
 'phone_PhoneService',
 'phone_MultipleLines',
 'internet_OnlineBackup',
 'internet_DeviceProtection',
 'internet_StreamingTV',
 'internet_StreamingMovies']

In [98]:
cols_keep_boruta=  boruta_selector.support_.tolist()
cols_keep_boruta= X_train.loc[:,cols_keep_boruta].columns.tolist()

cols_keep_boruta

['internet_InternetService_0',
 'internet_InternetService_DSL',
 'internet_InternetService_Fiber optic',
 'account_Contract_Month-to-month',
 'account_Contract_One year',
 'account_Contract_Two year',
 'account_PaymentMethod_Electronic check',
 'customer_tenure',
 'internet_OnlineSecurity',
 'internet_TechSupport',
 'account_PaperlessBilling',
 'account_Charges_Monthly',
 'account_Charges_Monthly_Average']

## Vote for features selected in the feature selection

In [99]:
#Counting votes
counts = Counter(sum(([list(set(i)) for i in (cols_keep_st, cols_keep_RFE, cols_keep_boruta)]), []))
print('Vote',counts)

# Model Dataframe
def df_feature_selected(X, y):
  #df with majority features (2 or more votes)
  majority_votes = [i for i, c in counts.items() if c >= 2]
  df_model = pd.DataFrame(X.loc[:,majority_votes])
  df_model['Churn'] = y

  return df_model

Vote Counter({'internet_InternetService_DSL': 3, 'internet_InternetService_Fiber optic': 3, 'customer_tenure': 3, 'account_PaperlessBilling': 3, 'account_Contract_Two year': 3, 'internet_OnlineSecurity': 3, 'account_PaymentMethod_Electronic check': 3, 'account_Contract_One year': 3, 'account_Contract_Month-to-month': 3, 'internet_InternetService_0': 3, 'account_Charges_Monthly': 3, 'account_Charges_Monthly_Average': 3, 'customer_Dependents': 2, 'internet_StreamingTV': 2, 'customer_SeniorCitizen': 2, 'internet_StreamingMovies': 2, 'internet_DeviceProtection': 2, 'internet_OnlineBackup': 2, 'internet_TechSupport': 2, 'account_PaymentMethod_Credit card (automatic)': 1, 'account_PaymentMethod_Mailed check': 1, 'account_PaymentMethod_Bank transfer (automatic)': 1, 'phone_MultipleLines': 1, 'customer_Partner': 1, 'customer_gender': 1})


In [100]:
df_feature_selected(df_telco_dummies.drop(['Churn'],axis = 1), df_telco_dummies['Churn'] )

Unnamed: 0,customer_Dependents,internet_InternetService_DSL,internet_StreamingTV,internet_InternetService_Fiber optic,customer_tenure,account_PaperlessBilling,customer_SeniorCitizen,account_Contract_Two year,internet_OnlineSecurity,internet_StreamingMovies,internet_DeviceProtection,account_PaymentMethod_Electronic check,account_Contract_One year,account_Contract_Month-to-month,internet_InternetService_0,internet_OnlineBackup,account_Charges_Monthly,account_Charges_Monthly_Average,internet_TechSupport,Churn
0,1,1,1,0,9,1,0,0,0,0,0,0,1,0,0,1,65.60,65.922222,1,0
1,0,1,0,0,9,0,0,0,0,1,0,0,0,1,0,0,59.90,60.266667,0,0
2,0,0,0,1,4,1,0,0,0,0,1,1,0,1,0,0,73.90,70.212500,0,1
3,0,0,1,1,13,1,1,0,0,1,1,1,0,1,0,1,98.00,95.219231,0,1
4,0,0,1,1,3,1,1,0,0,0,0,0,0,1,0,0,83.90,89.133333,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,1,0,0,13,0,0,0,1,0,0,0,1,0,0,0,55.15,57.146154,1,0
7028,0,0,0,1,22,1,0,0,0,1,0,1,0,1,0,0,85.10,85.168182,0,1
7029,0,1,0,0,2,1,0,0,0,0,0,0,0,1,0,1,50.30,46.375000,0,0
7030,1,1,0,0,67,0,0,1,1,1,1,0,0,0,0,0,67.85,69.069403,1,0


In [151]:
df_feature_selected(df_telco_dummies.drop(['Churn'],axis = 1), df_telco_dummies['Churn'] 
                   ).to_csv('df_model.csv', index = False)

In [101]:
df_telco_dummies.to_csv('df_model_1.csv', index = False)