In [1]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading the data into a pandas df
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# Changing the TotalCharges column from string to numerical
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df["TotalCharges"].head()

0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
Name: TotalCharges, dtype: float64

In [7]:
df["TotalCharges"].isnull().sum()

11

In [8]:
df.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
# Fill Null values in the TotalCharges column with zero 
df["TotalCharges"].fillna(0, inplace = True)
df["TotalCharges"].isnull().sum()

0

In [10]:
df["Churn"].unique()

array(['No', 'Yes'], dtype=object)

In [11]:
# Replacing "Yes" and "No" in the Churn columnn with 1 and 0 respectively
df = df.replace(to_replace=["No","Yes"], value=[0,1])
df["Churn"].unique()

array([0, 1], dtype=int64)

In [12]:
df.drop("customerID", axis= 1, inplace=True)

In [13]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,Male,0,0,0,34,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,Male,0,0,0,2,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,0,0,2,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


In [14]:
# Separating features and labels
X = df.drop("Churn", axis=1)
y = df["Churn"]
X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85
1,Male,0,0,0,34,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5
2,Male,0,0,0,2,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15
3,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75
4,Female,0,0,0,2,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65


In [15]:
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [16]:
X.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [17]:
# Grouping the categorical and numerical variables
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [18]:
X_num = X[numerical]
X_num.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5
2,2,53.85,108.15
3,45,42.3,1840.75
4,2,70.7,151.65


In [19]:
X_cat = X[categorical]
X_cat.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,0,1,0,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check
1,Male,0,0,0,1,0,DSL,1,0,1,0,0,0,One year,0,Mailed check
2,Male,0,0,0,1,0,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check
3,Male,0,0,0,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic)
4,Female,0,0,0,1,0,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check


### OneHot Encoding for the categorical features

In [20]:
X_cat = pd.get_dummies(X_cat, columns= categorical)

In [21]:
X_cat.head()

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,Dependents_1,PhoneService_0,PhoneService_1,...,StreamingMovies_No internet service,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,True,False,True,False,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False
1,False,True,True,False,True,False,True,False,False,True,...,False,False,True,False,True,False,False,False,False,True
2,False,True,True,False,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,False,True
3,False,True,True,False,True,False,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,True,False,True,False,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False


In [22]:
#put the scaled sets into a dataframe
df_cat = pd.DataFrame(X_cat, columns = X_cat.columns)

### Standard scaling for the numerical features

In [23]:
# Initializing the standard scaler
scaler = StandardScaler()
scaler.fit(X_num)
scaled_X_num = scaler.transform(X_num)

In [24]:
#put the scaled sets into a dataframe
df_num = pd.DataFrame(scaled_X_num, columns = X_num.columns)

In [25]:
df_num.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.36266,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874


In [26]:
df1 = pd.concat([df_num, df_cat], axis=1)
df1.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,...,StreamingMovies_No internet service,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,True,False,True,False,False,True,True,...,False,True,False,False,False,True,False,False,True,False
1,0.066327,-0.259629,-0.172165,False,True,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,True
2,-1.236724,-0.36266,-0.958066,False,True,True,False,True,False,True,...,False,True,False,False,False,True,False,False,False,True
3,0.514251,-0.746535,-0.193672,False,True,True,False,True,False,True,...,False,False,True,False,True,False,True,False,False,False
4,-1.236724,0.197365,-0.938874,True,False,True,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False


In [27]:
df1 = df1.replace(to_replace=[False,True], value=[0,1])
df1.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,...,StreamingMovies_No internet service,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0.066327,-0.259629,-0.172165,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,-1.236724,-0.36266,-0.958066,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0.514251,-0.746535,-0.193672,0,1,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,-1.236724,0.197365,-0.938874,1,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


### Splitting the dataset

In [28]:
#Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df1, y, test_size=0.2, random_state=1)
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (5634, 46)
y_train shape: (5634,)
X_test shape: (1409, 46)
y_test shape: (1409,)


In [29]:
#transform train and test set using standard scaler
#ignore scaling for y since sklarn preprocessing ignores the y variable and it is also an object type.

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
#put the scaled sets into a daataframe

X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

### Building the models

#### Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 1)

#fit on train set
rf.fit(X_train_scaled, y_train)

In [32]:
#make predictions on test set
rf_pred = rf.predict(X_test_scaled)

In [33]:
# Evaluating performance on the Random Forest Classifier
#model accuracy
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, rf_pred)
print('Accuracy: {}'.format(round(accuracy*100), 4))

from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, classification_report

#precision
precision = precision_score(y_test, rf_pred)
print('Precision: {}'.format(round(precision*100), 4))  

#recall
recall = recall_score(y_test, rf_pred)
print('Recall: {}'.format(round(recall*100), 4))

#F1 score
f1 = f1_score(y_test, rf_pred)
print('F1: {}'.format(round(f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,rf_pred, digits =4))

#confusion matrix
rf_cnf_mat = confusion_matrix(y_test, rf_pred)
print('Confusion Matrix:\n', rf_cnf_mat)

Accuracy: 80
Precision: 60
Recall: 55
F1: 58
Classification Report:
               precision    recall  f1-score   support

           0     0.8570    0.8812    0.8690      1061
           1     0.6038    0.5517    0.5766       348

    accuracy                         0.7999      1409
   macro avg     0.7304    0.7165    0.7228      1409
weighted avg     0.7945    0.7999    0.7967      1409

Confusion Matrix:
 [[935 126]
 [156 192]]


In [34]:
print("Training set score: {:.3f}".format(rf.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(rf.score(X_test_scaled, y_test)))

Training set score: 0.998
Test set score: 0.800


#### Extra Tree Classifier

In [35]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(random_state = 1)

#fit on the train set
etc.fit(X_train_scaled, y_train)

In [36]:
# predicting on the test set
etc_pred = etc.predict(X_test_scaled)

In [37]:
# Evaluating model performance on the Extra Tree Classifier
#model accuracy
etc_accuracy = accuracy_score(y_test, etc_pred)
print('Accuracy: {}'.format(round(etc_accuracy*100), 4))

#precision
etc_precision = precision_score(y_test, etc_pred)
print('Precision: {}'.format(round(etc_precision*100), 4))  

#recall
etc_recall = recall_score(y_test, etc_pred)
print('Recall: {}'.format(round(etc_recall*100), 4))

#F1 score
etc_f1 = f1_score(y_test, etc_pred)
print('F1: {}'.format(round(etc_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,etc_pred, digits =4))

#confusion matrix
etc_cnf_mat = confusion_matrix(y_test, etc_pred)
print('Confusion Matrix:\n', etc_cnf_mat)

Accuracy: 77
Precision: 53
Recall: 49
F1: 51
Classification Report:
               precision    recall  f1-score   support

           0     0.8370    0.8567    0.8468      1061
           1     0.5294    0.4914    0.5097       348

    accuracy                         0.7665      1409
   macro avg     0.6832    0.6741    0.6782      1409
weighted avg     0.7610    0.7665    0.7635      1409

Confusion Matrix:
 [[909 152]
 [177 171]]


In [38]:
print("Training set score: {:.3f}".format(etc.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}".format(etc.score(X_test_scaled, y_test)))

Training set score: 0.998
Test set score: 0.767


#### XGBoost

In [39]:
!pip install xgboost
import xgboost
from xgboost import XGBClassifier

xbc= XGBClassifier(random_state = 1)

#fit on train set
xbc.fit(X_train_scaled, y_train)



In [40]:
# making predictions on the test set
xbc_pred = xbc.predict(X_test_scaled)

#### Measuring performance on xgboost

In [41]:
#model accuracy
xbc_accuracy = accuracy_score(y_test, xbc_pred)
print('Accuracy: {}'.format(round(xbc_accuracy*100), 4))

#precision
xbc_precision = precision_score(y_test, xbc_pred)
print('Precision: {}'.format(round(xbc_precision*100), 4))  

#recall
xbc_recall = recall_score(y_test, xbc_pred)
print('Recall: {}'.format(round(xbc_recall*100), 4))

#F1 score
xbc_f1 = f1_score(y_test, xbc_pred)
print('F1: {}'.format(round(xbc_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,xbc_pred, digits =4))

#confusion matrix
xbc_cnf_mat = confusion_matrix(y_test, xbc_pred)
print('Confusion Matrix:\n', xbc_cnf_mat)

Accuracy: 79
Precision: 59
Recall: 56
F1: 57
Classification Report:
               precision    recall  f1-score   support

           0     0.8571    0.8709    0.8640      1061
           1     0.5861    0.5575    0.5714       348

    accuracy                         0.7935      1409
   macro avg     0.7216    0.7142    0.7177      1409
weighted avg     0.7902    0.7935    0.7917      1409

Confusion Matrix:
 [[924 137]
 [154 194]]


#### LGBM Classifier

In [42]:
!pip install lightgbm
import lightgbm
from lightgbm import LGBMClassifier

lgbm= LGBMClassifier(random_state = 1)

#fit on train set
lgbm.fit(X_train_scaled, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 713
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [43]:
# Making predictions on the test set
lgbm_pred = lgbm.predict(X_test_scaled)

In [44]:
# Evaluating performance for lgbm classifier
#model accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)
print('Accuracy: {}'.format(round(lgbm_accuracy*100), 4))

#precision
lgbm_precision = precision_score(y_test, lgbm_pred)
print('Precision: {}'.format(round(lgbm_precision*100), 4))  

#recall
lgbm_recall = recall_score(y_test, lgbm_pred)
print('Recall: {}'.format(round(lgbm_recall*100), 4))

#F1 score
lgbm_f1 = f1_score(y_test, lgbm_pred)
print('F1: {}'.format(round(lgbm_f1*100), 4))

#classification report
print('Classification Report:\n', classification_report(y_test,lgbm_pred, digits =4))

#confusion matrix
lgbm_cnf_mat = confusion_matrix(y_test, lgbm_pred)
print('Confusion Matrix:\n', lgbm_cnf_mat)

Accuracy: 80
Precision: 61
Recall: 59
F1: 60
Classification Report:
               precision    recall  f1-score   support

           0     0.8657    0.8746    0.8701      1061
           1     0.6053    0.5862    0.5956       348

    accuracy                         0.8034      1409
   macro avg     0.7355    0.7304    0.7329      1409
weighted avg     0.8014    0.8034    0.8023      1409

Confusion Matrix:
 [[928 133]
 [144 204]]


#### Improving the Extra Trees Classifier

In [45]:
#combination of hyperparameters
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [46]:
from sklearn.model_selection import RandomizedSearchCV

#etc = ExtraTreesClassifier(random_state = 1)

#setting up randomsearch with 5folds

randomcv = RandomizedSearchCV(estimator = etc, 
                              param_distributions = hyperparameter_grid, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1,
                              random_state = 1)

In [47]:
#fit on the training data
search = randomcv.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\PROSPERITY\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PROSPERITY\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\PROSPERITY\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\PROSPERITY\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    ra

In [48]:
#get best parameters
search.best_params_

{'n_estimators': 50,
 'min_samples_split': 7,
 'min_samples_leaf': 6,
 'max_features': 'log2'}

In [49]:
#Evaluate ExtraTreesClassifier on test set using  best params
etc2 = ExtraTreesClassifier(max_features = "log2", 
                            min_samples_leaf= 6,
                            min_samples_split= 7,
                            n_estimators= 50, 
                            random_state = 1)

#fit on train set
etc2.fit(X_train_scaled, y_train)

In [50]:
#check feature importances
importance = etc2.feature_importances_

In [51]:
#print feature importances
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

Feature: 0, Score: 0.07567
Feature: 1, Score: 0.02113
Feature: 2, Score: 0.05439
Feature: 3, Score: 0.01080
Feature: 4, Score: 0.01136
Feature: 5, Score: 0.00770
Feature: 6, Score: 0.00722
Feature: 7, Score: 0.00885
Feature: 8, Score: 0.01046
Feature: 9, Score: 0.00550
Feature: 10, Score: 0.00836
Feature: 11, Score: 0.00365
Feature: 12, Score: 0.00453
Feature: 13, Score: 0.00907
Feature: 14, Score: 0.00885
Feature: 15, Score: 0.00298
Feature: 16, Score: 0.01419
Feature: 17, Score: 0.03258
Feature: 18, Score: 0.07031
Feature: 19, Score: 0.05246
Feature: 20, Score: 0.01732
Feature: 21, Score: 0.00362
Feature: 22, Score: 0.02395
Feature: 23, Score: 0.01709
Feature: 24, Score: 0.01034
Feature: 25, Score: 0.02315
Feature: 26, Score: 0.00686
Feature: 27, Score: 0.00935
Feature: 28, Score: 0.04682
Feature: 29, Score: 0.01559
Feature: 30, Score: 0.00806
Feature: 31, Score: 0.00949
Feature: 32, Score: 0.00881
Feature: 33, Score: 0.00440
Feature: 34, Score: 0.00971
Feature: 35, Score: 0.01037
Fe

In [52]:
X_train_scaled.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_0,Partner_1,Dependents_0,...,StreamingMovies_No internet service,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_0,PaperlessBilling_1,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.825884,-1.49753,-0.890947,-0.992573,0.992573,0.439475,-0.439475,-1.041692,1.041692,-1.543293,...,1.901118,-1.107645,-0.516369,1.786607,1.198248,-1.198248,-0.525456,-0.524906,-0.711347,1.82133
1,0.395961,0.302996,0.389693,1.007483,-1.007483,0.439475,-0.439475,0.959977,-0.959977,0.647965,...,-0.526006,-1.107645,1.936601,-0.55972,1.198248,-1.198248,-0.525456,1.905103,-0.711347,-0.549049
2,1.577078,0.01232,1.060945,-0.992573,0.992573,0.439475,-0.439475,-1.041692,1.041692,0.647965,...,-0.526006,-1.107645,-0.516369,1.786607,1.198248,-1.198248,1.903108,-0.524906,-0.711347,-0.549049
3,1.577078,0.686687,1.775397,-0.992573,0.992573,0.439475,-0.439475,-1.041692,1.041692,-1.543293,...,-0.526006,-1.107645,1.936601,-0.55972,1.198248,-1.198248,-0.525456,-0.524906,1.405784,-0.549049
4,-0.092777,0.186726,-0.102671,-0.992573,0.992573,0.439475,-0.439475,0.959977,-0.959977,0.647965,...,-0.526006,-1.107645,1.936601,-0.55972,1.198248,-1.198248,-0.525456,-0.524906,1.405784,-0.549049


In [53]:
#get best score
search.best_score_

0.7928635258258064

In [54]:
#predict on test set
etc2_pred = etc2.predict(X_test_scaled)

In [55]:
#classification report
print('Classification Report:\n', classification_report(y_test,etc2_pred, digits =4))

Classification Report:
               precision    recall  f1-score   support

           0     0.8544    0.8963    0.8749      1061
           1     0.6284    0.5345    0.5776       348

    accuracy                         0.8070      1409
   macro avg     0.7414    0.7154    0.7263      1409
weighted avg     0.7986    0.8070    0.8015      1409

