In [1]:
!pip3 install imbalanced-learn

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve, auc

In [32]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [7]:
df = pd.read_csv('telecom_churn_preprocess_data.csv')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,streamingmovies,paperlessbilling,monthlycharges,totalcharges,churn,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,0,0,1,0,1,0,1,0,0,...,0,1,29.85,29,0,0,0,0,1,0
1,1,1,0,0,0,34,1,0,0,2,...,0,0,56.95,1889,0,1,0,0,0,1
2,2,1,0,0,0,2,1,0,0,2,...,0,1,53.85,108,1,0,0,0,0,1
3,3,1,0,0,0,45,0,1,0,2,...,0,0,42.3,1840,0,1,0,0,0,0
4,4,0,0,0,0,2,1,0,1,0,...,0,1,70.7,151,1,0,0,0,1,0


In [9]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
x = df.drop("churn", axis = 1)
y = df['churn']

In [35]:
#We are deleting totalcharges due to high correlation with tenure column.
x.drop('totalcharges', axis=1, inplace=True)

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state =100 ,stratify=y, test_size = 0.2)
print(y_train.value_counts())

0    4131
1    1485
Name: churn, dtype: int64


In [40]:
scale = MinMaxScaler()
col = x_train.columns

x_train[col] = scale.fit_transform(x_train[col])
x_test[col] = scale.transform(x_test[col])

In [41]:
#Fit basic model on imbalanced dataset
DT = DecisionTreeClassifier(random_state=0)
cross_val_DT = cross_val_score(DT, x_train, y_train , scoring = 'accuracy' ) 
cross_val_DT_f1 = cross_val_score(DT, x_train, y_train , scoring = 'f1') 

RF = RandomForestClassifier(random_state=0)
cross_val_RF = cross_val_score(RF, x_train, y_train , scoring = 'accuracy' ) 
cross_val_RF_f1 = cross_val_score(RF, x_train, y_train , scoring = 'f1') 

ET = ExtraTreesClassifier(random_state=0)
cross_val_ET = cross_val_score(ET, x_train, y_train , scoring = 'accuracy' ) 
cross_val_ET_f1 = cross_val_score(ET, x_train, y_train , scoring = 'f1') 

MLP = MLPClassifier(random_state=0)
cross_val_MLP = cross_val_score(MLP, x_train, y_train , scoring = 'accuracy' ) 
cross_val_MLP_f1 = cross_val_score(MLP, x_train, y_train , scoring = 'f1') 

GB = GradientBoostingClassifier(random_state=0)
cross_val_GB = cross_val_score(GB, x_train, y_train , scoring = 'accuracy' ) 
cross_val_GB_f1 = cross_val_score(GB, x_train, y_train , scoring = 'f1') 

LR = LogisticRegression(random_state=0)
cross_val_LR = cross_val_score(LR, x_train, y_train , scoring = 'accuracy' ) 
cross_val_LR_f1 = cross_val_score(LR, x_train, y_train , scoring = 'f1') 

KNN = KNeighborsClassifier()
cross_val_KNN = cross_val_score(KNN, x_train, y_train , scoring = 'accuracy' ) 
cross_val_KNN_f1 = cross_val_score(KNN, x_train, y_train , scoring = 'f1') 

In [42]:
compare_models = [('Decision Tree', cross_val_DT.mean(),cross_val_DT_f1.mean()),
                  ('Random Forest', cross_val_RF.mean(),cross_val_RF_f1.mean()),
                  ('Neural Network', cross_val_MLP.mean(),cross_val_MLP_f1.mean()),
                  ('Extra Tree', cross_val_ET.mean(),cross_val_ET_f1.mean()),
                  ('GradientBoosting', cross_val_GB.mean(),cross_val_GB_f1.mean()),
                  ('Logistic Regression', cross_val_LR.mean(),cross_val_LR_f1.mean()),
                  ('KNN', cross_val_KNN.mean(), cross_val_KNN_f1.mean())]

In [43]:
predict = pd.DataFrame(data = compare_models, 
                       columns=['Model','Cross_validation_mean(accuracy)', 'Cross_validation_mean(f1_score)'])
predict.style.background_gradient(cmap='YlGn')

Unnamed: 0,Model,Cross_validation_mean(accuracy),Cross_validation_mean(f1_score)
0,Decision Tree,0.717949,0.477875
1,Random Forest,0.780092,0.518339
2,Neural Network,0.781694,0.554622
3,Extra Tree,0.761575,0.496172
4,GradientBoosting,0.795048,0.557716
5,Logistic Regression,0.79754,0.575564
6,KNN,0.758548,0.52801


In [16]:
#Random OverSample
x_train_Sample = x_train.copy()
y_train_Sample = y_train.copy()

randOverSample = RandomOverSampler(random_state=100,)
x_train_OverSample, y_train_OverSample = randOverSample.fit_resample(x_train_Sample, y_train_Sample.ravel())

In [17]:
#Fit basic model on OverSample dataset
DT = DecisionTreeClassifier(random_state=0)
cross_val_DT = cross_val_score(DT, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_DT_f1 = cross_val_score(DT, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

RF = RandomForestClassifier(random_state=0)
cross_val_RF = cross_val_score(RF, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_RF_f1 = cross_val_score(RF, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

ET = ExtraTreesClassifier(random_state=0)
cross_val_ET = cross_val_score(ET, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_ET_f1 = cross_val_score(ET, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

MLP = MLPClassifier(random_state=0)
cross_val_MLP = cross_val_score(MLP, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_MLP_f1 = cross_val_score(MLP, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

GB = GradientBoostingClassifier(random_state=0)
cross_val_GB = cross_val_score(GB, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_GB_f1 = cross_val_score(GB, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

LR = LogisticRegression(random_state=0)
cross_val_LR = cross_val_score(LR, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_LR_f1 = cross_val_score(LR, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

KNN = KNeighborsClassifier()
cross_val_KNN = cross_val_score(KNN, x_train_OverSample, y_train_OverSample , scoring = 'accuracy' ) 
cross_val_KNN_f1 = cross_val_score(KNN, x_train_OverSample, y_train_OverSample , scoring = 'f1') 

In [18]:
compare_models = [('Decision Tree', cross_val_DT.mean(),cross_val_DT_f1.mean()),
                  ('Random Forest', cross_val_RF.mean(),cross_val_RF_f1.mean()),
                  ('Neural Network', cross_val_MLP.mean(),cross_val_MLP_f1.mean()),
                  ('Extra Tree', cross_val_ET.mean(),cross_val_ET_f1.mean()),
                  ('GradientBoosting', cross_val_GB.mean(),cross_val_GB_f1.mean()),
                  ('Logistic Regression', cross_val_LR.mean(),cross_val_LR_f1.mean()),
                  ('KNN', cross_val_KNN.mean(), cross_val_KNN_f1.mean())]

In [19]:
predict = pd.DataFrame(data = compare_models, 
                       columns=['Model','Cross_validation_mean(accuracy)', 'Cross_validation_mean(f1_score)'])
predict.style.background_gradient(cmap='YlGn')

Unnamed: 0,Model,Cross_validation_mean(accuracy),Cross_validation_mean(f1_score)
0,Decision Tree,0.87437,0.883954
1,Random Forest,0.901723,0.907736
2,Neural Network,0.704556,0.661856
3,Extra Tree,0.911286,0.915073
4,GradientBoosting,0.782983,0.791433
5,Logistic Regression,0.757201,0.763442
6,KNN,0.748489,0.768459


In [21]:
#SMOTE
x_train_sample = x_train.copy()
y_train_sample = y_train.copy()

sm = SMOTE(random_state=100)
x_train_smote, y_train_smote = sm.fit_resample(x_train_sample, y_train_sample.ravel())

In [22]:
#Fit basic model on OverSample dataset
DT = DecisionTreeClassifier(random_state=0)
cross_val_DT = cross_val_score(DT, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_DT_f1 = cross_val_score(DT, x_train_smote, y_train_smote , scoring = 'f1') 

RF = RandomForestClassifier(random_state=0)
cross_val_RF = cross_val_score(RF, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_RF_f1 = cross_val_score(RF, x_train_smote, y_train_smote , scoring = 'f1') 

ET = ExtraTreesClassifier(random_state=0)
cross_val_ET = cross_val_score(ET, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_ET_f1 = cross_val_score(ET, x_train_smote, y_train_smote , scoring = 'f1') 

MLP = MLPClassifier(random_state=0)
cross_val_MLP = cross_val_score(MLP, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_MLP_f1 = cross_val_score(MLP, x_train_smote, y_train_smote , scoring = 'f1') 

GB = GradientBoostingClassifier(random_state=0)
cross_val_GB = cross_val_score(GB, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_GB_f1 = cross_val_score(GB, x_train_smote, y_train_smote , scoring = 'f1') 

LR = LogisticRegression(random_state=0)
cross_val_LR = cross_val_score(LR, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_LR_f1 = cross_val_score(LR, x_train_smote, y_train_smote , scoring = 'f1') 

KNN = KNeighborsClassifier()
cross_val_KNN = cross_val_score(KNN, x_train_smote, y_train_smote , scoring = 'accuracy' ) 
cross_val_KNN_f1 = cross_val_score(KNN, x_train_smote, y_train_smote , scoring = 'f1') 

In [23]:
compare_models = [('Decision Tree', cross_val_DT.mean(),cross_val_DT_f1.mean()),
                  ('Random Forest', cross_val_RF.mean(),cross_val_RF_f1.mean()),
                  ('Neural Network', cross_val_MLP.mean(),cross_val_MLP_f1.mean()),
                  ('Extra Tree', cross_val_ET.mean(),cross_val_ET_f1.mean()),
                  ('GradientBoosting', cross_val_GB.mean(),cross_val_GB_f1.mean()),
                  ('Logistic Regression', cross_val_LR.mean(),cross_val_LR_f1.mean()),
                  ('KNN', cross_val_KNN.mean(), cross_val_KNN_f1.mean())]

In [24]:
predict = pd.DataFrame(data = compare_models, 
                       columns=['Model','Cross_validation_mean(accuracy)', 'Cross_validation_mean(f1_score)'])
predict.style.background_gradient(cmap='YlGn')

Unnamed: 0,Model,Cross_validation_mean(accuracy),Cross_validation_mean(f1_score)
0,Decision Tree,0.779494,0.76998
1,Random Forest,0.839772,0.828024
2,Neural Network,0.745354,0.691357
3,Extra Tree,0.827308,0.811384
4,GradientBoosting,0.823792,0.816621
5,Logistic Regression,0.793769,0.791159
6,KNN,0.772216,0.787037


In [27]:
#let's try out Random OverSampling technique on a RandomForest model.
RF = RandomForestClassifier(random_state=0)
RF.fit(x_train_OverSample,y_train_OverSample)
y_pred = RF.predict(x_test)
print ("Accuracy score: ",accuracy_score(y_pred,y_test))
print ("------------------------------")
print ("F1_score: ",f1_score(y_pred,y_test))

Accuracy score:  0.797864768683274
------------------------------
F1_score:  0.6151761517615176


In [29]:
#let's try out Random OverSampling technique on a Extra tree classifier.
ET = ExtraTreesClassifier(random_state=0)
ET.fit(x_train_OverSample,y_train_OverSample)
y_pred = ET.predict(x_test)
print ("Accuracy score: ",accuracy_score(y_pred,y_test))
print ("------------------------------")
print ("F1_score: ",f1_score(y_pred,y_test))

Accuracy score:  0.7843416370106762
------------------------------
F1_score:  0.5302325581395348


In [45]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
rf = RandomForestClassifier(random_state=0)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)