In [1]:
!pip3 install imbalanced-learn

You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
import warnings
# Install a global "ignore" filter with no category or module restriction,
# so every warning raised by later cells is silenced.
# NOTE(review): this also hides deprecation and convergence warnings from the
# models fitted below; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve, auc

In [7]:
# Read the preprocessed telecom churn table from the working directory.
data_path = 'telecom_churn_preprocess_data.csv'
df = pd.read_csv(data_path)

In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,streamingmovies,paperlessbilling,monthlycharges,totalcharges,churn,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,0,0,1,0,1,0,1,0,0,...,0,1,29.85,29,0,0,0,0,1,0
1,1,1,0,0,0,34,1,0,0,2,...,0,0,56.95,1889,0,1,0,0,0,1
2,2,1,0,0,0,2,1,0,0,2,...,0,1,53.85,108,1,0,0,0,0,1
3,3,1,0,0,0,45,0,1,0,2,...,0,0,42.3,1840,0,1,0,0,0,0
4,4,0,0,0,0,2,1,0,1,0,...,0,1,70.7,151,1,0,0,0,1,0


In [9]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Separate the 'churn' target from the remaining feature columns.
y = df['churn']
x = df.drop(columns=['churn'])

In [11]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)
# Show the class counts of the training target.
print(y_train.value_counts())

0    4131
1    1485
Name: churn, dtype: int64


In [12]:
#Fit basic model on imbalanced dataset
# Baseline estimators; every estimator that accepts a seed is seeded so the
# cross-validation results are repeatable.
DT = DecisionTreeClassifier(random_state=0)
RF = RandomForestClassifier(random_state=0)
ET = ExtraTreesClassifier(random_state=0)
MLP = MLPClassifier(random_state=0)
GB = GradientBoostingClassifier(random_state=0)
LR = LogisticRegression(random_state=0)
KNN = KNeighborsClassifier()


def _baseline_cv_scores(estimator):
    """Cross-validate `estimator` on the training split.

    Uses the default cross-validation splitter and scores accuracy and F1
    from the same fitted fold models, so each fold is fitted once instead of
    once per metric.

    Parameters
    ----------
    estimator : scikit-learn classifier
        Unfitted estimator; it is cloned for every fold.

    Returns
    -------
    tuple of (ndarray, ndarray)
        Per-fold accuracy scores and per-fold F1 scores.
    """
    fold_results = cross_validate(estimator, x_train, y_train, scoring=('accuracy', 'f1'))
    return fold_results['test_accuracy'], fold_results['test_f1']


# NOTE(review): MLP, logistic regression and KNN receive the raw feature
# columns here (no scaler is applied in this cell) -- confirm that is intended.
cross_val_DT, cross_val_DT_f1 = _baseline_cv_scores(DT)
cross_val_RF, cross_val_RF_f1 = _baseline_cv_scores(RF)
cross_val_ET, cross_val_ET_f1 = _baseline_cv_scores(ET)
cross_val_MLP, cross_val_MLP_f1 = _baseline_cv_scores(MLP)
cross_val_GB, cross_val_GB_f1 = _baseline_cv_scores(GB)
cross_val_LR, cross_val_LR_f1 = _baseline_cv_scores(LR)
cross_val_KNN, cross_val_KNN_f1 = _baseline_cv_scores(KNN)

In [13]:
# One (label, mean accuracy, mean F1) row per baseline model, in display order.
compare_models = [
    (label, accuracy_folds.mean(), f1_folds.mean())
    for label, accuracy_folds, f1_folds in (
        ('Decision Tree', cross_val_DT, cross_val_DT_f1),
        ('Random Forest', cross_val_RF, cross_val_RF_f1),
        ('Neural Network', cross_val_MLP, cross_val_MLP_f1),
        ('Extra Tree', cross_val_ET, cross_val_ET_f1),
        ('GradientBoosting', cross_val_GB, cross_val_GB_f1),
        ('Logistic Regression', cross_val_LR, cross_val_LR_f1),
        ('KNN', cross_val_KNN, cross_val_KNN_f1),
    )
]

In [14]:
# Tabulate the per-model cross-validation means and shade the numeric cells.
summary_columns = [
    'Model',
    'Cross_validation_mean(accuracy)',
    'Cross_validation_mean(f1_score)',
]
predict = pd.DataFrame(compare_models, columns=summary_columns)
predict.style.background_gradient(cmap='YlGn')

Unnamed: 0,Model,Cross_validation_mean(accuracy),Cross_validation_mean(f1_score)
0,Decision Tree,0.725424,0.488599
1,Random Forest,0.781516,0.525964
2,Neural Network,0.734497,0.505411
3,Extra Tree,0.771012,0.51568
4,GradientBoosting,0.798075,0.564039
5,Logistic Regression,0.795937,0.572056
6,KNN,0.76353,0.493781


In [15]:
# Fit a random forest on the original (imbalanced) training split and report
# its accuracy on the held-out test split.
# random_state is fixed so the printed score is reproducible; it matches the
# seed of the forest used in the cross-validation cell.
RF = RandomForestClassifier(random_state=0)
RF.fit(x_train, y_train)
y_pred = RF.predict(x_test)
print ("Accuracy score: ", accuracy_score(y_test, y_pred))

Accuracy score:  0.8092526690391459


In [16]:
#Random OverSample
# Hand the sampler copies so the original training objects are never passed to it.
x_train_Sample = x_train.copy()
y_train_Sample = y_train.copy()

# Seeded random over-sampler.
randOverSample = RandomOverSampler(random_state=100)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D NumPy array, so the resampled target is still returned as an ndarray.
x_train_OverSample, y_train_OverSample = randOverSample.fit_resample(x_train_Sample, y_train_Sample.to_numpy())