# In this kernel, we focus on two types of ensemble methods, **VOTING** and **STACKING**, and choose the one with the best **AUC score**

**Steps**:


1.   Import Libraries and Dataset
2.   Getting to know the dataset
3.   EDA
4.   Feature Selection (using a statistical test)
5.   Preprocessing
6.   Train/Test split 
7.   Correlation Matrix 
8.   Cross Validation
9.   Modeling  
    9.1. Test Different algorithms and Pick the 5 best ones
    
    9.2. Evaluation Function

    9.3. Hyperparameter Tuning 

    9.4. Trying the voting ensemble method and see the results

    9.5. Trying the stacking ensemble method on different combinations

  

10. Final decision and save the best model 












# **1. Import Libraries and Dataset**


In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas_profiling
from sklearn.preprocessing import StandardScaler
from pandas.plotting import scatter_matrix
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve,KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from mlxtend.classifier import StackingCVClassifier
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Raise the column display limit so wide frames are shown without column truncation.
pd.set_option('display.max_columns', 500)
# Read the Telco customer-churn CSV from the (relative) Kaggle input directory.
data=pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
# Bare last expression: renders the loaded frame as the cell output.
data

# **2. Getting to know the dataset**

In [None]:
# Print the (rows, columns) shape, then the per-column dtypes and non-null counts.
print ("DATA SHAPES : ",data.shape)
data.info()

### Change the type of the feature TotalCharges from object to float

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries into NaN instead of raising.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'],errors='coerce')

### After the conversion, the feature TotalCharges has 11 missing values

In [None]:
# Count the missing values in each column.
data.isnull().sum()

### Listing the unique values of each categorical feature

In [None]:
def uni_col_val(df):
    """Print the distinct values of every object-dtype column of ``df``.

    One line is printed per object-dtype column, in column order, formatted
    as ``<column> : <array of unique values>``. Columns of any other dtype
    are skipped. Nothing is returned.
    """
    # Lazily select the column labels whose dtype compares equal to 'object'.
    object_columns = (name for name in df if df[name].dtype == 'object')
    for name in object_columns:
        distinct_values = df[name].unique()
        print(f'{name} : {distinct_values}')

In [None]:
# Print the distinct values of each object-dtype column of the dataset.
uni_col_val(data)

# **3. EDA**

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by DataFrame.describe().
data.describe()

In [None]:
# Histogram of the target column.
data['Churn'].hist()
# The dataset is not balanced.

In [None]:
# Matrix of pairwise scatter plots for the dataset, drawn with low opacity (alpha=0.2).
scatter_matrix(data,alpha=0.2, figsize=(10, 10))

In [None]:
# Count plots of twelve features, each split by Churn, on a 3 x 4 grid.
# The list order is the panel order: the grid is filled row by row.
countplot_features = [
    "gender", "SeniorCitizen", "Partner", "PhoneService",
    "InternetService", "OnlineSecurity", "Contract", "OnlineBackup",
    "TechSupport", "PaymentMethod", "StreamingTV", "StreamingMovies",
]

fig, axes = plt.subplots(nrows = 3,ncols = 4,figsize = (35,25))
# axes.flat walks the 3 x 4 Axes array in row-major order, matching the list above.
for feature, ax in zip(countplot_features, axes.flat):
    sns.countplot(x=feature, hue="Churn", data=data, ax=ax)

# **4. Feature Selection (using a statistical test)**

In [None]:
# Re-check column dtypes and non-null counts before selecting features.
data.info()

In [None]:
# Names of the categorical input columns considered for feature selection.
categorical_features = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]


### Statistical test (chi-squared test of independence) to determine whether each categorical input feature is relevant to the outcome to be predicted.

### P-value <= 0.05: significant result
### P-value > 0.05: not significant result

In [None]:

# Chi-squared test of independence between each categorical feature and Churn.
p_value_rows = []
for feature in categorical_features:
    # Contingency table: feature levels (rows) x Churn values (columns).
    contingency = pd.crosstab(data[feature], data["Churn"])
    _, p_value, _, _ = stats.chi2_contingency(contingency.values)
    p_value_rows.append([feature, round(p_value, 4)])

# One row per feature: its name and the p-value rounded to 4 decimals.
statistical_significance = pd.DataFrame(p_value_rows, columns=["Attribute", "P-value"])
display(statistical_significance)

### We drop the non-significant features (P-value > 0.05)

In [None]:
# Remove the 'gender' and 'PhoneService' columns.
# The column names are hard-coded here rather than derived from the p-value table.
data=data.drop(['gender','PhoneService'],axis=1)

# **5. Preprocessing**

In [None]:

# Encode the target: 'Yes' -> '1', 'No' -> '0'. The labels stay strings here.
data.Churn=data.Churn.replace({'Yes': '1', 'No': '0'})
print('churn :\n',data.Churn.value_counts())


def _encode_yes(value):
    """Return 1 for 'Yes' (or an already-encoded 1) and 0 for anything else.

    Accepting 1 as well as 'Yes' keeps the encoding stable when this cell is
    run more than once; without it a second run would map every value to 0.
    """
    return 1 if value == 'Yes' or value == 1 else 0


# (column, label printed before its value counts), in output order.
# Every value other than 'Yes' in these columns is encoded as 0.
# PhoneService is not listed because that column was dropped earlier.
yes_no_columns = [
    ('Partner', 'Partner : \n '),
    ('Dependents', 'Dependents :\n'),
    ('PaperlessBilling', 'PaperlessBilling :\n'),
    ('MultipleLines', 'MultipleLines :\n'),
    ('OnlineBackup', 'Online Backup :\n'),
    ('OnlineSecurity', 'Online Security :\n'),
    ('DeviceProtection', 'DeviceProtection :\n'),
    ('TechSupport', 'Tech Support :\n'),
    ('StreamingTV', 'StreamingTV  :\n'),
    ('StreamingMovies', 'StreamingMovies  :\n'),
]

for column, label in yes_no_columns:
    data[column] = data[column].map(_encode_yes)
    print(label,data[column].value_counts())

In [None]:
# Preview the first rows after the binary encoding.
data.head()

In [None]:
# Check the column dtypes after the binary encoding.
data.info()

### Encoding

In [None]:
# One-hot encode the three multi-valued categorical columns into a new frame.
data_dummied = pd.get_dummies(data,columns=['PaymentMethod','Contract','InternetService'])

In [None]:
# Fill the missing TotalCharges values with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test split,
# so test rows contribute to the imputed value — confirm this is acceptable.
data_dummied['TotalCharges']=data_dummied['TotalCharges'].fillna(data_dummied['TotalCharges'].mean())

### Normalization

In [None]:


# Rescale the three numeric columns in place with a MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test rows influence the fitted min/max — confirm this is acceptable.
min_max_scaler = MinMaxScaler()
numeric_columns=['tenure','MonthlyCharges','TotalCharges']
data_dummied[numeric_columns]=min_max_scaler.fit_transform(data_dummied[numeric_columns])

In [None]:
# Check dtypes and non-null counts of the encoded, scaled frame.
data_dummied.info()

In [None]:
# Display the encoded, scaled frame.
data_dummied

In [None]:
# Remove the customerID column so it is not used as a model feature.
data_dummied=data_dummied.drop('customerID',axis=1)

# **6. Train/Test split**

In [None]:
# Y is the target column (Churn); X is every other column.
Y=data_dummied['Churn']
X=data_dummied.drop(['Churn'],axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as the test set. stratify=Y stratifies the split on
# the target, and the fixed random_state makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
         X, Y, test_size=0.10, random_state=5,shuffle=True,stratify=Y)


In [None]:
# Class distribution of the training target.
y_train.hist()

In [None]:
# Class distribution of the test target.
y_test.hist()

# **7. Correlation Matrix**

In [None]:
# Spearman correlation matrix of the training features, drawn as an annotated heat map.
fig,axe = plt.subplots(figsize = (20,20))
sns.heatmap(X_train.corr(method='spearman'),annot=True,ax=axe)

### Changing the target type from str to int

In [None]:
# Churn was encoded as the strings '1'/'0'; cast both target splits to int.
y_test=y_test.astype(str).astype(int)
y_train=y_train.astype(str).astype(int)


# **8. Cross Validation**

In [None]:
# 5-fold stratified cross-validation splitter used by the model comparison and
# every hyperparameter search in this notebook. The fixed random_state makes the
# shuffled folds, and therefore the cross-validation scores, repeatable across runs.
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

In [None]:
from sklearn.utils import class_weight

In [None]:
# Count of label 0 divided by count of label 1 in the training target,
# truncated to an integer by int(). Stored under its own name so it does not
# rebind `class_weight`, which is imported as a module from sklearn.utils.
neg_pos_ratio=int(y_train.value_counts()[0]/y_train.value_counts()[1])
neg_pos_ratio

In [None]:
from sklearn.utils import class_weight
# 'balanced' weights for the training labels, one weight per entry of `classes`
# (np.unique returns the labels sorted). `classes` and `y` are passed by keyword
# because recent scikit-learn releases reject them as positional arguments.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)

# **9. Modeling**

## **9.1 Test Different algorithms and Pick the 5 best ones** 

In [None]:
random_state = 2

# {label: weight} mapping in the form scikit-learn's `class_weight` parameter expects;
# enumerate() pairs position 0/1 of `class_weights` with class label 0/1.
class_weight_dict = dict(enumerate(class_weights))

# XGBClassifier has no `class_weight` parameter; its class-imbalance control is
# `scale_pos_weight`, the weight of the positive class relative to the negative one.
positive_class_weight = class_weights[1] / class_weights[0]

# Candidate models, in the same order as the algorithm names used in the
# cross-validation summary table.
classifiers = [
    SVC(random_state=random_state, class_weight=class_weight_dict),
    DecisionTreeClassifier(random_state=random_state, class_weight=class_weight_dict),
    AdaBoostClassifier(
        DecisionTreeClassifier(random_state=random_state, class_weight=class_weight_dict),
        random_state=random_state,
        learning_rate=0.1,
    ),
    RandomForestClassifier(random_state=random_state, class_weight=class_weight_dict),
    ExtraTreesClassifier(random_state=random_state, class_weight=class_weight_dict),
    GradientBoostingClassifier(random_state=random_state),
    xgboost.XGBClassifier(random_state=random_state, scale_pos_weight=positive_class_weight),
    KNeighborsClassifier(),
    # LogisticRegression needs the dict form; a bare weight array is not a valid class_weight.
    LogisticRegression(random_state=random_state, class_weight=class_weight_dict),
]


In [None]:
# Per-fold ROC AUC scores for each candidate model, in `classifiers` order.
cv_results = [
    cross_val_score(candidate, X_train, y = y_train, scoring = "roc_auc", cv = kfold, n_jobs = -1)
    for candidate in classifiers
]

In [None]:
# Mean and standard deviation of the per-fold scores of each model.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# Summary table; the algorithm names follow the order of the `classifiers` list.
cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","Xgboost","KNeighboors","LogisticRegression"]})
print(cv_res)

# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
# The error bars show the standard deviation across folds.
g = sns.barplot(x="CrossValMeans", y="Algorithm", data = cv_res, palette="Set3", orient = "h", xerr=cv_std)
# The cross-validation metric is ROC AUC (scoring="roc_auc"), not accuracy.
g.set_xlabel("Mean ROC AUC")
g = g.set_title("Cross validation scores")

## **9.2 Evaluation Function**

In [None]:
def evaluation(model,x_test,y_test):
    """Print the ROC AUC of a fitted classifier on the test and the training data.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score passed to ``roc_auc_score``.
    x_test
        Features scored for the "Test AUC" line.
    y_test
        True labels matching ``x_test``.

    Notes
    -----
    The "Train AUC" line is computed from the notebook-level ``X_train`` and
    ``y_train``, not from the arguments, so those must already be defined.
    Nothing is returned; the two scores are only printed.
    """
    test_scores = model.predict_proba(x_test)[:,1]
    print("Test AUC:",roc_auc_score(y_test, test_scores))

    train_scores = model.predict_proba(X_train)[:,1]
    print('Train AUC:',roc_auc_score(y_train, train_scores))
    
    

## **9.3. Hyperparameter tuning**

### GradientBoosting

In [None]:
GBC = GradientBoostingClassifier()

# Search space for the randomized search. The loss is left at the estimator's
# default: the former explicit "deviance" value is the same loss under an older
# name that recent scikit-learn releases no longer accept.
gb_param_grid = {
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              }

# Randomized search scored by ROC AUC on the shared `kfold` splitter;
# random_state fixes which parameter combinations are sampled.
gsGBC=RandomizedSearchCV(estimator=GBC,param_distributions=gb_param_grid,random_state=3,scoring = "roc_auc",
                                     cv =kfold,n_jobs=1)

gsGBC.fit(X_train,y_train)

In [None]:
# Keep the estimator refitted with the best parameters found by the search.
GBC_best = gsGBC.best_estimator_

# Mean cross-validated ROC AUC of the best parameter setting
gsGBC.best_score_

In [None]:
# Print the test and train AUC of the tuned gradient-boosting model.
evaluation(GBC_best ,X_test,y_test)

### RandomForest

In [None]:
# Random forest using the per-class weights from `class_weights`.
RFC = RandomForestClassifier(class_weight=dict(enumerate(class_weights)))

# Exhaustive grid: 3 x 3 x 2 = 18 parameter combinations.
rf_param_grid = dict(
    max_depth=[None],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[False],
    n_estimators=[100, 300],
    criterion=["gini"],
)

# Grid search scored by ROC AUC on the shared `kfold` splitter.
gsRFC = GridSearchCV(
    estimator=RFC,
    param_grid=rf_param_grid,
    scoring="roc_auc",
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

gsRFC.fit(X_train,y_train)




In [None]:
# Keep the estimator refitted with the best parameters found by the search.
RFC_best = gsRFC.best_estimator_

# Mean cross-validated ROC AUC of the best parameter setting
gsRFC.best_score_

In [None]:
# Print the test and train AUC of the tuned random forest.
evaluation(RFC_best,X_test,y_test)

### SVM

In [None]:
# SVC with probability estimates enabled (predict_proba is used when the model
# is evaluated) and the per-class weights from `class_weights`.
SVMC = SVC(probability=True, class_weight=dict(enumerate(class_weights)))

# Exhaustive grid: 4 gamma values x 4 C values with the RBF kernel.
svc_param_grid = dict(
    kernel=['rbf'],
    gamma=[ 0.001, 0.01, 0.1, 1],
    C=[1, 10, 100,200],
)

# Grid search scored by ROC AUC on the shared `kfold` splitter.
gsSVMC = GridSearchCV(
    estimator=SVMC,
    param_grid=svc_param_grid,
    scoring="roc_auc",
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

gsSVMC.fit(X_train,y_train)

In [None]:
# Keep the estimator refitted with the best parameters found by the search.
SVMC_best = gsSVMC.best_estimator_

# Mean cross-validated ROC AUC of the best parameter setting
gsSVMC.best_score_

In [None]:
# Print the test and train AUC of the tuned SVM. The best estimator is passed
# (not the search object) so this call matches the other model evaluations.
evaluation(SVMC_best,X_test,y_test)

### XGBOOST

In [None]:
# Count of label 0 divided by count of label 1 in the training target,
# truncated to an integer by int().
# NOTE(review): this rebinds the name `class_weight`, which earlier cells import
# as a module from sklearn.utils; a distinct name would avoid the shadowing.
class_weight=int(y_train.value_counts()[0]/y_train.value_counts()[1])

In [None]:
# XGBoost classifier whose positive class is weighted by the integer ratio
# held in `class_weight`.
XGBoost = xgboost.XGBClassifier(scale_pos_weight=class_weight)

# Search space sampled by the randomized search.
xgboost_param_grid = dict(
    learning_rate=[0.05,0.1,0.15,0.2,0.25,0.3],
    max_depth=[1,2,3,4,5,6,7,8,9,10],
    min_child_weight=[1,3,5,7],
    colsample_bytree=[0.3,0.4,0.5,0.6,0.7],
    gamma=[0.0,0.1,0.2,0.3,0.4,0.5],
)

# Randomized search scored by ROC AUC on the shared `kfold` splitter;
# random_state fixes which parameter combinations are sampled.
gsXGoost = RandomizedSearchCV(
    estimator=XGBoost,
    param_distributions=xgboost_param_grid,
    random_state=3,
    scoring="roc_auc",
    cv=kfold,
    n_jobs=1,
)

gsXGoost.fit(X_train,y_train)

In [None]:
# Keep the estimator refitted with the best parameters found by the search.
XGBOOST_best = gsXGoost.best_estimator_

# Mean cross-validated ROC AUC of the best parameter setting
gsXGoost.best_score_

In [None]:
# Print the test and train AUC of the tuned XGBoost model.
evaluation(XGBOOST_best,X_test,y_test)

### LogisticRegression

In [None]:
# Logistic regression using the per-class weights from `class_weights`.
logreg = LogisticRegression(class_weight=dict(enumerate(class_weights)))

# Grid over the inverse regularization strength C with an L2 penalty.
logreg_param_grid = dict(
    C=[100, 10, 1.0, 0.1, 0.01],
    penalty=['l2'],
)

# Grid search scored by ROC AUC on the shared `kfold` splitter.
gslogreg = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_param_grid,
    scoring="roc_auc",
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gslogreg.fit(X_train,y_train)

In [None]:
# Keep the estimator refitted with the best parameters found by the search.
LogReg_best = gslogreg.best_estimator_

# Mean cross-validated ROC AUC of the best parameter setting
gslogreg.best_score_

In [None]:
# Print the test and train AUC of the tuned logistic regression.
evaluation(LogReg_best,X_test,y_test)

## **9.4. Trying the voting ensemble method and see the results**

In [None]:
# Members of the voting ensemble: the tuned gradient boosting, random forest,
# XGBoost and logistic regression models.
voting_members = [
    ('gbc', GBC_best),
    ('rfc', RFC_best),
    ('xgboost', XGBOOST_best),
    ('logreg', LogReg_best),
]

# Soft voting combines the members' predicted probabilities.
votingC = VotingClassifier(estimators=voting_members, voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, y_train)

In [None]:
# Print the test and train AUC of the soft-voting ensemble.
evaluation(votingC,X_test,y_test)

## **9.5. Trying the stacking ensemble method on different combinations**

In [None]:
# Stacking ensemble 1: GBC, random forest, XGBoost and logistic regression as
# base models, with the tuned XGBoost model as meta-classifier.
scv=StackingCVClassifier(classifiers=[GBC_best,RFC_best,XGBOOST_best,LogReg_best],meta_classifier= XGBOOST_best,random_state=42)

In [None]:
# Stacking ensemble 2: same base models as `scv`, with the tuned random forest
# as meta-classifier.
scv2=StackingCVClassifier(classifiers=[GBC_best,RFC_best,XGBOOST_best,LogReg_best],meta_classifier= RFC_best,random_state=42)

In [None]:
# Stacking ensemble 3: random forest, XGBoost, logistic regression and SVM as
# base models, with the tuned random forest as meta-classifier.
scv3=StackingCVClassifier(classifiers=[RFC_best,XGBOOST_best,LogReg_best,SVMC_best],meta_classifier= RFC_best,random_state=42)

In [None]:
# Fit stacking ensemble `scv` and print its test and train AUC.
scv.fit(X_train,y_train)
evaluation(scv,X_test,y_test)

In [None]:
# Fit stacking ensemble `scv3` and print its test and train AUC.
scv3.fit(X_train,y_train)
evaluation(scv3,X_test,y_test)

In [None]:
# Fit stacking ensemble `scv2` and print its test and train AUC.
scv2.fit(X_train,y_train)
evaluation(scv2,X_test,y_test)

In [None]:
# Stacking ensemble 4 (cv=2): random forest, SVM, XGBoost and logistic
# regression as base models, with the tuned gradient boosting model as meta-classifier.
scv4=StackingCVClassifier(cv=2,classifiers=[RFC_best,SVMC_best,XGBOOST_best,LogReg_best],meta_classifier= GBC_best,random_state=42)

In [None]:
# Fit stacking ensemble `scv4` and print its test and train AUC.
scv4.fit(X_train,y_train)
evaluation(scv4,X_test,y_test)

# **10. Final decision and save the best model** 

### We decided to pick the model produced by the voting method and save it

In [None]:
import pickle

# Persist the fitted voting ensemble to disk. The context manager guarantees
# the file handle is flushed and closed, even if pickling raises.
with open("best_model_Votingg", 'wb') as model_file:
    pickle.dump(votingC, model_file)