## The importance of customer churn prediction

Predicting the future customer attrition rate is essential, as attrition can have a significant effect on a business or organization and directly influences expected future revenue. In addition, customer churn prediction helps us identify shortcomings in customer service. With machine learning algorithms and data analysis, we can identify and improve the factors responsible for customer churn.

**Packages**

In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

**Set Classifier Model**

The first step is to create the classifier model.
Because we are going to use several different algorithms, we create a reusable model class that works with any of them.

In [None]:
class ClassifierModel:
    """Train and cross-validate a single classifier on the churn DataFrame.

    Constructing an instance min-max scales the features, makes a 90/10
    train/test split and fits the classifier immediately. Call
    ``evaluate_model()`` afterwards to get k-fold cross-validation metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Churn" target column and a "customerID" column.
        Every other column is used as a feature and is passed to
        ``MinMaxScaler``, so it is expected to be numeric already.
    classifier : type
        Estimator class (not an instance); it is instantiated as
        ``classifier(**parameters)``.
    num_fold : int
        Number of folds used by ``evaluate_model()``.
    parameters : dict, optional
        Keyword arguments for the classifier constructor. ``None`` means
        "use the classifier's defaults".
    """

    def __init__(self, df, classifier, num_fold, parameters=None):
        self.df = df
        self.classifier = classifier
        self.num_fold = num_fold
        # Normalise None to an empty dict so it can always be **-unpacked.
        self.parameters = {} if parameters is None else parameters

        self.X_train, self.X_test, self.Y_train, self.Y_test = self.load_data()
        self.model = self.train_model()

    def load_data(self):
        """Build the scaled feature matrix and return the train/test split.

        Sets ``self.x`` (raw feature DataFrame), ``self.Y`` (target column)
        and ``self.X`` (scaled feature array) as side effects.

        Returns
        -------
        tuple
            ``(X_train, X_test, Y_train, Y_test)`` with 10% of the rows held
            out for testing (``random_state=42``).
        """
        self.x = self.df.drop(["Churn", "customerID"], axis=1)
        self.Y = self.df["Churn"]

        # Normalize data.
        # NOTE(review): the scaler is fitted on the full dataset before the
        # split, so test rows influence the scaling. Consider fitting on the
        # training rows only (e.g. via a Pipeline) if that matters.
        scaler = MinMaxScaler()
        scaler.fit(self.x)
        self.X = scaler.transform(self.x)

        # Separate train and test data
        X_train, X_test, Y_train, Y_test = train_test_split(
            self.X, self.Y, test_size=0.1, random_state=42
        )

        return X_train, X_test, Y_train, Y_test

    def train_model(self):
        """Instantiate and fit the classifier on the training split.

        Prints the per-feature importances (when the estimator exposes
        ``feature_importances_``) and the fitting time, and stores the
        predictions for the held-out split in ``self.predictions``.

        Returns
        -------
        object
            The fitted estimator.
        """
        # `or {}` keeps this method usable even when `parameters` is None.
        model = self.classifier(**(self.parameters or {}))

        # Time only the fit itself, not the reporting below.
        start = time.perf_counter()
        model.fit(self.X_train, self.Y_train)
        elapsed = time.perf_counter() - start

        # Not every estimator exposes feature importances; skip the report
        # instead of failing for those that do not.
        feature_importance = getattr(model, "feature_importances_", None)
        if feature_importance is not None:
            # Label the importances with the pre-scaling column names.
            model_importances = pd.Series(feature_importance,
                                          index=self.x.columns.values)
            print(model_importances)

        print("Elapsed time to train model = {} seconds".format(elapsed))

        # Keep the hold-out predictions available instead of discarding them.
        self.predictions = model.predict(self.X_test)

        return model

    def evaluate_model(self):
        """Cross-validate the model on the full scaled dataset.

        Uses a shuffled ``KFold`` with ``self.num_fold`` splits and
        ``random_state=42``. All four metrics are computed in a single
        cross-validation run, so the estimator is fitted once per fold.

        Returns
        -------
        pandas.DataFrame
            One row with the mean "Accuracy", "Precision", "Recall" and
            "F1 Score" over the folds. "F1 Score" uses the ``f1_macro``
            scorer; the others use ``accuracy``, ``precision`` and
            ``recall``.
        """
        cv = KFold(n_splits=self.num_fold, random_state=42, shuffle=True)
        scores = cross_validate(
            self.model,
            self.X,
            self.Y,
            cv=cv,
            scoring=["accuracy", "precision", "recall", "f1_macro"],
            n_jobs=-1,
        )
        accuracy = np.mean(scores["test_accuracy"])
        precision = np.mean(scores["test_precision"])
        recall = np.mean(scores["test_recall"])
        f1 = np.mean(scores["test_f1_macro"])
        print("Accuracy in average = {}".format(accuracy))

        # Display all metrics in a dataframe (one scalar per metric)
        metrics_df = pd.DataFrame(
            [[accuracy, precision, recall, f1]],
            columns=["Accuracy", "Precision", "Recall", "F1 Score"],
        )
        return metrics_df

**Set the parameters**

In this stage, we configure the training parameters for each ensemble model.

In [None]:
# Location of the Telco customer churn CSV file.
dataset_dir = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Columns that are label-encoded before training (including the "Churn" target).
# NOTE(review): "TotalCharges" looks like a numeric amount; label-encoding it
# discards its magnitude. Presumably it is listed here because the raw column
# is not cleanly numeric - confirm before changing.
category_columns = ["Contract", "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
                    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TotalCharges",
                    "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "PaymentMethod", "Churn"]

# Number of cross-validation folds.
NUM_FOLD = 5

# Constructor keyword arguments for each classifier. Each key appears once:
# a repeated key in a dict literal is silently overridden by its last value.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.001,
    "min_child_weight": 1,
    "base_score": 0.5,
    "gamma": 0,
    # "verbosity": 0 replaces the deprecated "silent": 1 option.
    "verbosity": 0,
}
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.008,
}
ada_params = {
    "n_estimators": 400,
    "learning_rate": 0.01,
}
rf_params = {
    "n_estimators": 300,
    "min_samples_leaf": 3,
    "max_features": "sqrt",
}
dt_params = {
    "min_samples_leaf": 2,
}
et_params = {
    "n_estimators": 300,
    # The original literal listed this key twice (5, then 2); 2 is the value
    # that was actually in effect, so it is the one kept.
    "min_samples_leaf": 2,
    "n_jobs": -1,
}

**Training Initializations**

First, we load the CSV file and encode the categorical columns.
Second, using the model class, we set up each model separately.

In [None]:
data = pd.read_csv(dataset_dir)
# Encode categorical columns as integer codes. DataFrame.apply calls
# fit_transform once per column, so the encoder is re-fitted for each column.
labelencoder = LabelEncoder()
data[category_columns] = data[category_columns].apply(labelencoder.fit_transform)

# ClassifierModel fits its classifier inside __init__, so each model below is
# trained exactly once on construction; calling train_model() again would only
# repeat the same training.

# XGBoost Classifier
xgb_clf = XGBClassifier
xgb_model = ClassifierModel(df=data, classifier=xgb_clf, num_fold=NUM_FOLD, parameters=xgb_params)
# Evaluation model
xgb_metrics = xgb_model.evaluate_model()
xgb_metrics.index = ["XGBoost"]

# GBoost Classifier
gb_clf = GradientBoostingClassifier
gb_model = ClassifierModel(df=data, classifier=gb_clf, num_fold=NUM_FOLD, parameters=gb_params)
# Evaluation model
gb_metrics = gb_model.evaluate_model()
gb_metrics.index = ["GBoost"]

# AdaBoost Classifier
ada_clf = AdaBoostClassifier
ada_model = ClassifierModel(df=data, classifier=ada_clf, num_fold=NUM_FOLD, parameters=ada_params)
# Evaluation model
ada_metrics = ada_model.evaluate_model()
ada_metrics.index = ["AdaBoost"]

# Random Forest Classifier
rf_clf = RandomForestClassifier
rf_model = ClassifierModel(df=data, classifier=rf_clf, num_fold=NUM_FOLD, parameters=rf_params)
# Evaluation model
rf_metrics = rf_model.evaluate_model()
rf_metrics.index = ["Random Forest"]

# Decision Tree Classifier
dt_clf = DecisionTreeClassifier
dt_model = ClassifierModel(df=data, classifier=dt_clf, num_fold=NUM_FOLD, parameters=dt_params)
# Evaluation model
dt_metrics = dt_model.evaluate_model()
dt_metrics.index = ["Decision Tree"]

# ExtraTreesClassifier
et_clf = ExtraTreesClassifier
et_model = ClassifierModel(df=data, classifier=et_clf, num_fold=NUM_FOLD, parameters=et_params)
# Evaluation model
et_metrics = et_model.evaluate_model()
et_metrics.index = ["ExtraTrees"]

# Stack the one-row metric frames into a single comparison table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat(
    [xgb_metrics, gb_metrics, ada_metrics, rf_metrics, dt_metrics, et_metrics]
)
# Print the whole table: head() shows only the first five rows and would
# leave out the sixth model.
print(results)