### Hi, welcome to my project! Today we will build random forest and extra trees classifiers to predict customer churn.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

%matplotlib inline

Let's read our CSV file. Note that the file we will use today was generated in a previous project in which we worked with KNN.

In [None]:
# Read the CSV at this relative path into a DataFrame.
data = pd.read_csv('../input/churndata/churndata_processed.csv')
# Bare last expression: the notebook renders the frame as the cell output.
data

In [None]:
# Show the column labels of the loaded frame.
data.columns

Let's see how many unique values we have in each column of our dataframe:

In [None]:
# Print one "<column> <number of distinct values>" line per column.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    print(column_name, len(distinct_values))

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

In [None]:
# Pairwise correlation between the columns of the frame.
# NOTE(review): recent pandas versions raise here if any column is non-numeric —
# confirm every column in this file is numeric.
data.corr()

In [None]:
# Draw the correlation heatmap on the axes created here, rather than relying on
# matplotlib's implicit "current axes", and title it so the figure stands alone.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data.corr(), ax=ax)
# The trailing semicolon suppresses the Text repr in the cell output.
ax.set_title('Correlation between columns');

Let's look at the proportion of the classes in our label:

In [None]:
# Name of the label column; later cells use it to separate features from the label.
target='churn_value'
# Number of rows per class of the label.
data[target].value_counts()

In [None]:
# Same counts expressed as fractions of the total (normalize=True).
data[target].value_counts(normalize=True)

# Splitting our dataset:

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Every column except the label is used as a model input.
feature_cols = [x for x in data.columns if x != target]

# One stratified shuffle split that holds out 1500 rows for testing.
# The splitter's split() method is a generator of (train, test) index arrays.
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1500, random_state=42)

# Take the single split produced by the generator.
train_idx, test_idx = next(strat_shuff_split.split(data[feature_cols], data[target]))

# split() returns *positional* row indices, so the rows are selected with
# .iloc. Label-based .loc only gives the same rows while the frame has a
# default 0..n-1 index, and would silently pick wrong rows (or raise) otherwise.
train_rows = data.iloc[train_idx]
test_rows = data.iloc[test_idx]

X_train = train_rows[feature_cols]
y_train = train_rows[target]

X_test = test_rows[feature_cols]
y_test = test_rows[target]

In [None]:
# Class proportions of the training labels.
y_train.value_counts(normalize=True)

In [None]:
# Class proportions of the test labels, for comparison with the training ones.
y_test.value_counts(normalize=True)

# Random Forest:

### Let's fit random forest models with a range of tree numbers, then evaluate the out-of-bag error for each of these.

In [None]:
# Suppress warnings about too few trees from the early models
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

Note: Since the only thing changing in our model is the number of trees, the **warm_start** flag can be used so that the model just adds more trees to the existing model each time. We therefore use the **set_params** method to update the number of trees before each fit.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# warm_start=True lets each fit() below add trees to the existing forest
# instead of rebuilding it; oob_score=True makes the fitted model expose
# oob_score_.
RF = RandomForestClassifier(oob_score=True,
                            random_state=42,
                            warm_start=True,
                            n_jobs=-1)

# One plain record per forest size. Building the frame once from records,
# rather than packing the int tree count and the float error into a single
# Series per step, keeps n_trees an integer instead of upcasting it to float.
rf_oob_records = []

# Sizes must be increasing: with warm_start the forest can only grow.
for n_trees in [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]:

    # Grow the forest to the requested size and refit
    RF.set_params(n_estimators=n_trees)
    RF.fit(X_train, y_train)

    # Out-of-bag error is the complement of the out-of-bag score
    rf_oob_records.append({'n_trees': n_trees, 'oob': 1 - RF.oob_score_})

rf_oob_df = pd.DataFrame(rf_oob_records).set_index('n_trees')

rf_oob_df

Now let's plot the resulting oob errors as a function of the number of trees.

In [None]:
# Presentation-sized fonts on a plain white background.
sns.set_context('talk')
sns.set_style('white')

# Single line (no legend) of OOB error against the number of trees.
rf_line_style = dict(legend=False, marker='o', figsize=(14, 7), linewidth=5)
ax = rf_oob_df.plot(**rf_line_style)
ax.set(ylabel='out-of-bag error');

#  ExtraTreesClassifier
### After building this model, we are going to compare out-of-bag errors for the two different types of models.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# bootstrap=True is passed explicitly: out-of-bag scoring needs bootstrap
# samples. warm_start=True lets each fit() below add trees to the existing
# ensemble instead of rebuilding it.
EF = ExtraTreesClassifier(oob_score=True,
                          random_state=42,
                          warm_start=True,
                          bootstrap=True,
                          n_jobs=-1)

# One plain record per ensemble size. Building the frame once from records,
# rather than packing the int tree count and the float error into a single
# Series per step, keeps n_trees an integer instead of upcasting it to float.
et_oob_records = []

# Sizes must be increasing: with warm_start the ensemble can only grow.
for n_trees in [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]:

    # Grow the ensemble to the requested size and refit
    EF.set_params(n_estimators=n_trees)
    EF.fit(X_train, y_train)

    # Out-of-bag error is the complement of the out-of-bag score
    et_oob_records.append({'n_trees': n_trees, 'oob': 1 - EF.oob_score_})

et_oob_df = pd.DataFrame(et_oob_records).set_index('n_trees')

et_oob_df

It is more convenient to create a dataframe with both OOB errors as columns, which makes it easier to plot both lines.

In [None]:
# Put both OOB curves side by side, one column per model, sharing the n_trees index.
oob_by_model = {'RandomForest': rf_oob_df, 'ExtraTrees': et_oob_df}
labelled_curves = [curve.rename(columns={'oob': model_name})
                   for model_name, curve in oob_by_model.items()]
oob_df = pd.concat(labelled_curves, axis=1)

oob_df

In [None]:
# Presentation-sized fonts on a plain white background.
sns.set_context('talk')
sns.set_style('white')

# One line per model (legend kept) of OOB error against the number of trees.
comparison_line_style = dict(marker='o', figsize=(14, 7), linewidth=5)
ax = oob_df.plot(**comparison_line_style)
ax.set(ylabel='out-of-bag error');

As we can see in the figure above, the **RandomForest error is lower**, so it is the better model for our case study. We could select 200 trees, the setting that gave us the lowest OOB error, and compute its corresponding error metrics.

### Now let's select the RandomForest model for 200 trees and calculate error metrics and confusion matrix on the test data set:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Standalone 200-tree forest (no warm_start passed here), with OOB scoring enabled.
rf200_params = dict(n_estimators=200, oob_score=True, random_state=42, n_jobs=-1)
RF_200 = RandomForestClassifier(**rf200_params)

RF_200.fit(X_train, y_train)

# Out-of-bag error is the complement of the out-of-bag score.
oob_error200 = 1 - RF_200.oob_score_
oob_error200

In [None]:
# Class predictions of the 200-tree forest on the held-out test features.
y_pred=RF_200.predict(X_test)

# Computing error metrics for n=200:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision / recall / f1 text report
cr = classification_report(y_test, y_pred)
print(cr)

# AUC is a ranking metric, so it must be computed from the continuous
# positive-class score. Feeding it the hard class predictions evaluates a
# single threshold only and disagrees with the ROC curve of the same model.
y_score = RF_200.predict_proba(X_test)[:, 1]

# The threshold-based metrics keep using the hard predictions.
score_rows = [['accuracy', accuracy_score(y_test, y_pred)],
              ['precision', precision_score(y_test, y_pred)],
              ['recall', recall_score(y_test, y_pred)],
              ['f1', f1_score(y_test, y_pred)],
              ['auc', roc_auc_score(y_test, y_score)]]

score_df = pd.DataFrame(score_rows, columns=['Error metric', 'Measurement']).set_index('Error metric')
score_df

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Confusion matrix of the test labels against the forest's predictions.
cm_RF200=confusion_matrix(y_test,y_pred)
cm_RF200

In [None]:
# Render the matrix with the fitted model's class labels, using a blue colour map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_RF200, display_labels=RF_200.classes_)
disp.plot(cmap='Blues')

# The ROC-AUC and precision-recall curves.

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
sns.set_context('talk')

# Two side-by-side panels: ROC on the left, precision-recall on the right.
fig, axList = plt.subplots(ncols=2)
fig.set_size_inches(16, 8)
roc_ax, pr_ax = axList

# Predicted probabilities per class; column 1 is the score fed to both curves.
y_prob = RF_200.predict_proba(X_test)
positive_scores = y_prob[:, 1]

# Both panels use the same slightly padded unit-square limits.
unit_limits = dict(xlim=[-.01, 1.01], ylim=[-.01, 1.01])

# --- ROC curve ---
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_ax.plot(fpr, tpr, linewidth=5)
# Thin dashed diagonal marks completely random prediction; the further the
# curve bows towards the upper-left corner, the stronger the model.
roc_ax.plot([0, 1], [0, 1], ls='--', color='black', lw=.3)
roc_ax.set(xlabel='False Positive Rate',
           ylabel='True Positive Rate',
           title='ROC curve',
           **unit_limits)
roc_ax.grid(True)

# --- Precision-recall curve ---
precision, recall, _ = precision_recall_curve(y_test, positive_scores)
pr_ax.plot(recall, precision, linewidth=5)
pr_ax.set(xlabel='Recall', ylabel='Precision',
          title='Precision-Recall curve',
          **unit_limits)
pr_ax.grid(True)

plt.tight_layout()

Finally, let's plot the feature importances. Satisfaction is the biggest predictor of customer churn.

In [None]:
# One-column frame of the forest's importances, labelled by feature name
# and ordered from most to least important.
feat = (
    pd.DataFrame({'Importance': RF_200.feature_importances_}, index=feature_cols)
    .sort_values(by='Importance', ascending=False)
)

# Bar chart, one bar per feature.
ax = feat.plot(kind='bar', figsize=(16, 6))
ax.set(ylabel='Feature Importance')
ax.set(xlabel='Features')