In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

# **First, create the functions to map all the categorical variables**

In [None]:
def mapping(df):
    """Build an integer encoding for every text column of ``df``.

    Columns named 'customerID' and 'TotalCharges' are always skipped. Every
    other column whose dtype is object or a pandas string dtype gets one
    mapper.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text columns should be encoded.

    Returns
    -------
    list of dict
        One single-key dict per encoded column, shaped as
        ``{column_name: {value: code}}``. Codes start at 0 and follow the
        order in which the values first appear in the column.
    """
    skipped = ('customerID', 'TotalCharges')

    mappers = []
    for col in df.columns:
        if col in skipped:
            continue

        column = df[col]
        # Accept both the classic object dtype and dedicated string dtypes,
        # so text columns are not silently ignored when pandas stores them
        # with a string extension dtype.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue

        codes = {value: code for code, value in enumerate(column.unique())}
        mappers.append({col: codes})

    return mappers

In [None]:
def replace_categorical(df, mappers):
    """Return a copy of ``df`` with categorical values replaced by their codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    mappers : list of dict
        Single-key dicts shaped as ``{column_name: {value: code}}``, as
        produced by ``mapping``. Only the first key of each dict is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every mapped column holds the codes. Values
        that do not appear in a column's mapper are left unchanged.
    """
    df_mapped = df.copy()
    for mapper in mappers:
        col, codes = list(mapper.items())[0]
        # Replacing strings with integers leaves an object-dtype column on
        # pandas versions that no longer downcast automatically, so convert
        # explicitly to keep the encoded columns numeric for corr() and the
        # models.
        df_mapped[col] = df_mapped[col].replace(codes).infer_objects()

    return df_mapped

In [None]:
def plot_var_percentages(df, var_list):
    """Draw one pie chart per variable showing the share of each of its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to plot.
    var_list : iterable of str
        Names of the columns of ``df`` to plot; each gets its own figure,
        titled with the column name.

    Returns
    -------
    None
        The figures are rendered with ``plt.show()``.
    """
    for var in var_list:
        # Count once and reuse the result for both labels and slice sizes.
        value_counts = df[var].value_counts()

        # A fresh figure per variable avoids drawing over an existing
        # figure that happens to share the same figure number.
        fig, ax = plt.subplots()
        ax.pie(value_counts.tolist(), labels=list(value_counts.index),
               autopct='%1.1f%%', shadow=True, startangle=90)
        ax.set_title(var)

    # The original referenced `plt.show` without calling it.
    plt.show()

In [None]:
# Load the Telco customer churn CSV from the Kaggle input directory into a DataFrame.
telecom_cust = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')


In [None]:
# Show the dtype pandas inferred for each column of the raw data.
telecom_cust.dtypes

# **Use of mapping functions and fix of TotalCharges column**

In [None]:
# Encode the categorical columns, then turn TotalCharges into numbers.
# errors='coerce' converts entries that cannot be parsed into NaN, and
# dropna() then discards every row that contains a missing value.
mappers = mapping(telecom_cust)
df = (
    replace_categorical(telecom_cust, mappers)
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
)
df.dtypes

# **Data exploration**

In [None]:
# Plot every column except the first one and the last three,
# leaving out 'tenure' as well.
var_list = telecom_cust.columns[1:-3].drop('tenure')
plot_var_percentages(telecom_cust, var_list)


# **Let's check the heatmap of correlations. Categorical variables will be included, although some of them won't be of much use in the study.**

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# (e.g. customerID), and DataFrame.corr() raises on those with pandas >= 2.0
# instead of silently dropping them as older versions did.
corrMatrix = df.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(30,25))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrMatrix, annot=True, annot_kws={'size':12}, cmap="GnBu", ax=ax)
plt.show()

# **Personal attributes of churn customers**

In [None]:
# One count plot per personal attribute, with bars split by Churn.
attrib_variables = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
]

for var in attrib_variables:
    ax1 = sns.catplot(data=telecom_cust, x=var, hue="Churn", kind="count")

# **Service attributes of churn customers**

In [None]:
# One count plot per contracted service, with bars split by Churn.
service_variables = [
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

for var in service_variables:
    ax1 = sns.catplot(data=telecom_cust, x=var, hue="Churn", kind="count")

# **Prediction**

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

In [None]:
def preprocessing(df, corr_limit, vars_to_scale, target_corr=None):
    """Select features by correlation, split into train/test and scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded data set containing every column listed in ``target_corr``.
    corr_limit : float
        Minimum correlation value a column needs in ``target_corr`` to be kept.
    vars_to_scale : iterable of str
        Feature columns to rescale with ``MinMaxScaler``; names that were not
        selected are ignored.
    target_corr : pandas.Series, optional
        Correlation of each column with the target, indexed by column name.
        The target itself must be the last selected entry, because the last
        selected column is used as ``y``. When omitted, the notebook-level
        ``churn_corr`` is used, so existing three-argument calls keep working.

    Returns
    -------
    X_train, X_test, y_train, y_test
        80/20 split produced with ``random_state=42``.
    """
    if target_corr is None:
        # Backward-compatible fallback on the global defined in a later cell;
        # pass `target_corr` explicitly to avoid depending on execution order.
        target_corr = churn_corr

    variables = list(target_corr.loc[target_corr >= corr_limit].index)
    df_set = df.loc[:, variables]

    X_train, X_test, y_train, y_test = train_test_split(
        df_set.iloc[:, :-1], df_set.iloc[:, -1], test_size=0.2, random_state=42)

    # Work on explicit copies so the scaled values are never written into a
    # view of the caller's frame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    for var in vars_to_scale:
        if var in X_train.columns:
            # Fit on the training rows only and reuse those bounds for the
            # test rows, so no test-set information leaks into the features.
            scaler = MinMaxScaler()
            X_train[var] = scaler.fit_transform(X_train[[var]]).ravel()
            X_test[var] = scaler.transform(X_test[[var]]).ravel()

    return X_train, X_test, y_train, y_test

# Notebook-level correlation threshold.
corr_limit = 0.15

In [None]:
# First, let's check the class balance of the target: pie chart of the share of each Churn value.

plot_var_percentages(df,['Churn'])

As seen, the dataset is highly imbalanced. We must take that into account when we look at the results, either by giving more weight to one class or by balancing the data.

In [None]:
# Variable selection for training.
# As seen above, all services are strongly correlated with each other,
# so we keep only the strongest one of them and forget about the others.

# Correlate numeric columns only: DataFrame.corr() raises on the remaining
# string columns (e.g. customerID) with pandas >= 2.0.
corrMatrix = df.select_dtypes(include=['number', 'bool']).corr()

tech_variables = ['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
vars_to_scale  = ['tenure', 'MonthlyCharges', 'TotalCharges']
highest_corr_service = 'TechSupport'
# After this line `tech_variables` holds only the services to discard.
tech_variables.remove(highest_corr_service)
# Absolute correlation of every remaining column with Churn.
churn_corr = corrMatrix['Churn'].abs().drop(tech_variables)

# SET 1.
# Selection of variables with a correlation of at least 20%

# SET 1: correl >= 0.2
# RANDOM FOREST:
#    SCORES:
#       Default Params: 74.4%
#       Opt Params.     77.5%
# XGBoost:
#    SCORES:
#       Default Params: 75.4%
#       Opt Params.     77.8%

# SET 2: correl >= 0.15
# RANDOM FOREST:
#    SCORES:
#       Opt Params.     79.3%
# XGBoost:
#    SCORES:
#       Opt Params.     78.7%

# SET 3: correl >= 0.10
# RANDOM FOREST:
#    SCORES:
#       Opt Params.     78.8%
# XGBoost:
#    SCORES:
#       Opt Params.     78.7%

# Build the train/test split from the columns whose correlation with Churn is >= 0.15.
X_train, X_test, y_train, y_test = preprocessing(df, 0.15, vars_to_scale)

# Random Forest in which class 1 weighs twice as much as class 0.
# random_state is fixed so the printed scores are reproducible on re-run.
RF = RandomForestClassifier(n_estimators=100,
                            criterion='entropy',
                            max_depth=5,
                            min_samples_leaf=2,
                            min_samples_split=2,
                            class_weight={0:1,1:2},
                            random_state=42)
RF.fit(X_train, y_train)
print('Random Forest\n')
print('RF Score: \n',RF.score(X_test, y_test))
print('F1 Score: \n',f1_score(y_test,RF.predict(X_test)))

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions switch to ConfusionMatrixDisplay.from_estimator and update the import cell.
plot_confusion_matrix(RF, X_test, y_test)

# Small gradient-boosted model evaluated on the same split.
XGB = XGBClassifier(
    n_jobs=-1,
    max_depth=2,
    n_estimators=20,
    objective='binary:logistic',
)
XGB.fit(X_train, y_train)

# Predict once and reuse the labels for both metrics.
xgb_pred = XGB.predict(X_test)
print('XGBoost\n')
print('XGB Score: \n', accuracy_score(y_test, xgb_pred))
print('F1 Score: \n', f1_score(y_test, xgb_pred))
plot_confusion_matrix(XGB, X_test, y_test)


#parameters = {'n_estimators':[10,15,20]}
#GridSearch = GridSearchCV(XGB, parameters)
#GridSearch.fit(X_train, y_train)
#print(GridSearch.score(X_test, y_test))
#print(GridSearch.best_params_)


As **Random Forest** works better here, let's try changing the weights a bit further in favor of class 1, raising its weight from 2 to 4.

In [None]:
# Rebuild the same split so this cell can be run on its own.
X_train, X_test, y_train, y_test = preprocessing(df, 0.15, vars_to_scale)

# Same Random Forest as before, but class 1 now weighs four times as much as class 0.
# random_state is fixed so the comparison with the previous run is reproducible.
RF = RandomForestClassifier(n_estimators=100,
                            criterion='entropy',
                            max_depth=5,
                            min_samples_leaf=2,
                            min_samples_split=2,
                            class_weight={0:1,1:4},
                            random_state=42)
RF.fit(X_train, y_train)
print('Random Forest\n')
print('RF Score: \n',RF.score(X_test, y_test))
print('F1 Score: \n',f1_score(y_test,RF.predict(X_test)))

# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions switch to ConfusionMatrixDisplay.from_estimator and update the import cell.
plot_confusion_matrix(RF, X_test, y_test)

As we can see, if we change the weights, we can get better results on the false negatives.
At this point, the choice depends on the company's demands: whether they want to avoid false negatives at the cost of spending more resources on false positives, or whether they want a more balanced result at the cost of losing more clients.