In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree once, flattening it into full file paths, and
# print each path on its own line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## <span style='color:Blue'> **Import the Required Libraries** </span>

In [None]:
#import numpy
import numpy as np

#import visualization seaborn 
import seaborn as sns

#import pandas 
import pandas as pd

#import visualization matplotlib
import matplotlib.pyplot as plt

#import random forest classifier
from sklearn.ensemble import RandomForestClassifier

#ignore FutureWarning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## <span style='color:Blue'> **Read the Dataset** </span>

In [None]:
# Load the Telco customer churn CSV from the Kaggle input directory into `df`,
# the frame every later cell in this notebook reads from.
df = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Column names, non-null counts and dtypes; use this to spot columns that were
# loaded as text even though they look numeric.
df.info()

## <span style='color:Blue'> **Impute Missing** </span>


In [None]:
# Per column: True if at least one value is NaN/None.
# Note: isnull() does not treat empty or whitespace-only strings as missing,
# so text columns with blank entries still report False here.
df.isnull().any()

### <span style='color:violet'> **1. Target Variable** </span>

We are trying to predict if the client left the company in the previous month. Therefore we have a binary classification problem with a slightly unbalanced target:

* Churn: No - 72.4%
* Churn: Yes - 27.6%


In [None]:
# Count plot of the target: one horizontal bar per Churn value.
# Note: sns.catplot is a figure-level function and returns a FacetGrid, so
# `ax` holds a grid object here rather than a matplotlib Axes.
ax = sns.catplot(y='Churn', kind='count', data=df, height=2.6, aspect=2.5)

### <span style='color:violet'> **1. Categorical Features** </span>

This dataset has 16 categorical features:

* Six binary features (Yes/No)
* Nine features with three unique values each (categories)
* One feature with four unique values


### <span style='color:violet'>1.1 Gender and Age(SeniorCitizen)</span>

In [None]:
def barplot_percentages(feature, orient='v', axis_name="percentage of customers", data=None):
    """Bar chart of the share of ALL customers in each (feature value, Churn) pair.

    Every bar is a group count divided by the total number of rows, so the
    bars of one chart sum to 100% of customers; they are not per-group churn
    rates.

    Parameters
    ----------
    feature : str
        Column to group customers by.
    orient : str, default 'v'
        'v' puts `feature` on the x-axis; any other value puts it on the
        y-axis and is forwarded to seaborn as the bar orientation.
    axis_name : str
        Label used for the percentage axis.
    data : pandas.DataFrame, optional
        Frame containing `feature` and a "Churn" column. Defaults to the
        notebook-level `df`.
    """
    # Local import keeps this cell self-contained; matplotlib is already a
    # dependency of the notebook.
    from matplotlib.ticker import PercentFormatter

    frame = df if data is None else data

    # Name the counts Series directly instead of renaming a "Churn" column
    # after to_frame(): the name that value_counts() gives its result differs
    # between pandas versions ("Churn" before 2.0, "count" from 2.0 on), and
    # the old rename silently did nothing on newer versions.
    shares = (
        frame.groupby(feature)["Churn"]
        .value_counts()
        .rename(axis_name)
        .reset_index()
    )
    shares[axis_name] = shares[axis_name] / len(frame)

    # A formatter (rather than fixed tick label strings) keeps the labels in
    # sync with the ticks if the axis limits change later.
    as_percent = PercentFormatter(xmax=1, decimals=0)
    if orient == 'v':
        ax = sns.barplot(x=feature, y=axis_name, hue='Churn', data=shares, orient=orient)
        ax.yaxis.set_major_formatter(as_percent)
    else:
        ax = sns.barplot(x=axis_name, y=feature, hue='Churn', data=shares, orient=orient)
        ax.xaxis.set_major_formatter(as_percent)


barplot_percentages("SeniorCitizen")


In [None]:
# One scatter panel per (SeniorCitizen, gender) combination: tenure on x,
# MonthlyCharges on y, points coloured by Churn.
g = sns.FacetGrid(df, row='SeniorCitizen', col='gender', hue='Churn', height=3.5)
g.map(plt.scatter, 'tenure', 'MonthlyCharges', alpha=0.6)
# Trailing semicolon suppresses the FacetGrid repr in the cell output.
g.add_legend();

* Gender is not indicative of churn.
* SeniorCitizens are only 16% of customers, but they have a much higher churn rate: 42% against 23% for non-senior customers.
* There are no special relations between these categorical values and the main numerical features.



### <span style='color:violet'>1.2 Partner and dependents</span>


In [None]:
def _churn_shares(column, value_name):
    """Return one row per (column value, Churn) pair with its share of all rows in `df`.

    The counts Series is named explicitly because the name value_counts()
    assigns to its result depends on the pandas version ("Churn" before 2.0,
    "count" from 2.0 on); renaming a "Churn" column afterwards is a silent
    no-op on newer versions and leaves the plot without its y column.
    """
    shares = df.groupby(column)['Churn'].value_counts() / len(df)
    return shares.rename(value_name).reset_index()


# Two side-by-side panels sharing one figure.
fig, axis = plt.subplots(1, 2, figsize=(12, 4))
axis[0].set_title('Has partner')
axis[1].set_title('Has dependents')
axis_y = 'Percentage of customers'

# plot Partner column
gp_partner = _churn_shares('Partner', axis_y)
ax = sns.barplot(x='Partner', y=axis_y, hue='Churn', data=gp_partner, ax=axis[0])

# plot Dependents column
gp_dependent = _churn_shares('Dependents', axis_y)
ax = sns.barplot(x='Dependents', y=axis_y, hue='Churn', data=gp_dependent, ax=axis[1])


* Customers who don't have partners are more likely to churn
* Customers without dependents are also more likely to churn


### <span style='color:violet'>1.3 Phone and Internet Service</span>


There are only two main services, phone and internet, but the latter has many add-ons such as online backup and online security.

#### Phone services

In [None]:
# Open a wider figure first; barplot_percentages does not create one itself.
plt.figure(figsize=(9, 4.5))
# Horizontal bars: share of customers per (MultipleLines value, Churn) pair.
barplot_percentages("MultipleLines", orient='h')

#### Internet Service


In [None]:
# Open a wider figure first; barplot_percentages does not create one itself.
plt.figure(figsize=(9, 4.5))
# Horizontal bars: share of customers per (InternetService value, Churn) pair.
barplot_percentages("InternetService", orient='h')

### <span style='color:violet'>1.4 Contract and Payment</span>


In [None]:
# Open a wider figure first; barplot_percentages does not create one itself.
plt.figure(figsize=(9, 4.5))
# Horizontal bars: share of customers per (PaymentMethod value, Churn) pair.
barplot_percentages("PaymentMethod", orient='h')



A few observations:

* Customers with paperless billing are more likely to churn

* The preferred payment method is Electronic check with around 35% of customers. This method also has a very high churn rate

* Short term contracts have higher churn rates


### <span style='color:violet'>1.6 Feature Importance</span>


I've used one-hot encoding for the categorical features and dropped the 'No' columns for binary features. I've also manually tested a few hyperparameters to get a better model.

The importances are the mean decrease in impurity for each feature across all trees.

In [None]:
# Random forest settings; random_state is fixed so the importances are
# reproducible between runs.
params = {'random_state': 0, 'n_jobs': 4, 'n_estimators': 5000, 'max_depth': 8}

# Keep non-features out of the encoder. pd.get_dummies expands EVERY text
# column, so a per-row identifier or a numeric column that was read as text
# would each turn into thousands of one-off dummy columns and swamp the
# importance chart.
# NOTE(review): assumes the CSV carries a per-row `customerID` and that
# `TotalCharges` may load as text with blank entries - confirm with df.info().
features = df.drop(columns=['customerID'], errors='ignore')
if 'TotalCharges' in features.columns and not pd.api.types.is_numeric_dtype(features['TotalCharges']):
    total_charges = pd.to_numeric(features['TotalCharges'], errors='coerce')
    # Unparseable totals are imputed as 0 (presumably customers who have not
    # been billed yet - verify against `tenure`); the forest cannot be fitted
    # on NaN in older scikit-learn releases.
    features = features.assign(TotalCharges=total_charges.fillna(0))

# One-hot encode. `df` is rebound to the encoded frame, as before; re-running
# this cell on the already-encoded frame is a no-op.
df = pd.get_dummies(features)

# Drop redundant columns (for features with two unique values) together with
# both target columns, so the target cannot leak into the features.
drop = ['Churn_Yes', 'Churn_No', 'gender_Female', 'Partner_No',
        'Dependents_No', 'PhoneService_No', 'PaperlessBilling_No']
x, y = df.drop(drop, axis=1), df['Churn_Yes']

# Fit RandomForest Classifier
clf = RandomForestClassifier(**params)
clf = clf.fit(x, y)

# Plot feature importances, largest first
imp = pd.Series(data=clf.feature_importances_, index=x.columns).sort_values(ascending=False)
plt.figure(figsize=(10,12))
plt.title("Feature importance")
ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')