In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.





**Data from IBM Sample Datasets**

Each row represents a customer, each column contains customer's attributes described on the column metadata

# **What Contributes to Churn?**

Hypothesis:
- Customers will naturally churn due to low quality service
- Customers will also churn if they no longer need the service or can no longer afford it

Client Related Hypotheses:
- Clients younger than 30 years old are more likely to churn
- Clients with customer complaints are more likely to churn
- Clients with DSL internet are more likely to churn
- Clients with multiple subscriptions are less likely to churn
- Clients with large total payments are less likely to churn
- Clients without internet service are less likely to churn
- Clients with dependents are less likely to churn
- Clients with longer tenure are less likely to churn


Plan Related Hypotheses:
- Clients with monthly plans are most likely to churn
- Clients whose plans are paid via automatic means are less likely to churn
- Clients that use paperless billing are less likely to churn


# **1. Data Exploration**
- Univariate Analysis
We explore the dataset and identify missing values. We also inspect each variable and analyze its distribution where applicable.

In [None]:
# Walk the input directory tree and report what each folder contains,
# so the CSV path used by the next cell can be confirmed.
from os import walk

for current_dir, subfolders, files in walk("../input"):
    print("Directory path: ", current_dir)
    print("Folder name: ", subfolders)
    print("File name: ", files)


In [None]:
# Load the Telco customer-churn CSV into a DataFrame.
ChurnData = pd.read_csv(
    r'../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

# Preview the first five rows.
ChurnData.head(n=5)

In [None]:
# Basic EDA: print each summary under its own heading.
eda_summary = [
    ("Data Shape: ", ChurnData.shape),
    ("\n Describe: \n", ChurnData.describe()),
    ("\n Features: \n", ChurnData.columns),
    # Total count of null cells across the whole frame.
    ("\n Missing Values: ", ChurnData.isnull().sum().values.sum()),
    ("\n Data Types: \n", ChurnData.dtypes),
    ("\n Unique: \n", ChurnData.nunique()),
]

for heading, summary in eda_summary:
    print(heading, summary)





Next we try to use Pandas Profiling for EDA

In [None]:
# Build an automated EDA report for the whole DataFrame.
# The import is needed even though the module name is never referenced below:
# presumably importing pandas_profiling is what makes `profile_report`
# available on DataFrames -- TODO confirm.
# NOTE(review): pandas_profiling is reportedly superseded by the
# ydata-profiling package; confirm which one the target environment provides.
import pandas_profiling

ChurnData.profile_report()

## Notice Interesting insights from EDA:

* Total charges looks like a continuous number, but it has only 6531 distinct values compared to a dataset size of 7043. Let's check whether there are any 'missing' values in this column

* Some Variables have categories: {Yes, No, No Internet Service}. Internet Service might be a confounding variable. Do we need to recode 'No Internet Service' as 'No' ?


# **2. Data Cleaning / Transformation**

In [None]:
# Check the Total Charges column for missing entries.
# Only the last expression of a cell is displayed, so both checks are
# summarised with explicit prints instead of relying on two bare expressions.
null_total_charges = ChurnData[ChurnData['TotalCharges'].isnull()]
blank_total_charges = ChurnData[ChurnData['TotalCharges'] == " "]

print("Rows with null TotalCharges: ", len(null_total_charges))
print("Rows with blank TotalCharges: ", len(blank_total_charges))

# Show the offending rows themselves.
blank_total_charges

We found that some entries in this field are a single whitespace character (" ") instead of a number.
They account for 11 / 7043 ≈ 0.16% of the dataset.
We need to treat them; since they are such a small fraction, we can simply exclude them.

In [None]:
# Exclude the rows whose Total Charges entry is the blank string " ".
# .copy() makes the filtered result an independent DataFrame, so later column
# assignments on ChurnData do not raise SettingWithCopyWarning.
ChurnData = ChurnData[ChurnData['TotalCharges'] != " "].copy()
ChurnData.shape

Notice that the dataset size has been reduced. Next we transform the variables into the correct dtypes. Notice that the TotalCharges column has dtype object even though it holds decimal numbers:

In [None]:
# Run the Pandas profiling again:
#ChurnData.profile_report()

Total Charges is still categorical. Let's transform it into a numeric data type:

In [None]:
# Convert the TotalCharges strings to floating-point numbers.
ChurnData['TotalCharges'] = ChurnData['TotalCharges'].astype('float64')

In [None]:
# Check that TotalCharges is now a float by listing the dtype of every column:
ChurnData.dtypes

In [None]:
# Columns whose "No internet service" / "No phone service" labels
# will be recoded to a plain "No".
cols_to_trans = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
# Inspect the distinct labels of two representative columns before recoding.
for preview_col in ('OnlineSecurity', 'MultipleLines'):
    print(ChurnData[preview_col].unique())


In [None]:
# Recode the "no service" placeholder labels of a column to a plain 'No'.
def recode_No(input_series):
    """Collapse 'No internet service' / 'No phone service' into 'No'.

    Despite its name, ``input_series`` is normally a single cell value: the
    function is handed to ``Series.apply``, which calls it once per element.
    A whole pandas Series is accepted as well and is recoded element by
    element, so ``recode_No(df[col])`` and ``df[col].apply(recode_No)``
    give the same values.

    Parameters
    ----------
    input_series : scalar or pandas.Series
        One label, or a Series of labels.

    Returns
    -------
    scalar or pandas.Series
        'No' for the two placeholder labels, every other value unchanged.
        A Series input yields a new Series; the input is not modified.
    """
    if isinstance(input_series, pd.Series):
        # A Series cannot be tested with `in` (its truth value is ambiguous),
        # so apply the scalar rule to each element instead.
        return input_series.map(recode_No)
    if input_series in ('No internet service', 'No phone service'):
        return 'No'
    return input_series

In [None]:
# Try the recoding on a single column first and confirm the remaining labels.
online_security_recoded = ChurnData['OnlineSecurity'].map(recode_No)
ChurnData['OnlineSecurity'] = online_security_recoded
ChurnData['OnlineSecurity'].unique()

In the previous code, we defined a function that recodes the values of some columns to 'No'. Let's apply it to all columns that need that transformation:



In [None]:
# Recode every listed column and print the labels that remain in each.
for column_name in cols_to_trans:
    recoded_column = ChurnData[column_name].map(recode_No)
    ChurnData[column_name] = recoded_column
    print(column_name, ' : ', recoded_column.unique())

Now that we cleaned the dataset, let's visualize the data to gain more insights:

# 3. Data Visualization

In [None]:
%matplotlib inline 

from IPython.display import Image
import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn
import seaborn as sns

In [None]:
# Count customers per churn outcome and plot the class balance.
# The counts get their own name (not `y`) because `y` is used for the
# model target further down the notebook.
churn_counts = ChurnData['Churn'].value_counts()
print(churn_counts)

# Pass x/y by keyword: recent seaborn releases no longer accept the data
# vectors as positional arguments.
ax = sns.barplot(x=churn_counts.index, y=churn_counts.values)
ax.set(xlabel='Churn', ylabel='Number of customers', title='Churn class balance');

# Share of customers that churned, in percent.
n_churned = len(ChurnData[ChurnData['Churn'] == 'Yes'])
print('Churn Rate: ', n_churned / len(ChurnData) * 100)

Notice that the data is somewhat imbalanced, i.e. the event (churn) rate is much lower than the non-event rate. We might consider resampling algorithms such as SMOTE / ROSE later.

In [None]:
# Customer counts per (category, churn outcome), one table per feature.
internet_by_churn = ChurnData.groupby(['InternetService', 'Churn']).size().unstack()
contract_by_churn = ChurnData.groupby(['Contract', 'Churn']).size().unstack()

# One stacked bar chart per feature.
internet_by_churn.plot(kind = 'bar', stacked = True)
contract_by_churn.plot(kind = 'bar', stacked = True)

We can use the above visualizations to explore which variables strongly affect the churn rate

# 4. Model Building
In this section, we fit models to our dataset to predict Customer churn. Our goal is to derive:
* rules
* Variable Importance
* Predictions

In [None]:
# Preview the cleaned data before modelling.
ChurnData.head(n=5)

In [None]:
## Load Libraries:

#from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from IPython.display import Image
#from sklearn import neighbors
#from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

### Data Preparation for modelling
* Transform variables into required format (Our data is mostly Categorical)


In [None]:
# Build the modelling dataset from the cleaned data without customerID.
# drop() returns a new DataFrame, so ChurnData itself is left untouched.
ADS = ChurnData.drop(columns=['customerID'])
ADS.head(n=5)

Preprocess the data to handle categorical values:

In [None]:
# Create the label encoder that the next cell uses to turn the categorical
# columns of ADS into integer codes.
label_encoder = preprocessing.LabelEncoder()

ADS.head()

In [None]:
# Label-encode every column except the three listed below, which are kept as-is.
encode_cols = ADS.columns.drop(['tenure','MonthlyCharges', 'TotalCharges'])

# The single shared encoder is refit for each column, which overwrites the
# previous column's category -> code mapping. Record each mapping as we go so
# the integer codes stay interpretable.
encoding_maps = {}

for cols in encode_cols:
    ADS[cols] = label_encoder.fit_transform(ADS[cols])
    # classes_ lists the original labels in code order (position i <-> code i).
    encoding_maps[cols] = {
        label: code for code, label in enumerate(label_encoder.classes_)
    }
    print(cols," : ",ADS[cols].dtype)


In [None]:
# Preview the encoded modelling dataset.
ADS.head(n=5)

### Partition the dataset into Features and Target Variable as well as train and test splits

In [None]:
from sklearn.model_selection import train_test_split

print(ADS.columns)

# Features are every column except the target; the target is Churn.
X = ADS.drop(columns=['Churn'])
y = ADS['Churn']

for whole_set in (X, y):
    print(whole_set.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=100,
)

partitions = [
    ('X_train : ', X_train),
    ('X_test : ', X_test),
    ('y_train : ', y_train),
    ('y_test : ', y_test),
]
for partition_label, partition in partitions:
    print(partition_label, partition.shape)

### We will now begin training models:

* Decision Tree Baseline Model:

In [None]:
# Baseline decision tree (clf = classifier).
# random_state is fixed so the fitted tree, and every metric computed from it,
# is the same on each Restart & Run All; the seed matches the train/test split.
clf = tree.DecisionTreeClassifier(random_state=100)

# Train the classifier: fit(features, labels)
clf = clf.fit(X_train, y_train)

# Predict the response for the held-out test set
y_pred = clf.predict(X_test)


In [None]:
# Display the predictions made by clf.predict(X_test) in the previous cell.
y_pred

Let's evaluate the model:

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision / recall / F1, followed by overall accuracy on the test set.
report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print(report_text)
print(test_accuracy)




Print the Confusion Matrix:


In [None]:
# Confusion matrix of the decision tree on the test set.
DecisionTree_conf_matrix = metrics.confusion_matrix(y_test, y_pred)

# The cells hold integer counts, so annotate them with integer formatting.
ax = sns.heatmap(DecisionTree_conf_matrix, annot=True, fmt='d')
title = 'DecisionTree'

# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; label the axes so the plot stands alone.
ax.set(title=title, xlabel='Predicted label', ylabel='True label');

Visualize the Tree:

In [None]:
print(clf)

# Write the fitted tree to a Graphviz DOT file. With out_file set, the return
# value is None, so it is not kept. Passing the feature names makes each node
# show the column name instead of an X[i] index.
tree.export_graphviz(
    clf,
    out_file='tree.dot',
    feature_names=list(X.columns),
)

# Render the DOT file to PNG with the Graphviz `dot` executable.
# check=True raises CalledProcessError if rendering fails, instead of
# continuing silently and failing later on a missing or stale tree.png.
import subprocess

subprocess.run(
    ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'],
    check=True,
)

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')