In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from scipy import stats,special
import matplotlib.pyplot as plt
#import scipy.io as io
import glob

from datetime import datetime as dt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

import warnings
warnings.simplefilter('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Model and Understand Customer Churn

### Procedure

1. Data cleaning

- deal with missing values and/or outliers
- EDA to understand demographics, e.g. do non-senior customers greatly outnumber senior citizens?
- EDA to understand service types & prevalence
- feature engineering - what ways can the data be simplified? How to make given features more interpretable?

2. Initial question & method: 
- What factors contribute to customer churn?
- Get intuition from EDA
- Estimate feature importance (PCA or initial classification model on whole dataset)
- Start with simple logistic regression

3. Questions raised by EDA:
- By grouping features, such as phone services, internet services, into 'packages', do these become more significant?

4. Improvements
- would be good to add k-fold cross-validation
- try SVM (kernel choice is significant)
- Do PCA results agree with most important features selected by decision tree model or gradient boost model?
 

## Load & Prepare Data

In [None]:
# Read the raw Telco churn export, then summarise its columns and dtypes.
raw_csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df0 = pd.read_csv(raw_csv_path)
df0.info()

In [None]:
#TotalCharges being object instead of float implies existence of missing values
#tc=pd.to_numeric(df0.TotalCharges) #causes ValueError: Unable to parse string " "

In [None]:
# Parse TotalCharges as numbers.
# Blank entries (empty or whitespace-only text) are treated as a charge of 0;
# later cells use `TotalCharges == 0` to find and drop those rows.
# Anything else that is not a number still raises, so bad data is not hidden.
# The result keeps full float64 precision (no lossy float32 cast).
total_charges_text = df0['TotalCharges'].astype(str).str.strip()
df0['TotalCharges'] = pd.to_numeric(total_charges_text.replace('', '0'))

In [None]:
# Collect the columns that are still stored with the generic object dtype
# and display them.
ocols = df0.select_dtypes(include=object)
ocols

In [None]:
# Encode Yes/No columns as 1/0.
# 'No internet service' is folded into 0 here; it could alternatively be kept
# as a separate category.
yes_no_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}

# A column is treated as Yes/No when its first entry is 'Yes' or 'No'.
# .iloc makes that lookup positional, so it does not rely on the index
# containing the label 0.
ynkeys = [k for k in ocols.columns if ocols[k].iloc[0] in ('Yes', 'No')]

# Assign the encoded columns back to df0 instead of calling
# replace(..., inplace=True) on df0[k]: that form mutates a temporary column
# selection and is not guaranteed to update df0 itself.
# The explicit integer cast avoids relying on implicit dtype inference and
# fails loudly if a selected column holds any value outside the mapping.
df0[ynkeys] = df0[ynkeys].replace(yes_no_codes).astype('int64')

In [None]:
# One-hot encode the remaining multi-level categorical columns.
categorical_columns = ['gender', 'MultipleLines', 'InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df0, columns=categorical_columns)
df.head()

In [None]:
# Summary statistics of the encoded frame (quick sanity check of value ranges).
df.describe()

## Churn

In [None]:
# Sum of the 0/1 Churn flags (number of churned customers) next to the count of Churn entries.
churn_column = df['Churn']
(churn_column.sum(), churn_column.count())

In [None]:
# Split customers by outcome using boolean row selection.
# Compared with df.where(...).dropna(how='all'), this keeps exactly the same
# rows but does not blank the other rows to NaN first, so integer/boolean
# columns keep their original dtypes.
churn = df[df.Churn == 1]
nonchurn = df[df.Churn == 0]

### Numeric columns (tenure, monthly charges, total charges)

In [None]:
# Overlaid tenure histograms for churned vs retained customers.
fig = go.Figure()
for frame, label in ((churn, 'churn'), (nonchurn, 'retain')):
    fig.add_trace(go.Histogram(x=frame.tenure, name=label))
fig.update_layout(barmode='overlay', xaxis_title='tenure (months)', title='Tenure')
fig.update_traces(opacity=0.6)

In [None]:
# Overlaid monthly-charge histograms for churned vs retained customers.
fig = go.Figure(data=[
    go.Histogram(x=churn.MonthlyCharges, name='churn'),
    go.Histogram(x=nonchurn.MonthlyCharges, name='retain'),
])
fig.update_layout(barmode='overlay', xaxis_title='monthly charges (USD)', title='Monthly Charges')
fig.update_traces(opacity=0.6)

In [None]:
# Overlaid total-charge histograms for churned vs retained customers.
fig = go.Figure()
for frame, label in ((churn, 'churn'), (nonchurn, 'retain')):
    fig.add_trace(go.Histogram(x=frame.TotalCharges, name=label))
fig.update_layout(barmode='overlay', xaxis_title='total charges (USD)', title='Total Charges')
fig.update_traces(opacity=0.6)

In [None]:
# Total charges on their own are not very informative; dividing by tenure gives
# an estimated monthly charge, which should be comparable to MonthlyCharges
# unless something else is going on.
est_monthly_churn = churn.TotalCharges / churn.tenure
est_monthly_retain = nonchurn.TotalCharges / nonchurn.tenure

fig = go.Figure(data=[
    go.Histogram(x=est_monthly_churn, name='churn'),
    go.Histogram(x=est_monthly_retain, name='retain'),
])
fig.update_layout(barmode='overlay', xaxis_title='Est. Monthly charges (USD)', title='Total Charges/Tenure')
fig.update_traces(opacity=0.6)

In [None]:
# How far the quoted monthly charge sits from the average monthly charge
# implied by TotalCharges / tenure.
implied_monthly_charges = df.TotalCharges.div(df.tenure)
df['excess_monthly_charges'] = df.MonthlyCharges.sub(implied_monthly_charges)

In [None]:
# Rebuild the churn / non-churn subsets so they include the
# excess_monthly_charges column added to df above.
# Boolean row selection keeps the same rows as df.where(...).dropna(how='all')
# without first blanking the other rows to NaN, so column dtypes are preserved.
churn = df[df.Churn == 1]
nonchurn = df[df.Churn == 0]

In [None]:
# Needs churn / nonchurn to have been rebuilt after excess_monthly_charges was added to df.
excess_histograms = [
    go.Histogram(x=churn.excess_monthly_charges, name='churn'),
    go.Histogram(x=nonchurn.excess_monthly_charges, name='retain'),
]
fig = go.Figure(data=excess_histograms)
fig.update_layout(
    barmode='overlay',
    xaxis_title='Excess monthly charges (USD)',
    title='Monthly Charges - Total Charges/Tenure',
)
fig.update_traces(opacity=0.6)

In [None]:
# What is going on with $0 total charges? (free trial?)
# Select those rows with a boolean mask so their columns keep their dtypes
# rather than being upcast by where(...).dropna(how='all').
zerocharge = df[df.TotalCharges == 0]
zerocharge.describe()  # only 11 such rows when this was written, so they can be dropped

**Remarks**

- Low tenure more likely to churn
- higher charges more likely to churn

### Demographics (people)

In [None]:
# Demographic indicator columns compared between churned and retained customers.
demkeys = [
    'gender_Male',
    'gender_Female',
    'SeniorCitizen',
    'Partner',
    'Dependents',
]

In [None]:
# Per-column totals for the churned and the retained group.
churn_sums, nonchurn_sums = [], []
for key in demkeys:
    churn_sums.append(churn[key].sum())
    nonchurn_sums.append(nonchurn[key].sum())

In [None]:
# Side-by-side demographic totals for churned vs retained customers.
fig = go.Figure(data=[
    go.Bar(x=demkeys, y=churn_sums, name='churn'),
    go.Bar(x=demkeys, y=nonchurn_sums, name='retain'),
])
fig.update_layout(title='Customer Demographics')

**Remarks**
- Having a partner or dependents seems to correlate with low churn
- Seniors have high churn but there are also not too many of them



## Product/Services

In [None]:
# Add-on product columns compared between churned and retained customers.
pkeys = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
# Per-product totals for the churned and the retained group.
churn_psums, nonchurn_psums = [], []
for key in pkeys:
    churn_psums.append(churn[key].sum())
    nonchurn_psums.append(nonchurn[key].sum())

In [None]:
# Side-by-side product totals for churned vs retained customers.
fig = go.Figure()
for totals, label in ((churn_psums, 'churn'), (nonchurn_psums, 'retain')):
    fig.add_trace(go.Bar(x=pkeys, y=totals, name=label))
fig.update_layout(title='Products')

**Remarks**
- Streaming has higher than average churn



In [None]:
# Phone / internet service columns compared between churned and retained customers.
skeys = [
    'PhoneService',
    'MultipleLines_No',
    'MultipleLines_No phone service',
    'MultipleLines_Yes',
    'InternetService_DSL',
    'InternetService_Fiber optic',
    'InternetService_No',
]

In [None]:
# Per-service totals for the churned and the retained group.
churn_ssums, nonchurn_ssums = [], []
for key in skeys:
    churn_ssums.append(churn[key].sum())
    nonchurn_ssums.append(nonchurn[key].sum())

In [None]:
# Side-by-side service totals for churned vs retained customers.
fig = go.Figure()
for totals, label in ((churn_ssums, 'churn'), (nonchurn_ssums, 'retain')):
    fig.add_trace(go.Bar(x=skeys, y=totals, name=label))
fig.update_layout(title='Services')

**Remarks**
- Fiber internet seems to be unsatisfactory - high churn rate
- The combinations of services here are interesting. We could add columns describing how many phone/internet services each customer has in total
- Might be redundant for modeling (or could be used to get rid of likely-colinear features) but informative for EDA



## Financial

In [None]:
# Contract-type and payment-method columns compared between churned and retained customers.
fkeys = [
    'Contract_Month-to-month',
    'Contract_One year',
    'Contract_Two year',
    'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
]

In [None]:
# Per-column contract/payment totals for the churned and the retained group.
churn_fsums, nonchurn_fsums = [], []
for key in fkeys:
    churn_fsums.append(churn[key].sum())
    nonchurn_fsums.append(nonchurn[key].sum())

In [None]:
# Side-by-side contract/payment totals for churned vs retained customers.
fig = go.Figure(data=[
    go.Bar(x=fkeys, y=churn_fsums, name='churn'),
    go.Bar(x=fkeys, y=nonchurn_fsums, name='retain'),
])
fig.update_layout(title='Financial')

**Remarks**
- monthly contracts have very high churn rate
- electronic checks result in high churn rate

### Final data preparation for model
- get rid of zeros
- drop redundant columns

In [None]:
# Keep only customers with non-zero total charges, then remove TotalCharges,
# one level from each of the gender and MultipleLines one-hot groups, and the
# customer identifier.
# Boolean row selection keeps the same rows as where(...).dropna(how='all')
# without upcasting integer/boolean columns, and drop() returns a new frame
# instead of mutating df in place.
df = df[df.TotalCharges != 0].drop(
    columns=['TotalCharges', 'gender_Female', 'MultipleLines_No', 'customerID']
)

In [None]:
## Correlation
#from visualization_tools import corr_plot #have to copy to kaggle
import seaborn as sns
def corr_plot(df):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : DataFrame whose ``corr()`` matrix is plotted.

    Returns
    -------
    The matplotlib figure holding the heatmap.
    """
    corr_matrix = df.corr()
    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
    figure, axis = plt.subplots(figsize=(11, 9))
    palette = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(
        corr_matrix,
        mask=upper_triangle,
        cmap=palette,
        vmax=.3,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=axis,
    )
    return figure

In [None]:
# Correlation heatmap of the prepared frame; the returned figure is kept in `cp`.
cp=corr_plot(df)

In [None]:
# Separate the target (Churn) from the predictor columns.
y = df['Churn']
X = df.drop(columns='Churn')

In [None]:
# Check that every predictor column is numeric before modelling
# (inspect the dtype listed for each column).
X.info()

# Modeling Customer Churn

## Understanding key features
 - from EDA, expect tenure, MonthlyCharges, contract type, InternetService_Fiber, and payment method to be important


### Identify most important features via RandomForest

run on the entire data set

In [None]:
# Fit a random forest on the entire dataset; it is used only to rank features
# by importance, not to estimate predictive performance.
# random_state fixes the forest's internal randomness so the importance
# ranking is reproducible from run to run.
clf = RandomForestClassifier(n_jobs=4, n_estimators=5000, max_depth=8, random_state=0)
clf.fit(X, y)

In [None]:
# Pair each feature name with its forest importance, ordered from least to most important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_,
})
fscores = importance_table.sort_values(by='importance', ascending=True)
fscores.tail()

In [None]:
# Horizontal bar chart of the random-forest feature importances.
# Feature names are supplied as the bar categories rather than attached as tick
# labels at hard-coded positions 0..26, so the chart stays correct if the
# number of features changes and the category ordering below takes effect.
fig = go.Figure()
fig.add_trace(go.Bar(x=fscores.importance, y=fscores.feature, orientation='h'))
fig.update_layout(xaxis_title='Feature Importance', title='Feature Importance via RandomForest')
fig.update_layout(yaxis={'categoryorder': 'total ascending'})

**Remarks**

This was all expected given the EDA. 

Start with building a very simple model based on top 8 features

In [None]:
# The eight features used for the simple model.
top_features = [
    'tenure',
    'Contract_Month-to-month',
    'MonthlyCharges',
    'InternetService_Fiber optic',
    'PaymentMethod_Electronic check',
    'Contract_Two year',
    'excess_monthly_charges',
    'InternetService_No',
]
Xsmall = X[top_features]

In [None]:
# Standardise the selected features (a precaution).
# NOTE: the scaler is fitted on every row, i.e. before the data is split into
# train and test sets.
scaler = StandardScaler()
scaler.fit(Xsmall)
Xsmall = scaler.transform(Xsmall)

In [None]:
# Hold out a third of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(Xsmall, y, test_size=0.33, random_state=2)

# Re-standardise using statistics from the training rows only, so no
# information about the held-out rows leaks into the features the model is
# fitted on. Standardisation is an affine map, so applying it on top of any
# earlier whole-dataset scaling gives the same values as scaling the raw
# features with train-only mean and standard deviation.
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

In [None]:
 # L2-regularised logistic regression; class_weight="balanced" asks the
 # estimator to reweight the classes.
 logreg_settings = {'penalty': 'l2', 'C': 1, 'class_weight': "balanced"}
 model = LogisticRegression(**logreg_settings)
 model.fit(X_train, y_train)

In [None]:
# Predicted probabilities for the held-out rows; column 1 is kept as the
# churn score (assumes the 0/1 labels are ordered 0 then 1).
class_probabilities = model.predict_proba(X_test)
probs = class_probabilities[:, 1]
# Hard class predictions for the same rows.
y_pred = model.predict(X_test)

In [None]:
# ROC curve of the fitted model on the held-out rows.
fpr, tpr, thresholds = roc_curve(y_test, probs)

# Reference "no skill" curve: every customer receives the same score of 0.
ns_probs = [0] * len(y_test)
# Its thresholds get their own name so `thresholds` keeps the model's values
# instead of being overwritten.
nfpr, ntpr, ns_thresholds = roc_curve(y_test, ns_probs)

In [None]:
# Plot the model's ROC curve against the constant-score reference curve.
roc_traces = [
    go.Scatter(x=fpr, y=tpr, name='Logistic Regression'),
    go.Scatter(x=nfpr, y=ntpr, name='Majority Class Prediction'),
]
fig = go.Figure(data=roc_traces)
fig.update_layout(xaxis_title='False Positive Rate', yaxis_title='True Positive Rate')

In [None]:
# Accuracy and the confusion matrix are computed from the hard class predictions.
score = accuracy_score(y_test, y_pred)
# ROC AUC measures how well the scores rank customers, so it is computed from
# the predicted churn probabilities; thresholded 0/1 labels would reduce the
# curve to a single operating point and misstate the area.
roc = roc_auc_score(y_test, probs)
cmat = confusion_matrix(y_test, y_pred)
print('Accuracy Score: %.3f' % score)
print('ROC AUC Score: %.3f' % roc)

In [None]:
# Show the confusion matrix of the held-out predictions
# (per sklearn's documented convention: true labels by row, predicted labels by column).
print(cmat)

**Remarks**
- Recent versions of scikit-learn also support L1 (usually preferred when a sparse model is wanted) and ElasticNet penalties for logistic regression
- The other hyperparameter to optimize is C = 1/regularization strength

This model, although not of particularly high accuracy, should already be quite robust due to its reliance on a small number of non-collinear, informative features.