In [None]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import chi2_contingency
from scipy.stats import chisquare
import scipy.stats as ss

from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Telco customer-churn CSV, then show its column labels and dtypes.
df = pd.read_csv(
    "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
)
print(df.columns)
print(df.dtypes)

Number of missing values or spaces

In [None]:
# For every column, print its name followed by the index labels of the rows
# whose value is exactly a single space.
for column in df:
    is_blank = df[column] == ' '
    print(column)
    print(df.loc[is_blank].index)

Remove rows with spaces

In [None]:
# Rows whose TotalCharges is a single space: show their tenure, then drop them.
# These customers seem to have registered without starting a service that could
# be evaluated, so they are excluded from the analysis.
blank_total_charges = df["TotalCharges"] == " "
print("Tenure of customers with zero total charges: ", df.loc[blank_total_charges, "tenure"])
df = df[~blank_total_charges]

- EDA for most recognisable fields
  - Gender 
  - Senior citizen
  - Partner
  - Dependents
  - Tenure
  - Phone service
  - Multiple Lines
  - Internet service
  - Online security
  - Online backup
  - Device protection
  - Tech Support
  - Streaming TV
  - Streaming movies
  - Contract
  - Paperless billing
  - Payment method
  

In [None]:
# Report how many distinct values each column holds.
# DataFrame.nunique() computes all counts in one pass; the format string now
# separates the pieces so the output reads "For gender: 2 unique values".
for column, n_unique in df.nunique().items():
    print(f"For {column}: {n_unique} unique values")

- EDA for features with 5 or fewer unique values


In [None]:
# Names of all columns (other than the Churn target) with at most five
# distinct values, in their original column order.
columns_5 = [
    name
    for name, n_unique in df.nunique().items()
    if n_unique <= 5 and name != "Churn"
]
        

In [None]:
# Bar chart of how many customers fall into each Churn class.
Bar_Pred = sns.countplot(data=df, x='Churn')
plt.show()

Overall lower frequency of churn in the dataset

In [None]:
# One count plot per low-cardinality feature, split by Churn.
# The grid is sized from the number of features instead of a hard-coded 4x4,
# so a different feature count neither overruns the axes array nor leaves
# empty panels on screen. With 16 features this is the same 4x4, 30x30 figure.
n_cols = 4
n_rows = max(1, -(-len(columns_5) // n_cols))  # ceiling division
fig, axs = plt.subplots(
    n_rows,
    n_cols,
    sharex=False,
    sharey=True,
    figsize=(30, 7.5 * n_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
axes = axs.ravel()

for ax, column in zip(axes, columns_5):
    sns.countplot(x=column, hue='Churn', data=df, ax=ax)
    ax.set_title("Frequency distribution for: " + str(column))

# Hide any panels left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(columns_5):]:
    unused_ax.set_visible(False)

plt.show()

### Although the frequency of churn is generally lower, let's look at some significant results
```
1. Non-senior citizens are proportionally less likely to churn than senior citizens
2. The presence of a phone service and of tech support plays an important role in preventing customer churn
3. Customers without paperless billing are less likely to churn than those with paperless billing
4. Customers with DSL service are proportionally less likely to churn than those with fiber optic

```



### Let's look at how these categorical features are correlated

In [None]:
# Restrict the frame to the low-cardinality categorical features.
df_corr = df.loc[:, columns_5]

# Cramér's V for categorical correlations
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association of two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Equal-length sequences of category labels.

    Returns
    -------
    float
        Association strength, 0 meaning no association. Returns 0.0 when the
        statistic is undefined: either variable has fewer than two observed
        categories, the input is empty, or the sample is so small that the
        corrected denominator is not positive.

    Notes
    -----
    The chi-square statistic comes from ``scipy.stats.chi2_contingency`` with
    its default settings, which apply Yates' continuity correction to 2x2
    tables, so even a perfect 2x2 association scores slightly below 1.
    """
    contingency = pd.crosstab(np.asarray(x), np.asarray(y))
    r, k = contingency.shape

    # A table with a single row or column carries no association information,
    # and would otherwise produce a 0/0 division below.
    if min(r, k) < 2:
        return 0.0

    chi2 = ss.chi2_contingency(contingency)[0]
    n = contingency.to_numpy().sum()
    phi2 = chi2 / n

    # Bias correction of phi^2 and of the effective table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Happens for very small n, e.g. a 2x2 table built from two rows.
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between all selected categorical columns.
# The frame is created with a float dtype up front so it is numeric when it
# reaches the heatmap. The previous fillna(np.nan, inplace=True) call relied
# on pandas silently downcasting an object-dtype frame, which recent pandas
# versions deprecate.
columns = df_corr.columns
cramersv = pd.DataFrame(index=columns, columns=columns, dtype=float)

for row_name in columns:
    for col_name in columns:
        cramersv.loc[row_name, col_name] = cramers_v(df_corr[row_name], df_corr[col_name])


In [None]:
# Heat map of the pairwise Cramér's V matrix on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cramersv, cmap="YlGnBu", ax=ax)
plt.show()

- Apart from the expected correlation between partners and dependants, there is a strong correlation between the phone service and presence of multiple lines (or multiple phone numbers for the same customer). My guess is that a good service prompts customers to order multiple lines
- Also, users of online security tend to subscribe to the other internet-based features
- Let's look at phone service a bit more closely

In [None]:

# Among customers with MultipleLines == "Yes", what percentage also have
# PhoneService == "Yes"?
# NOTE(review): if MultipleLines can only be "Yes" for customers who have a
# phone service, this ratio is 100% by construction and says nothing about
# service quality — confirm against the column's value set.
df_PS = df.loc[:, ["PhoneService", "MultipleLines"]]
has_multiple_lines = df_PS["MultipleLines"].eq("Yes")
Total_yes_multiple_lines = int(has_multiple_lines.sum())

# Check for relation between phone service and multiple lines
df_CS_corr = df_PS[df_PS["PhoneService"].eq("Yes") & has_multiple_lines]

share_with_phone_service = len(df_CS_corr) * 100 / Total_yes_multiple_lines
print(f"Percentage of users with multiple lines having good phone service {share_with_phone_service} %")


- This confirms the fact that good service leads to multiple subscriptions

### Let's look at the relation between tenure and churn for customers who have terminated their service

In [None]:
# Keep only the customers whose Churn value is "Yes".
df_churn = df.loc[df["Churn"].eq("Yes")]

In [None]:
# Distribution of tenure among churned customers.
# A title makes the figure self-explanatory, and plt.show() matches the other
# plotting cells so the bare Axes repr is not echoed under the chart.
ax = sns.histplot(df_churn["tenure"])
ax.set_title("Tenure of churned customers")
plt.show()


- Highest frequency of churn in the first 10 months. Customers make up their mind within the first 10 months to terminate or continue their service

In [None]:
# Distribution of monthly charges among churned customers.
# A title makes the figure self-explanatory, and plt.show() matches the other
# plotting cells so the bare Axes repr is not echoed under the chart.
ax = sns.histplot(df_churn["MonthlyCharges"])
ax.set_title("Monthly charges of churned customers")
plt.show()

- Monthly charges between 60 and 110 have a high frequency count, but no significant meaning can be inferred from this alone

### Let's look at the prediction by evaluating three models
```
1. Light gbm
2. Random forest
3. Logistic regression
```

In [None]:
# Convert TotalCharges to float. .assign returns a new frame, so nothing is
# written into the filtered frame produced by the earlier row removal (which
# is what triggers pandas' SettingWithCopyWarning).
df = df.assign(TotalCharges=df["TotalCharges"].astype(float))

# Target: kept one-dimensional. scikit-learn estimators and metrics expect a
# 1-D label array; an (n, 1) column vector is only accepted with a
# DataConversionWarning, which this notebook silences.
y = df["Churn"].to_numpy()

# Rather than label encoding, convert the categorical values to dummy
# variables, which lets individual feature levels be evaluated directly.
# customerID is an identifier, not a feature, so it is dropped with the target.
df = pd.get_dummies(df.drop(columns=["Churn", "customerID"]))
print(df.head(10))

X = df




In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

- Random Forest

In [None]:
# Tune a random forest with a randomised hyper-parameter search, then score
# the best model on the held-out test set.
# Seeding the estimator makes the search scores and the final model repeatable.
clf = RandomForestClassifier(random_state=42)

# range(10, 20, 10) yields the single value 10, so the grid holds only
# 3 * 1 * 1 * 2 = 6 combinations; scikit-learn caps the search at that number
# even though n_iter asks for 10.
param_test = {
    'n_estimators': range(200, 500, 100),
    'min_samples_leaf': range(10, 20, 10),
    'min_samples_split': range(10, 20, 10),
    'max_depth': range(40, 60, 10),
}

model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    scoring='accuracy',
    random_state=314,
    verbose=True,
    n_iter=10,
)
# np.ravel accepts both a 1-D target and an (n, 1) column vector.
result = model.fit(X_train, np.ravel(y_train))

print("Best Score " + str(result.best_score_) + " with parameters " + str(result.best_params_))

# With the default refit=True the search has already retrained the best
# configuration on all of X_train, so a second manual fit is unnecessary.
model1 = result.best_estimator_

pred = model1.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print('Accuracy:', accuracy_score(np.ravel(y_test), pred))

- Light GBM

In [None]:
# Tune a LightGBM classifier with a randomised hyper-parameter search, then
# score the best model on the held-out test set.
clf = LGBMClassifier()

param_test = {
    'n_estimators': [400, 700, 1000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15, 20, 25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20],
}

model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_test,
    scoring='accuracy',
    random_state=314,
    verbose=True,
    n_iter=10,
)
# np.ravel accepts both a 1-D target and an (n, 1) column vector.
result = model.fit(X_train, np.ravel(y_train))

print("Best Score " + str(result.best_score_) + " with parameters " + str(result.best_params_))

# Use the estimator the search already refit on all of X_train (refit=True is
# the default). The previous hand-built model passed `colsample_bytre`, a
# misspelling, so the tuned colsample_bytree value was never applied.
model2 = result.best_estimator_

pred = model2.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print('Accuracy:', accuracy_score(np.ravel(y_test), pred))

- Logistic regression

In [None]:
# Logistic-regression baseline on the same split.
# The features are unscaled, so the solver is given more than the default
# 100 iterations; convergence warnings are silenced globally in this notebook
# and a non-converged fit would otherwise go unnoticed.
model3 = LogisticRegression(max_iter=1000)
# np.ravel accepts both a 1-D target and an (n, 1) column vector.
model3.fit(X_train, np.ravel(y_train))

pred = model3.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print('Accuracy:', accuracy_score(np.ravel(y_test), pred))

- With similar performances, let's find out the important features contributing to the model performance (Random forest classifier)

In [None]:
# Horizontal bar chart of the six largest random-forest feature importances.
plt.figure(figsize=(20, 10))
feat_importances = pd.Series(data=model1.feature_importances_, index=X_train.columns)
top_six = feat_importances.nlargest(6)
top_six.plot.barh()

- Tenure, Contract_Month-to-month, Monthly Charges, Total Charges and TechSupport_No seem to contribute the most to the churn (Top 5)
- From the distribution plots, it can be inferred that customers make up their mind to terminate within the first 10 months
- Also, customers with a two year contract are less likely to churn than a month to month contract
- Additionally, customer support is a good measure to prevent the company from losing customers. Also, as inferred above, customer support enables multiple subscriptions

### As always would love to receive feedback on any improvement or mistake