In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from datetime import datetime, timedelta


# For the tree visualization
import pydot
from IPython.display import Image
from six import StringIO

# For the dimensionality reduction
from sklearn.feature_selection import SelectKBest, f_classif

# For the tree models
from sklearn.metrics import log_loss, classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Load the customer-base file into `bbs_cust`; the two effective-date columns are parsed as datetimes.
bbs_cust = pd.read_csv(
    '/kaggle/input/broadband-customers-base-churn-analysis/bbs_cust_base_scfy_20200210.csv',
    parse_dates=['effc_strt_date', 'effc_end_date'],
)

In [None]:
bbs_cust.head()

In [None]:
bbs_cust.shape

In [None]:
# counting missing values per column, to decide which NaN rows to drop for the columns we want to keep
bbs_cust.isnull().sum()

In [None]:
# Discard, in place, every row whose 'ce_expiry' value is missing.
rows_without_expiry = bbs_cust.loc[bbs_cust.ce_expiry.isna()].index
bbs_cust.drop(index=rows_without_expiry, inplace=True)

In [None]:
bbs_cust.shape

In [None]:
bbs_cust.nunique()

In [None]:
# Remove, in place, the columns judged useless for the analysis.
useless_columns = ['bill_cycl', 'serv_type', 'Unnamed: 19', 'serv_code']
bbs_cust.drop(columns=useless_columns, inplace=True)

In [None]:
bbs_cust.head()

In [None]:
# it seems like the dataset is a monthly updated view/snapshot of the customer base over a rolling period of 24 months
bbs_cust.newacct_no.value_counts()

In [None]:
bbs_cust[bbs_cust.newacct_no == '70068143.001.000000062'].index

In [None]:
bbs_cust[bbs_cust.newacct_no == '70068143.003.000072630'].index

In [None]:
bbs_cust[bbs_cust.newacct_no == '70071840.001.000000066'].index

In [None]:
# let's 'slice' the dataset into what appears to be rolling periods (bbs201801, bbs201802, bbs201803 etc.) to understand it
# NOTE(review): 0:20056 are hard-coded index labels (`.loc` slicing is label-based and includes both ends);
# presumably they were read off the account-index lookups in the preceding cells - confirm if the input file changes.
bbs201801 = bbs_cust.loc[0:20056, :]
# Per-column non-null counts for the rows flagged 'Y' in current_mth_churn within this slice.
bbs201801[bbs201801.current_mth_churn == 'Y'].count()

In [None]:
bbs201801.tail()

In [None]:
# Carve out the next four presumed periods by their inclusive index-label ranges.
period_bounds = [(20057, 40098), (40099, 60341), (60342, 80601), (80602, 101017)]
bbs201802, bbs201803, bbs201804, bbs201805 = [
    bbs_cust.loc[first_label:last_label, :] for first_label, last_label in period_bounds
]

In [None]:
bbs201802[bbs201802.current_mth_churn == 'Y'].count()

In [None]:
#let's see if accounts that have churned DURING the month of bbs201801 are still present in the subsequent periods
bbs201801[bbs201801.current_mth_churn == 'Y'].newacct_no

In [None]:
bbs201802[bbs201802.current_mth_churn == 'Y'].newacct_no

In [None]:
bbs201802.head()

In [None]:
bbs201802[bbs201802.newacct_no == '70886221.001.000064589']

In [None]:
bbs201802[bbs201802.newacct_no == '70921663.001.000003193']

In [None]:
bbs201801[bbs201801.newacct_no == '70921663.001.000003193']

In [None]:
bbs_cust.complaint_cnt.unique()

In [None]:
bbs_cust.complaint_cnt.value_counts()

In [None]:
bbs_cust[bbs_cust.complaint_cnt == ' customer/ user pass away'].newacct_no

In [None]:
bbs_cust[bbs_cust.newacct_no == '77808624.001.000027357']

In [None]:
# Remove, in place, every row belonging to the accounts treated as 'obvious anomalies'.
anomalous_accounts = [
    '77808624.001.000027357',
    '73400624.001.000012249',
    '90973314.001.000040979',
]
anomalous_rows = bbs_cust.loc[bbs_cust.newacct_no.isin(anomalous_accounts)].index
bbs_cust.drop(index=anomalous_rows, inplace=True)

In [None]:
# Cast 'complaint_cnt' to integers - complaints are likely an important churn influencer.
bbs_cust['complaint_cnt'] = bbs_cust['complaint_cnt'].astype(int)

In [None]:
bbs_cust.complaint_cnt.value_counts()

In [None]:
# Encode the Y/N flag columns as integers: 'Y' becomes 1, any other value becomes 0.
for flag_column in ('with_phone_service', 'churn', 'current_mth_churn'):
    bbs_cust[flag_column] = bbs_cust[flag_column].eq('Y').astype('int')

In [None]:
bbs_cust[bbs_cust.complaint_cnt == 7]

In [None]:
bbs_cust[bbs_cust.newacct_no == '70101548.001.000091606']

In [None]:
bbs_cust[bbs_cust.newacct_no == '71704258.001.000005944']

In [None]:
bbs_cust.effc_strt_date.min()

In [None]:
bbs_cust.effc_strt_date.max()

In [None]:
bbs201801[bbs201801.effc_strt_date == bbs_cust.effc_strt_date.max()]

In [None]:
bbs_cust.bandwidth.value_counts()

In [None]:
# The notion of 'secured revenue' per MB is not trivial to show, and the
# revenue / bandwidth relationship is not evident either.
mean_rev_by_bandwidth = bbs_cust.groupby('bandwidth')['secured_revenue'].mean()
mean_rev_by_bandwidth.plot.bar(figsize=(10, 7))

In [None]:
bbs_cust.groupby('newacct_no').secured_revenue.mean().plot(figsize = (9,7))

In [None]:
bbs_cust.secured_revenue.max()

In [None]:
bbs_cust[bbs_cust.secured_revenue == bbs_cust.secured_revenue.max()]

In [None]:
bbs_cust[bbs_cust.newacct_no == '94887999.001.000091376']

In [None]:
bbs_cust[bbs_cust.churn == 1].count()

In [None]:
bbs_cust.term_reas_code.unique()

In [None]:
bbs_cust.term_reas_desc.unique()

In [None]:
bbs_cust.term_reas_code.value_counts()

In [None]:
# 'term_reas_desc' merely spells out 'term_reas_code', so the column is redundant and removed in place.
bbs_cust.drop(columns='term_reas_desc', inplace=True)

In [None]:
#creating groups of known issues
#major_term_reas = {'REV':'customer_related', 'CLB':'customer_related', 'NET':'technical_issue', 'UFSS':'service_issue', 'CUCO':'customer_related', 'EXP':'commercial_issue', 'NU':'customer_related', 'OT':'service_issue',
#       'COVL3':'technical_issue', 'COM15':'service_issue', 'COVL2':'technical_issue', 'OTHS':'other', 'BILP':'commercial_issue', 'UCSH':'service_issue', 'LOSF':'service_issue', 'EXI':'commercial_issue',
#       'PLR':'commercial_issue', 'COVL1':'technical_issue', 'COM10':'service_issue', 'UEMS':'service_issue', 'CUSB0':'customer_related', 'MGR':'technical_issue', 'TRM':'other', 'NCAP':'technical_issue',
#       'NWQU':'technical_issue'}
#bbs_cust.loc[:, 'term_reas_code'] = bbs_cust.term_reas_code.replace(major_term_reas)

In [None]:
# Apparently another set of anomalies to get rid of...
bbs_cust[bbs_cust.churn == 0].term_reas_code.value_counts()

In [None]:
# Accounts carrying the 'CUCO' termination code although they are not flagged as churned.
# Both conditions are combined into a single mask: chaining two [] filters applies a
# full-length mask to an already filtered frame, which pandas has to reindex (with a warning).
bbs_cust.loc[(bbs_cust.churn == 0) & (bbs_cust.term_reas_code == 'CUCO'), 'newacct_no'].unique()

In [None]:
bbs_cust[bbs_cust.newacct_no == '94578580.001.000075175']

In [None]:
# Accounts carrying the 'UCSH' termination code although they are not flagged as churned.
# Both conditions are combined into a single mask: chaining two [] filters applies a
# full-length mask to an already filtered frame, which pandas has to reindex (with a warning).
bbs_cust.loc[(bbs_cust.churn == 0) & (bbs_cust.term_reas_code == 'UCSH'), 'newacct_no'].unique()

In [None]:
# Remove, in place, every row of these three accounts.
accounts_to_remove = [
    '95445441.001.000086502',
    '94578580.001.000075175',
    '94578580.001.000075631',
]
rows_to_remove = bbs_cust.loc[bbs_cust.newacct_no.isin(accounts_to_remove)].index
bbs_cust.drop(index=rows_to_remove, inplace=True)

In [None]:
# Remove, in place, every row of these three further accounts.
more_accounts_to_remove = [
    '74467655.001.000076551',
    '94578580.001.000086120',
    '95203683.001.000083647',
]
more_rows_to_remove = bbs_cust.loc[bbs_cust.newacct_no.isin(more_accounts_to_remove)].index
bbs_cust.drop(index=more_rows_to_remove, inplace=True)

In [None]:
# 'line_stat' seems to be somehow directly related to churn (all 'CN' are churners, while not all churners are 'CN')
bbs_cust.groupby('line_stat').count().plot.bar(figsize = (10, 7))

In [None]:
# Per-column non-null counts for rows that are both line_stat == 'CN' and churned.
# A single combined mask avoids filtering an already filtered frame with a full-length
# boolean Series (which pandas would have to reindex, emitting a warning).
bbs_cust[(bbs_cust.line_stat == 'CN') & (bbs_cust.churn == 1)].count()

In [None]:
# Apart from an exception (to suppress), all churners have a 'ce_expiry' < 0, while all non-churned have a 'ce_expiry' > 0...
# Rows that are NOT churned yet have a negative ce_expiry, tallied by distinct row content.
# The two conditions are combined into one mask instead of chaining two [] filters.
bbs_cust[(bbs_cust.churn == 0) & (bbs_cust.ce_expiry < 0)].value_counts().to_frame()

In [None]:
# Rows that ARE churned yet have a non-negative ce_expiry.
# The two conditions are combined into one mask instead of chaining two [] filters.
bbs_cust[(bbs_cust.churn == 1) & (bbs_cust.ce_expiry >= 0)]

In [None]:
# Remove, in place, every row of this account (presumably the ce_expiry/churn exception noted earlier - confirm).
exception_rows = bbs_cust.loc[bbs_cust.newacct_no == '94868429.003.000090111'].index
bbs_cust.drop(index=exception_rows, inplace=True)

In [None]:
# feature engineering
# we need the following columns to work on: 
# - 'newacct_no' as index
# - 'with_phone_service'
# - 'churn'
# - for each account: max tenure, max complaint_cnt, average secured_revenue [max secured_revenue over tenure, min secured_revenue over tenure?], last bandwidth (re-encoded), first bandwidth (re-encoded)

In [None]:
# Collapse the raw bandwidth plans into three coarse tiers.
plans_by_tier = {
    'low_bandwidth': ['30M', '10M', 'BELOW 10M', '50M'],
    'medium_bandwidth': ['100M', '100M (FTTO)'],
    'high_bandwidth': ['300M (FTTO)', '1000M (FTTO)', '500M (FTTO)'],
}
# Invert to the plan -> tier lookup that `replace` expects.
bandwidth_groups = {plan: tier
                    for tier, plans in plans_by_tier.items()
                    for plan in plans}
bbs_cust.loc[:, 'bandwidth'] = bbs_cust.bandwidth.replace(bandwidth_groups)

In [None]:
bbs_cust.head()

In [None]:
# Mean secured revenue per account within each bandwidth tier: one row per account,
# one column per tier, 0 where there is no mean for that account/tier pair.
# Columns are renamed through an explicit tier -> name mapping rather than by position,
# so a change in the set or order of tiers cannot silently mislabel a column.
rev_dist = (
    bbs_cust.groupby(['newacct_no', 'bandwidth'])['secured_revenue']
    .mean()
    .unstack()
    .fillna(0)
    .rename(columns={'high_bandwidth': 'average_rev_HB',
                     'low_bandwidth': 'average_rev_LB',
                     'medium_bandwidth': 'average_rev_MB'})
    .rename_axis(columns=None)  # drop the leftover 'bandwidth' label on the column axis
)
rev_dist.head()

In [None]:
rev_dist.shape

In [None]:
# Largest 'tenure' value observed for each account, as a one-column frame.
ten = bbs_cust.groupby('newacct_no')['tenure'].max().rename('max_tenure').to_frame()
ten.head()

In [None]:
# Largest 'complaint_cnt' value observed for each account, as a one-column frame.
comp = bbs_cust.groupby('newacct_no')['complaint_cnt'].max().rename('nbr_complaints').to_frame()
comp.head()

In [None]:
#rev = bbs_cust.groupby('newacct_no').secured_revenue.mean().to_frame('mean_rev')
#rev.head()

In [None]:
#bbs_cust.groupby('newacct_no').secured_revenue.min().to_frame('min_rev')

In [None]:
# Number of rows each account has in each bandwidth tier: one row per account,
# one column per tier, 0 where the account never appears with that tier.
# Columns are renamed through an explicit tier -> name mapping rather than by position,
# so the labels do not depend on the column order produced by `unstack`.
band = (
    bbs_cust.groupby('newacct_no')['bandwidth']
    .value_counts()
    .unstack()
    .fillna(0)
    .rename(columns={'high_bandwidth': 'tenure_HB',
                     'low_bandwidth': 'tenure_LB',
                     'medium_bandwidth': 'tenure_MB'})
    .rename_axis(columns=None)  # drop the leftover 'bandwidth' label on the column axis
)
band.head()

In [None]:
#bandw = {'high_bandwidth':'period_%_high_bandwidth', 'low_bandwidth':'period_%_low_bandwidth', 'medium_bandwidth':'period_%_medium_bandwidth'}
#band['term_reas_code'] = bbs_cust.term_reas_code.replace(major_term_reas)

In [None]:
# Target label: the last 'churn' value recorded for each account, as a one-column frame.
churn = bbs_cust.groupby('newacct_no')['churn'].last().to_frame(name='churn')
churn.head()

In [None]:
# The last 'with_phone_service' value recorded for each account, as a one-column frame.
phone = bbs_cust.groupby('newacct_no')['with_phone_service'].last().to_frame(name='with_phone_service')
phone.head()

In [None]:
# The last 'contract_month' and 'ce_expiry' values recorded for each account.
by_account = bbs_cust.groupby('newacct_no')
contract = by_account['contract_month'].last().to_frame(name='contract_month')
expiry = by_account['ce_expiry'].last().to_frame(name='ce_expiry')

In [None]:
# Assemble the per-account feature table (plus the 'churn' label) by aligning
# the individual frames side by side on their shared account index.
feature_frames = [comp, rev_dist, band, contract, phone, churn]
data = pd.concat(feature_frames, axis='columns')
data.head()

In [None]:
data.shape

In [None]:
# Keep only the first occurrence of each distinct combination of column values.
# NOTE(review): `data` is indexed by account and drop_duplicates compares column values only,
# so distinct accounts that share the same features and label are collapsed into one row -
# confirm this is intended, since it changes the class balance the models see.
data = data.drop_duplicates()
data.shape

In [None]:
sns.heatmap(data.corr().round(2), annot = True, cmap = "plasma")

In [None]:
# Separate the feature matrix from the churn label.
X = data.drop(columns='churn')
y = data['churn']

# 75/25 shuffled split, stratified on the label so both parts keep the same churn ratio.
# A fixed random_state makes the partition (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
y_train.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Logistic Regression with sklearn
#from sklearn.linear_model import LogisticRegression
#lr = LogisticRegression()
#lr.fit(X_train,y_train)
#lr_prediction = lr.predict(X_test)
#print("test accuracy {}".format(lr.score(x_test,y_test)))

In [None]:
# Decision Tree: baseline with default hyper-parameters, fitted on the training split.
# random_state is fixed so the fitted tree (and the importances below) is the same on every run.
dt_model_1 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
# Horizontal bar chart of the fitted tree's feature importances, sorted ascending.
pd.Series(dt_model_1.feature_importances_, index=X_train.columns).sort_values()\
    .plot.barh(figsize=(4, 10), rot=0, title='Feature importances')


In [None]:
dt_model_1.get_n_leaves()

In [None]:
# Per-class predicted probabilities on the training set, one column per class label.
# `predict` returns a single 1-D array of labels, which cannot be framed under the
# `classes_` header (one entry per class) and raises a shape-mismatch ValueError;
# `predict_proba` returns one column per class, matching that header.
y_train_pred = pd.DataFrame(dt_model_1.predict_proba(X_train),
                            columns=dt_model_1.classes_,
                            index=X_train.index)  # keep the account index aligned with X_train
y_train_pred

In [None]:
# Hard class predictions of the baseline tree on the training set.
y_train_predict = dt_model_1.predict(X_train)
y_train_predict

In [None]:
# Confusion matrix of the baseline tree on the training set, labelled with the model's classes.
cm = confusion_matrix(y_train, y_train_predict)
pd.DataFrame(cm, index=dt_model_1.classes_, columns=dt_model_1.classes_)

In [None]:
print(classification_report(y_true=y_train,
                            y_pred=y_train_predict))

In [None]:
# Predict on the held-out split and report the baseline tree's score on it.
y_test_predict = dt_model_1.predict(X_test)
test_score = dt_model_1.score(X_test, y_test)
print("decison tree score : ", test_score)

In [None]:
# Confusion matrix of the baseline tree on the held-out split, labelled with the model's classes.
cm = confusion_matrix(y_test, y_test_predict)
pd.DataFrame(cm, index=dt_model_1.classes_, columns=dt_model_1.classes_)

In [None]:
print(classification_report(y_true=y_test,
                            y_pred=y_test_predict))

In [None]:
# Regularised tree used as the base estimator for the grid search (not fitted here).
# random_state is fixed so candidates differ only by their hyper-parameters,
# not by random tie-breaking between equally good splits.
dt_model_2 = DecisionTreeClassifier(min_samples_leaf=40,
                                    min_weight_fraction_leaf=0.01,
                                    random_state=42)

In [None]:
# Ten stratified 70/30 shuffle splits for cross-validation.
# random_state is fixed so the same folds are drawn on every run, which makes
# the cross-validated scores reproducible.
my_cv = StratifiedShuffleSplit(n_splits=10, train_size=0.7, test_size=0.3, random_state=42)

In [None]:
#-cross_val_score(dt_model_2, X_train, y_train, cv=my_cv, scoring='neg_log_loss')

In [None]:
# Hyper-parameter grid: 3 x 3 x 2 x 3 = 54 candidate combinations.
my_param_grid = dict(
    min_samples_leaf=[20, 40, 60],
    min_weight_fraction_leaf=[0.01, 0.02, 0.05],
    criterion=['gini', 'entropy'],
    min_impurity_decrease=[1e-5, 1e-6, 1e-7],
)

In [None]:
# Grid search over `my_param_grid` using the `my_cv` splitter, scored by negative log-loss.
dt_model_gs = GridSearchCV(
    dt_model_2,
    my_param_grid,
    scoring='neg_log_loss',
    cv=my_cv,
)

In [None]:
dt_model_gs.fit(X_train, y_train)

In [None]:
dt_model_3 = dt_model_gs.best_estimator_

In [None]:
# Classification report of the grid search's best tree on the held-out split.
tuned_test_predictions = dt_model_3.predict(X_test)
print(classification_report(y_test, tuned_test_predictions))

In [None]:
data.info()

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed; trained and scored on the same split as the decision trees.
# The split variables are named X_train / X_test (capital X); the lowercase
# x_train / x_test are never defined in this notebook and raised a NameError.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)
rf_prediction = rf.predict(X_test)
print("Random forest algor. result: ", rf.score(X_test, y_test))

In [None]:
# KNN model
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier, trained and applied on the same split as the other models.
# The split variables are named X_train / X_test (capital X); the lowercase
# x_train / x_test are never defined in this notebook and raised a NameError.
knn = KNeighborsClassifier(n_neighbors=3) #n_neighbors = k
knn.fit(X_train, y_train)
knn_prediction = knn.predict(X_test)