In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
churn = pd.read_csv("telecom_churn_data.csv")

In [None]:
# Preview the first rows of the frame.
churn.head()

In [None]:
# Full per-column listing of dtypes and non-null counts.
churn.info(verbose=1)

In [None]:
# Summary statistics for every column regardless of dtype.
churn.describe(include='all')

In [None]:
#segregate columns
# Identifier columns.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept so
# that any other cell referring to it keeps working.
id=['mobile_number', 'circle_id']

# Date columns (months 6-9).
date=['last_date_of_month_6', 'last_date_of_month_7','last_date_of_month_8','last_date_of_month_9','date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9','date_of_last_rech_data_6','date_of_last_rech_data_7','date_of_last_rech_data_8','date_of_last_rech_data_9']

# Categorical flag columns (months 6-9).
cat=['night_pck_user_6','night_pck_user_7','night_pck_user_8','night_pck_user_9','fb_user_6','fb_user_7','fb_user_8','fb_user_9']

# Every remaining column is treated as numeric.
# Fixed: the original iterated over `df.columns`, but no `df` exists in this
# notebook -- the loaded frame is `churn`.
non_numeric = set(id + date + cat)
num=[column for column in churn.columns if column not in non_numeric]

In [None]:
#checking missing values in each column
churn.isnull().sum()*100/churn.shape[0]

In [None]:
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

In [None]:
df.isnull().sum()*100/df.shape[0]

In [None]:
#data imputation

In [None]:
recharge=['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9', 'count_rech_2g_6', 'count_rech_2g_7', 'count_rech_2g_8', 'count_rech_2g_9', 'count_rech_3g_6', 'count_rech_3g_7', 'count_rech_3g_8', 'count_rech_3g_9','max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9','av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9']
churn[recharge].describe(include='all')

In [None]:
# recharge date and the recharge value are missing,i.e,customer didn't recharge
churn.loc[df.total_rech_data_6.isnull() & churn.date_of_last_rech_data_6.isnull(), ["total_rech_data_6", "date_of_last_rech_data_6"]].head(20)

In [None]:
# recharge columns for zero impuitation
zero=['total_rech_data_6', 'total_rech_data_7', 'total_rech_data_8', 'total_rech_data_9','av_rech_amt_data_6', 'av_rech_amt_data_7', 'av_rech_amt_data_8', 'av_rech_amt_data_9', 'max_rech_data_6', 'max_rech_data_7', 'max_rech_data_8', 'max_rech_data_9']

In [None]:
churn[zero] = churn[zero].apply(lambda x: x.fillna(0))

In [None]:
print(churn[zero].isnull().sum()*100/churn.shape[1])
print(churn[zero].describe(include='all'))


In [None]:
#NaN replacement for catagories
churn[cat] = churn[cat].apply(lambda x: x.fillna(-1))

In [None]:
print(churn[cat].isnull().sum()*100/churn.shape[0])

In [None]:
#dropping variables with missing values>threshold(0.7)
initial_cols = churn.shape[1]
thold = 0.7

include_cols = list(churn.apply(lambda column: True if column.isnull().sum()/churn.shape[0] < thold else False))

drop_missing = pd.DataFrame({'features':churn.columns , 'include': include_cols})
drop_missing.loc[drop_missing.include == True,:]

In [None]:
churn = churn.loc[:, include_cols]

dropped_cols = churn.shape[1] - initial_cols
print("{0} columns dropped.".format(dropped_cols))

In [None]:
#total data recharge amount(june-july)
churn['total_data_rech_6'] = churn.total_rech_data_6 * churn.av_rech_amt_data_6
churn['total_data_rech_7'] = churn.total_rech_data_7 * churn.av_rech_amt_data_7

In [None]:
#total recharge(june-july)
churn['amt_data_6'] = churn.total_rech_amt_6 + churn.total_data_rech_6
churn['amt_data_7'] = churn.total_rech_amt_7 + churn.total_data_rech_7

In [None]:
#average recharge for june july
churn['av_amt_data_6_7'] = (churn.amt_data_6 + churn.amt_data_7)/2

In [None]:
#70th percentile recharge
print(format(churn.av_amt_data_6_7.quantile(0.7)))

In [None]:
churn_filtered = churn.loc[df.av_amt_data_6_7 >= churn.av_amt_data_6_7.quantile(0.7), :]
churn_filtered = churn_filtered.reset_index(drop=True)
churn_filtered.shape

In [None]:
#total incoming and outgoing minutes of usage
churn_filtered['total_calls_mou_9'] = churn_filtered.total_ic_mou_9 + churn_filtered.total_og_mou_9

In [None]:
#2g and 3g data consumption
churn_filtered['total_internet_mb_9'] =  churn_filtered.vol_2g_mb_9 + churn_filtered.vol_3g_mb_9

In [None]:
#churn variable based on september
#0 - not churn, 1 - churn
churn_filtered['churn'] = churn_filtered.apply(lambda row: 1 if (row.total_calls_mou_9 == 0 and row.total_internet_mb_9 == 0) else 0, axis=1)

In [None]:
churn_filtered = churn_filtered.drop(['total_calls_mou_9', 'total_internet_mb_9'], axis=1)

In [None]:
churn_filtered.churn = churn_filtered.churn.astype("category")
print("Churn rate:")
print(churn_filtered.churn.value_counts()*100/churn_filtered.shape[0])

In [None]:
#variable derivation
#difference between 8th and previous months
churn_filtered['arpu_diff'] = churn_filtered.arpu_8 - ((churn_filtered.arpu_6 + churn_filtered.arpu_7)/2)
churn_filtered['onnet_mou_diff'] = churn_filtered.onnet_mou_8 - ((churn_filtered.onnet_mou_6 + churn_filtered.onnet_mou_7)/2)
churn_filtered['offnet_mou_diff'] = churn_filtered.offnet_mou_8 - ((churn_filtered.offnet_mou_6 + churn_filtered.offnet_mou_7)/2)
churn_filtered['roam_ic_mou_diff'] = churn_filtered.roam_ic_mou_8 - ((churn_filtered.roam_ic_mou_6 + churn_filtered.roam_ic_mou_7)/2)
churn_filtered['roam_og_mou_diff'] = churn_filtered.roam_og_mou_8 - ((churn_filtered.roam_og_mou_6 + churn_filtered.roam_og_mou_7)/2)
churn_filtered['loc_og_mou_diff'] = churn_filtered.loc_og_mou_8 - ((churn_filtered.loc_og_mou_6 + churn_filtered.loc_og_mou_7)/2)
churn_filtered['std_og_mou_diff'] = churn_filtered.std_og_mou_8 - ((churn_filtered.std_og_mou_6 + churn_filtered.std_og_mou_7)/2)
churn_filtered['isd_og_mou_diff'] = churn_filtered.isd_og_mou_8 - ((churn_filtered.isd_og_mou_6 + churn_filtered.isd_og_mou_7)/2)
churn_filtered['spl_og_mou_diff'] = churn_filtered.spl_og_mou_8 - ((churn_filtered.spl_og_mou_6 + churn_filtered.spl_og_mou_7)/2)
churn_filtered['total_og_mou_diff'] = churn_filtered.total_og_mou_8 - ((churn_filtered.total_og_mou_6 + churn_filtered.total_og_mou_7)/2)
churn_filtered['loc_ic_mou_diff'] = churn_filtered.loc_ic_mou_8 - ((churn_filtered.loc_ic_mou_6 + churn_filtered.loc_ic_mou_7)/2)
churn_filtered['std_ic_mou_diff'] = churn_filtered.std_ic_mou_8 - ((churn_filtered.std_ic_mou_6 + churn_filtered.std_ic_mou_7)/2)
churn_filtered['isd_ic_mou_diff'] = churn_filtered.isd_ic_mou_8 - ((churn_filtered.isd_ic_mou_6 + churn_filtered.isd_ic_mou_7)/2)
churn_filtered['spl_ic_mou_diff'] = churn_filtered.spl_ic_mou_8 - ((churn_filtered.spl_ic_mou_6 + churn_filtered.spl_ic_mou_7)/2)
churn_filtered['total_ic_mou_diff'] = churn_filtered.total_ic_mou_8 - ((churn_filtered.total_ic_mou_6 + churn_filtered.total_ic_mou_7)/2)
churn_filtered['total_rech_num_diff'] = churn_filtered.total_rech_num_8 - ((churn_filtered.total_rech_num_6 + churn_filtered.total_rech_num_7)/2)
churn_filtered['total_rech_amt_diff'] = churn_filtered.total_rech_amt_8 - ((churn_filtered.total_rech_amt_6 + churn_filtered.total_rech_amt_7)/2)
churn_filtered['max_rech_amt_diff'] = churn_filtered.max_rech_amt_8 - ((churn_filtered.max_rech_amt_6 + churn_filtered.max_rech_amt_7)/2)
churn_filtered['total_rech_data_diff'] = churn_filtered.total_rech_data_8 - ((churn_filtered.total_rech_data_6 + churn_filtered.total_rech_data_7)/2)
churn_filtered['max_rech_data_diff'] = churn_filtered.max_rech_data_8 - ((churn_filtered.max_rech_data_6 + churn_filtered.max_rech_data_7)/2)
churn_filtered['av_rech_amt_data_diff'] = churn_filtered.av_rech_amt_data_8 - ((churn_filtered.av_rech_amt_data_6 + churn_filtered.av_rech_amt_data_7)/2)
churn_filtered['vol_2g_mb_diff'] = churn_filtered.vol_2g_mb_8 - ((churn_filtered.vol_2g_mb_6 + churn_filtered.vol_2g_mb_7)/2)
churn_filtered['vol_3g_mb_diff'] = churn_filtered.vol_3g_mb_8 - ((churn_filtered.vol_3g_mb_6 + churn_filtered.vol_3g_mb_7)/2)

In [None]:
churn_filtered['total_og_mou_diff'].describe()

In [None]:
#9th month columns not required
churn_filtered = churn_filtered.filter(regex='[^9]$', axis=1)
churn_filtered.shape

In [None]:
# removing names ending with 9
col_9_names=churn.filter(regex='9$', axis=1).columns
cat=[col for col in cat if col not in col_9_names]
cat.append('churn')
num=[col for col in churn_filtered.columns if col not in cat]

In [None]:
#data visualisation
# Cast every column listed in `cat` to pandas' category dtype.
churn_filtered[cat] = churn_filtered[cat].astype("category")

In [None]:
#plotting functions
def data_type(variable):
    """Classify a pandas Series by its dtype.

    Returns 'categorical' for category dtype, 'numerical' for any numeric
    (non-boolean) dtype, and None for everything else.
    """
    dtype = variable.dtype
    if isinstance(dtype, pd.CategoricalDtype):
        return 'categorical'
    # Accept every integer/float width, not only int64/float64.
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
        return 'numerical'
    return None

def univariate(variable, stats=True):
    """Plot the distribution of a single Series.

    Numerical variables get a distribution plot, categorical variables a
    count plot. When `stats` is truthy, summary statistics (numerical) or
    level counts (categorical) are printed as well.
    """
    kind = data_type(variable)

    if kind == 'numerical':
        # NOTE(review): seaborn marks distplot as deprecated; kept so the plot
        # looks the same as before.
        sns.distplot(variable)
        if stats:
            print(variable.describe())

    elif kind == 'categorical':
        # x must be passed by keyword: seaborn >= 0.12 rejects positional data.
        sns.countplot(x=variable)
        if stats:
            print(variable.value_counts())

    else:
        print("Invalid variable passed: either pass a numeric variable or a categorical vairable.")

def bivariate(var1, var2):
    """Plot the relationship between two Series.

    numerical vs numerical       -> regression scatter plot (var1 on x, var2 on y)
    categorical vs numerical     -> box plot (var1 on x, var2 on y)
    Any other combination prints a message instead of silently doing nothing.
    """
    kind1, kind2 = data_type(var1), data_type(var2)
    if kind1 == 'numerical' and kind2 == 'numerical':
        sns.regplot(x=var1, y=var2)
    elif {kind1, kind2} == {'categorical', 'numerical'}:
        sns.boxplot(x=var1, y=var2)
    else:
        print("Invalid variables passed: supported pairs are numeric/numeric and numeric/categorical.")

In [None]:
univariate(churn.arpu_6)

In [None]:
bivariate(churn_filtered.churn, churn_filtered.aon)

In [None]:
bivariate(churn_filtered.sep_vbc_3g, churn_filtered.churn)

In [None]:
bivariate(churn_filtered.spl_og_mou_8, churn_filtered.churn)

In [None]:
pd.crosstab(churn_filtered.churn, churn_filtered.night_pck_user_8, normalize='columns')*100

In [None]:
pd.crosstab(churn_filtered.churn, churn_filtered.sachet_3g_8)

In [None]:
#modelling
# import required libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from imblearn.metrics import sensitivity_specificity_support

In [None]:
#change churn to numeric
churn_filtered['churn'] = pd.to_numeric(churn_filtered['churn'])

In [None]:
#train test split
X = churn_filtered.drop("churn", axis = 1)
y = churn_filtered.churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 4, stratify = y)

In [None]:
X_train.shape
y_train.shape
X_test.shape
y_test.shape

In [None]:
#aggregation of categorical variables
train = pd.concat([X_train, y_train], axis=1)

train.groupby('night_pck_user_6').churn.mean()
train.groupby('night_pck_user_7').churn.mean()
train.groupby('night_pck_user_8').churn.mean()
train.groupby('fb_user_6').churn.mean()
train.groupby('fb_user_7').churn.mean()
train.groupby('fb_user_8').churn.mean()

In [None]:
map = {'night_pck_user_6' : {-1: 0.099165, 0: 0.066797, 1: 0.087838},
       'night_pck_user_7' : {-1: 0.115746, 0: 0.055494, 1: 0.051282},
       'night_pck_user_8' : {-1: 0.141108, 0: 0.029023, 1: 0.016194},
       'fb_user_6'        : {-1: 0.099165, 0: 0.069460, 1: 0.067124},
       'fb_user_7'        : {-1: 0.115746, 0: 0.059305, 1: 0.055082},
       'fb_user_8'        : {-1: 0.141108, 0: 0.066887, 1: 0.024463}
      }
X_train.replace(map, inplace = True)
X_test.replace(map, inplace = True)

In [None]:
X_train[[col for col in cat if col not in ['churn']]].info()

In [None]:
#PCA
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])

In [None]:
pca.fit(X_train)
churn_pca = pca.fit_transform(X_train)

In [None]:
pca = pca.named_steps['pca']

# look at explainded variance of PCA components
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100))

In [None]:
#feature variance
features = range(pca.n_components_)
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)
plt.figure(figsize=(175/20,100/20))
plt.plot(cumulative_variance)

In [None]:
#PCA and Logistic Regression
# create pipeline
PCA_VARS = 60
steps = [('scaler', StandardScaler()),
         ("pca", PCA(n_components=PCA_VARS)),
         ("logistic", LogisticRegression(class_weight='balanced'))
        ]
pipeline = Pipeline(steps)

In [None]:
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)

In [None]:
#TEST DATA EVALUATION
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("AUC:    \t", round(roc_auc_score(y_test, y_pred_prob),2))

In [None]:
#hyperparameter tuning
y_train.value_counts()/y_train.shape

In [None]:
# PCA
pca = PCA()

logistic = LogisticRegression(class_weight={0:0.1, 1: 0.9})

steps = [("scaler", StandardScaler()), 
         ("pca", pca),
         ("logistic", logistic)
        ]

pca_logistic = Pipeline(steps)

params = {'pca__n_components': [60, 80], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
model.fit(X_train, y_train)

In [None]:
pd.DataFrame(model.cv_results_)

In [None]:
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC:    \t", round(roc_auc_score(y_test, y_pred_prob),2))

In [None]:
#random forest
# random_state is set so the search gives the same result on every run.
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, n_jobs = -1, random_state = 4)
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# the 'auto' spelling was removed in scikit-learn 1.3.
params = {"criterion": ['gini', 'entropy'], "max_features": ['sqrt', 0.4]}
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)
model = GridSearchCV(estimator=forest, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
model.fit(X_train, y_train)

In [None]:
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

y_pred_prob = model.predict_proba(X_test)[:, 1]
print("AUC:    \t", round(roc_auc_score(y_test, y_pred_prob),2))

In [None]:
#feature selection
max_features = int(round(np.sqrt(X_train.shape[1])))    # number of variables to consider to split each node
print(max_features)

rf_model = RandomForestClassifier(n_estimators=100, max_features=max_features, class_weight={0:0.1, 1: 0.9}, oob_score=True, random_state=4, verbose=1)

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
rf_model.oob_score_

In [None]:
y_pred = rf_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
print("ROC:    \t", round(roc_auc_score(y_test, y_pred_prob),2))

In [None]:
#feature importance
# predictors
features = churn_filtered.drop('churn', axis=1).columns

# feature_importance
importance = rf_model.feature_importances_

# create dataframe
feature_importance = pd.DataFrame({'variables': features, 'importance_percentage': importance*100})
feature_importance = feature_importance[['variables', 'importance_percentage']]

# sort features
feature_importance = feature_importance.sort_values('importance_percentage', ascending=False).reset_index(drop=True)
print("Sum of importance=", feature_importance.importance_percentage.sum())
feature_importance

In [None]:
#top 30 features
top_n = 30
top_features = feature_importance.variables[0:top_n]

In [None]:
import seaborn as sns
# Correlation heatmap of the top features, colour scale centred on zero.
plt.rcParams["figure.figsize"] =(10,10)
mycmap = sns.diverging_palette(199, 359, s=99, center="light", as_cmap=True)
top_feature_corr = X_train[top_features].corr()
sns.heatmap(data=top_feature_corr, center=0.0, cmap=mycmap)

In [None]:
# Fixed list of predictors; this replaces the computed top-30 list.
top_features = [
    'total_ic_mou_8', 'total_rech_amt_diff', 'total_og_mou_8', 'arpu_8',
    'roam_ic_mou_8', 'roam_og_mou_8',
    'std_ic_mou_8', 'av_rech_amt_data_8', 'std_og_mou_8',
]
# Restrict both splits to the chosen predictors.
X_train, X_test = X_train[top_features], X_test[top_features]

In [None]:
# logistic regression
# The grid below searches both 'l1' and 'l2' penalties. The default lbfgs
# solver does not support 'l1', so every l1 candidate would fail to fit;
# liblinear supports both penalties.
steps = [('scaler', StandardScaler()), 
         ("logistic", LogisticRegression(class_weight={0:0.1, 1:0.9}, solver='liblinear'))
        ]

logistic = Pipeline(steps)

# Parameter names are prefixed with the pipeline step they belong to.
params = {'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

model = GridSearchCV(estimator=logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
model.fit(X_train, y_train)

In [None]:
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')

y_pred_prob = model.predict_proba(X_test)[:, 1]
print("ROC:    \t", round(roc_auc_score(y_test, y_pred_prob),2))

In [None]:
#intercepts and coefficients
logistic_model = model.best_estimator_.named_steps['logistic']

In [None]:
intercept_df = pd.DataFrame(logistic_model.intercept_.reshape((1,1)), columns = ['intercept'])

In [None]:
coefficients = logistic_model.coef_.reshape((9, 1)).tolist()
coefficients = [val for sublist in coefficients for val in sublist]
coefficients = [round(coefficient, 3) for coefficient in coefficients]
logistic_features = list(X_train.columns)
coefficients_df = pd.DataFrame(logistic_model.coef_, columns=logistic_features)

In [None]:
coefficients = pd.concat([intercept_df, coefficients_df], axis=1)
coefficients

# Insights 
1- STD and ISD usage are very important in predicting churn.

2- Roaming usage (incoming and outgoing roaming minutes in month 8) is also one of the important factors contributing to churn.

3- Customers who have been using the service for less than 4 years have a high chance of churning.