In [None]:
import numpy as np 
import pandas as pd
np.random.seed(42)
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import seaborn as sns
import glob
pd.set_option('display.max_columns', None)
import missingno as msno
from sklearn.impute import MissingIndicator,SimpleImputer
from sklearn.preprocessing import RobustScaler,OneHotEncoder,LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline,Pipeline,FeatureUnion,make_union
import imblearn
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
import os
from bayes_opt import BayesianOptimization
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
import tensorflow as tf
from scipy.stats import chi2_contingency,ttest_ind
import category_encoders as CE

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Read the training CSV (indexed by CustomerID) and report its (rows, columns) shape.
df = pd.read_csv(
    "/kaggle/input/datasets-for-churn-telecom/cell2celltrain.csv",
    index_col="CustomerID",
)
print(df.shape)

# data clean up
def cleanup(df):
    """Return a cleaned copy of the raw frame.

    Steps:
      * every literal ``'Unknown'`` cell is replaced with ``np.nan``;
      * ``CreditRating`` strings of the form ``'<int>-<label>'`` are reduced to
        the integer in front of the first ``'-'``; missing or empty ratings
        become ``np.nan``;
      * ``HandsetPrice`` is cast to float.

    The function can be applied again to its own output: ratings that are
    already numeric are passed through unchanged.
    """
    def _rating_code(value):
        if isinstance(value, str):
            return int(value.split('-')[0]) if value else np.nan
        # NaN is truthy, so a plain `if value` check would let missing ratings
        # reach `.split` and raise AttributeError; test for missing explicitly.
        if pd.isna(value):
            return np.nan
        return value  # already numeric, e.g. the cell was re-run on cleaned data

    df = df.replace('Unknown', np.nan)
    df['CreditRating'] = df['CreditRating'].apply(_rating_code)
    df['HandsetPrice'] = df['HandsetPrice'].astype('float')
    return df

df = cleanup(df)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same preview:
# the first five rows followed by one extra row labelled 'dtypes' that holds
# the dtype of every column.
pd.concat([df.head(), df.dtypes.rename('dtypes').to_frame().T])

In [None]:
# Count how many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Distribution of the target variable: the bar heights show that the dataset is imbalanced.
sns.countplot(data=df, x="Churn")

In [None]:
## Visualize the distribution of every numeric variable with respect to Churn.
## I don't see any striking patterns though
plots_per_row = 5

numeric_columns = df.select_dtypes(exclude=['object']).columns
number_of_plots = len(numeric_columns)

# Ceiling division driven by plots_per_row. The previous remainder check
# hard-coded 5, which would give the wrong number of rows as soon as
# plots_per_row was changed.
number_of_rows = -(-number_of_plots // plots_per_row)

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so flatten() is always valid.
fig, axes = plt.subplots(number_of_rows, plots_per_row, figsize=(20, 25), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_columns):
    sns.histplot(data=df, x=col, hue="Churn", ax=ax, kde=False, bins=100)

for unused_ax in axes[number_of_plots:]:
    fig.delaxes(unused_ax)  # remove empty subplot

plt.show()

In [None]:
# Now look at the categorical (object-dtype) variables via their summary statistics.
df.select_dtypes(include=['object']).describe()

ServiceArea has high cardinality (747 unique values). We will test later whether ServiceArea has a statistically significant relationship with Churn.

In [None]:
## Visualize the distribution of every categorical variable with respect to Churn.
## I don't see any striking patterns here either
plots_per_row = 5

categorical_columns = df.select_dtypes(include=['object']).columns
number_of_plots = len(categorical_columns)

# Ceiling division driven by plots_per_row. The previous remainder check
# hard-coded 5, which would give the wrong number of rows as soon as
# plots_per_row was changed.
number_of_rows = -(-number_of_plots // plots_per_row)

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so flatten() is always valid.
fig, axes = plt.subplots(number_of_rows, plots_per_row, figsize=(20, 25), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical_columns):
    sns.countplot(x=df[col], hue=df.Churn, ax=ax)
    ax.tick_params(axis='x', labelrotation=90)

for unused_ax in axes[number_of_plots:]:
    fig.delaxes(unused_ax)  # remove empty subplot

plt.show()

In [None]:
# Count the null values in the dataset. Fortunately there are no nulls in the target variable

# Boolean mask of the rows that contain at least one null (reused by later cells).
null_rows_selector = df.isnull().any(axis=1)
null_row_count = int(null_rows_selector.sum())

# Per-column null counts split by Churn value, plus the overall total and its
# percentage of all rows; only columns that actually contain nulls are kept.
df_null = (
    df.isnull()
    .groupby(df.Churn)
    .sum()
    .transpose()
    .assign(total=df.isnull().sum())
    .assign(percent=lambda table: (table['total'] / len(df)) * 100)
    .loc[lambda table: table.total != 0]
)

print(f"rows with null values: {null_row_count} , {(null_row_count / len(df)) * 100:.2f}%")
print(f"columns with null values: {df_null.shape[0]}")

df_null

In [None]:
# Restrict the view to the rows and columns that contain nulls; the matrix shows that values are not missing at random
viz_null = df.loc[null_rows_selector, df_null.index]
msno.matrix(viz_null)

In [None]:
# Compare the Churn distribution over all rows with the one within rows that contain nulls. It looks like they may differ.
pd.concat(
    [
        df["Churn"].value_counts(normalize=True).rename("Overall"),
        df.loc[null_rows_selector, "Churn"].value_counts(normalize=True).rename("within_null_rows"),
    ],
    axis=1,
)

In [None]:
# Build the contingency table for a chi-square independence test of whether the
# Churn distribution differs between rows with and without null values.
# The two groups must be disjoint: comparing "all rows" against "rows with nulls"
# would count every null row twice, so the first row holds only the rows
# WITHOUT nulls.

contingency_table = pd.concat(
    [
        df.loc[~null_rows_selector, "Churn"].value_counts().rename("without_null_rows"),
        df.loc[null_rows_selector, "Churn"].value_counts().rename("within_null_rows"),
    ],
    axis=1,
).fillna(0).transpose()  # fillna(0): a Churn value absent from one group counts as zero
contingency_table

In [None]:
# chi square independence test
# Null Hypothesis H0: Distribution of Churn is independent of presence of null values

stat, p, dof, expected = chi2_contingency(contingency_table.values)

# interpret the p-value against the significance level of the test
alpha = 0.05
print("p value is", p)

print('Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)')

The presence of null values has a statistically significant association with Churn, so I will try including a missing-value indicator as an additional feature during training.

In [None]:
# Chi-squared independence test between each categorical variable and Churn. This helps identify the variables on which Churn depends.
# Based on the test, we can recommend dropping or including each variable during training
def chi2test(X,y,alpha=0.05):
    '''
        Chi-squared independence test of every object-dtype column of X against y.

        X : dataframe of candidate features
        y : named series (the target)
        alpha : significance level; a column is recommended for inclusion when p <= alpha

        Returns a dataframe with one row per tested column holding both variable
        names, the chi2 statistic, the p-value, the verdict and the recommendation.
    '''
    target = y.name
    print('ch2test with alpha',alpha)

    def _test_column(col):
        # Count every observed (target, category) pair and spread the counts into
        # a target x category table; combinations never observed count as 0.
        pair = pd.concat([y, X[col]], axis=1)
        counts = pair.value_counts().rename("counts").reset_index()
        table = counts.pivot(index=target, columns=col, values='counts').fillna(0)
        stat, p, dof, expected = chi2_contingency(table.values)
        significant = p <= alpha
        verdict = 'Dependent (reject H0)' if significant else 'Independent (H0 holds true)'
        recommendation = 'include' if significant else 'drop'
        return [target, col, stat, p, verdict, recommendation]

    categorical_columns = X.select_dtypes(include=['object']).columns
    rows = [_test_column(col) for col in categorical_columns]
    return pd.DataFrame(rows, columns=["variable1","variable2","chi2-stat","p-value","result","recommendation"])

# Test every categorical feature (all columns except the target) against Churn.
chi2test(X=df.drop(columns='Churn'), y=df['Churn'])

This test shows that ServiceArea and Churn are dependent, so I will keep ServiceArea for training, but I will use the leave-one-out variant of target encoding to deal with its high cardinality.

In [None]:
# t-test to check whether the mean of each numerical variable differs significantly between the Churn classes.

def t_test(X,y,alpha=0.05):
    '''
        Two-sample t-test with unequal variances (equal_var=False) of every
        non-object column of X, comparing rows where y == "Yes" with rows where
        y == "No".

        X : dataframe of candidate features
        y : named series (the target) holding the labels "Yes" / "No"
        alpha : significance level; a column is recommended for inclusion when p <= alpha

        Returns a dataframe with one row per tested column holding both variable
        names, the t statistic, the p-value (both plain floats), the verdict and
        the recommendation.
    '''
    target = y.name
    print('t_test with alpha',alpha)
    test_df = []
    for col in X.select_dtypes(exclude=['object']).columns:
        pair = pd.concat([y,X[col]],axis=1)
        # Missing values are imputed with 0 before testing, as before.
        values = pair[col].fillna(0)
        # Pass 1-D samples: handing one-column DataFrames to ttest_ind returns
        # 1-element arrays, which ended up as object cells in the result table.
        stat, p = ttest_ind(values[pair[target] == "Yes"], values[pair[target] == "No"], equal_var=False)
        stat, p = float(stat), float(p)
        significant = p <= alpha
        test_df.append([target,col,stat,p,'Dependent (reject H0)' if significant else 'Independent (H0 holds true)','include' if significant else 'drop'])

    test_df = pd.DataFrame(test_df,columns=["variable1","variable2","t-stat","p-value","result","recommendation"])
    return test_df

# Test every numeric feature (all columns except the target) against Churn.
t_test(X=df.drop(columns='Churn'), y=df['Churn'])

In [None]:
# Lets set up the experiment code. We can try different models using this as common template
 
class Experiment:
    """Common template for training and evaluating one model on the churn data.

    An experiment bundles a classifier, a data transformer and, optionally, the
    search bounds for Bayesian hyper-parameter tuning. `run` splits the data,
    transforms it, optionally tunes, fits the model and reports metrics on the
    train / validation / test splits.
    """

    def __init__(self,model,data_transformer,pbounds=None):
        """
        model : estimator exposing fit / predict / predict_proba / set_params
        data_transformer : object exposing fit_transform(X, y) and transform(X, y),
            each returning an (X, y) pair
        pbounds : dict mapping parameter name -> (low, high) bounds, required
            only when tuning is requested
        """
        self.model = model
        self.data_transformer = data_transformer
        self.results = None  # populated by run()
        self.pbounds = pbounds

    def evaluate(self,X,y):
        """Return {"report": classification report as dict, "roc": ROC AUC} for (X, y).

        The ROC AUC is computed from the second column of predict_proba.
        """
        predictions = self.model.predict(X)
        probas = self.model.predict_proba(X)
        return {
            "report": metrics.classification_report(y, predictions,output_dict=True),
            "roc": metrics.roc_auc_score(y,probas[:,1]),
        }

    def hyper_parameter_tuning(self,X_train,y_train,X_val,y_val):
        """Search `self.pbounds` with Bayesian optimisation, maximising validation ROC AUC.

        Every sampled value is truncated to int before being set on the model.
        Returns the best parameters found (as ints).

        Raises ValueError when the experiment was created without `pbounds`.
        """
        if not self.pbounds:
            raise ValueError("hyper_parameter_tuning requires pbounds to be set on the Experiment")

        def opt_function(**kwargs):
            # Objective: fit with the proposed (integer) parameters, score on validation data.
            current_params = {key: int(kwargs[key]) for key in self.pbounds}
            self.model.set_params(**current_params)
            self.model.fit(X_train,y_train)
            return self.evaluate(X_val,y_val)['roc']

        optimizer = BayesianOptimization(
            f=opt_function,
            pbounds=self.pbounds,
            random_state=42,
            verbose=2
        )

        optimizer.maximize(
            init_points=10,
            n_iter=10,
        )

        params = {key: int(value) for key, value in optimizer.max['params'].items()}
        print("optimal target",optimizer.max['target'])
        print(params)

        return params

    def prep_data(self,df,target="Churn"):
        """Split df into stratified train / validation / test sets and transform them.

        20% of the rows form the test set and 25% of the remainder the validation
        set (i.e. 60/20/20). The transformer is fitted on the training split only
        and then applied to the other two.
        """
        X_train,X_test,y_train,y_test = train_test_split(df.drop(target,axis=1),df[target],test_size=.2, random_state=42,stratify=df[target])
        X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size=.25,random_state=42,stratify=y_train)

        print(X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_test.shape,y_test.shape)

        # Use the transformer owned by this experiment, not whatever global
        # `data_transformer` happens to exist in the notebook namespace.
        X_train,y_train = self.data_transformer.fit_transform(X_train,y_train)
        X_val,y_val = self.data_transformer.transform(X_val,y_val)
        X_test,y_test = self.data_transformer.transform(X_test,y_test)

        print(X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_test.shape,y_test.shape)
        return X_train,y_train,X_val,y_val,X_test,y_test

    def __format_results__(self,results):
        """Turn {dataset: evaluate() output} into one frame indexed by (dataset, metric).

        Each classification report becomes a block of rows (one per metric) and
        the dataset's ROC AUC is joined on as a `roc` column.
        """
        r = pd.DataFrame(results)
        # Build the (dataset, metric) index from the actual labels instead of
        # overwriting it with a hard-coded product, so rows can never be mislabelled.
        g = pd.concat(
            {dataset: pd.DataFrame(report) for dataset, report in r.loc['report'].items()},
            names=["dataset","metric"],
        )
        return g.join(r.loc["roc"],on=["dataset"])

    def run(self,df,hptuning=False,**kwargs):
        """Prepare the data, optionally tune, fit the model and evaluate it.

        df : raw dataframe including the target column
        hptuning : when True, tune hyper-parameters first and refit with the best ones
        **kwargs : forwarded to the model's fit()

        Returns (and stores in self.results) the formatted metrics table.
        """
        import time  # local import: used only to time the fit below

        X_train,y_train,X_val,y_val,X_test,y_test = self.prep_data(df)

        if hptuning:
            params = self.hyper_parameter_tuning(X_train,y_train,X_val,y_val)
            # Configure this experiment's own model (not a global `model`).
            self.model.set_params(**params)

        # Plain-Python timing instead of the IPython-only `%time` magic.
        fit_started = time.perf_counter()
        self.model.fit(X_train,y_train,**kwargs)
        print("Wall time: {:.2f} s".format(time.perf_counter() - fit_started))

        results = dict()
        results["train_data"] = self.evaluate(X_train,y_train)
        results["val_data"] = self.evaluate(X_val,y_val)
        results["test_data"] = self.evaluate(X_test,y_test)

        self.results = self.__format_results__(results)

        return self.results
    

In [None]:
# Lets define the pre processing pipeline for data. I included missing value indicator,
# and feature selection using statistical tests.

class DataTransformer:
    """Pre-processing for the features (X) and the target (y).

    Non-object columns are mean-imputed and robust-scaled; object columns are
    imputed with the constant 'Unknown' and leave-one-out encoded. Missing-value
    indicator features and statistical feature selection are both optional.
    """

    def __init__(self,missingIndicator=False,featureSelection=False):
        """
        missingIndicator : when True, append missing-value indicator columns to
            both the numeric and the categorical branch
        featureSelection : when True, keep only the columns recommended by
            chi2test / t_test (decided at fit time)
        """
        self.featureSelection = featureSelection
        self.selectedColumns = None  # filled in by fit_transform when featureSelection is on

        # Numeric branch: mean imputation, then robust scaling.
        numeric_branch = make_pipeline(SimpleImputer(strategy='mean'), RobustScaler())

        # Categorical branch: constant imputation, then leave-one-out encoding.
        categorical_branch = make_pipeline(
            SimpleImputer(strategy='constant',fill_value='Unknown'),
            CE.leave_one_out.LeaveOneOutEncoder(sigma=0.05),
        )

        if missingIndicator:
            # Each branch gets its own indicator, stacked next to the branch output.
            numeric_branch = make_union(numeric_branch, MissingIndicator(features='all'))
            categorical_branch = make_union(categorical_branch, MissingIndicator(features='all'))

        # Route columns to a branch by dtype; anything unmatched is dropped.
        splitter = make_column_transformer(
            (numeric_branch, make_column_selector(dtype_exclude='object')),
            (categorical_branch, make_column_selector(dtype_include='object')),
            remainder='drop',
        )

        self.X_pipeline = make_pipeline(splitter)
        self.y_pipeline = LabelEncoder()

    def selectColumns(self, X, y):
        '''
            Run chi2test and t_test on (X, y) and return the names of the columns
            they recommend including; the dropped ones are printed per test.
        '''
        selected = []
        for run_test in (chi2test, t_test):
            outcome = run_test(X,y)
            verdicts = outcome['recommendation']
            kept = outcome.loc[verdicts == 'include', 'variable2'].tolist()
            rejected = outcome.loc[verdicts == 'drop', 'variable2'].tolist()

            print("dropped columns:",rejected,len(rejected))
            selected += kept
        return selected

    def fit_transform(self,X,y):
        """Fit both pipelines on (X, y) and return the transformed (X, y) pair."""
        if self.featureSelection:
            self.selectedColumns = self.selectColumns(X,y)
            X = X[self.selectedColumns]
        encoded_y = self.y_pipeline.fit_transform(y)
        transformed_X = self.X_pipeline.fit_transform(X,encoded_y)
        return (transformed_X, encoded_y)

    def transform(self,X,y):
        """Apply the already-fitted pipelines to (X, y) and return the transformed pair."""
        encoded_y = self.y_pipeline.transform(y)
        if self.featureSelection:
            X = X[self.selectedColumns]
        return (self.X_pipeline.transform(X), encoded_y)


In [None]:
%%time
# Lets start with a gradient boosting model

data_transformer = DataTransformer(missingIndicator=False,featureSelection=False)

model = GradientBoostingClassifier(random_state=42)
exp = Experiment(model,data_transformer)
exp.run(df,hptuning=False)


In [None]:
%%time

## lets see if we can get same/better results by dropping features that we tagged as insignificant using statistical tests
## 16 features were dropped. Results dropped slightly but training time reduced.
data_transformer = DataTransformer(missingIndicator=False,featureSelection=True)

model = GradientBoostingClassifier(random_state=42)
exp = Experiment(model,data_transformer)
exp.run(df,hptuning=False)



In [None]:
%%time
## Let's add missing value indicator and try. 
## we see that results don't change much.

data_transformer = DataTransformer(missingIndicator=True,featureSelection=True)

model = GradientBoostingClassifier(random_state=42)
exp = Experiment(model,data_transformer)
exp.run(df,hptuning=False)


In [None]:
%%time
# the evaluation results of class 1 are much worse than of class 0. This is due to imbalanced dataset
# lets try undersampling to deal with this.

# we see that the results of class 1 increased but that of class 0 dropped which caused a drop in overall results

data_transformer = DataTransformer(missingIndicator=True,featureSelection=True)

model = imblearn.pipeline.make_pipeline(RandomUnderSampler(random_state=42),GradientBoostingClassifier(random_state=42))
exp = Experiment(model,data_transformer)
exp.run(df,hptuning=False)

In [None]:
%%time
# lets tune the hyperparameters
# Now we have better results on class 1 and also slightly better overall results.
# these resuls
data_transformer = DataTransformer(missingIndicator=True,featureSelection=True)

model = imblearn.pipeline.make_pipeline(RandomUnderSampler(random_state=42),GradientBoostingClassifier(random_state=42))
exp = Experiment(model,data_transformer,{"gradientboostingclassifier__max_depth":(3,20),"gradientboostingclassifier__max_features":(3,40),"gradientboostingclassifier__max_leaf_nodes":(32,128)})

exp.run(df,hptuning=True)

In [None]:
%%time
# lets tune the hyperparameters
# Now we have better results on class 1 and also slightly better overall results.
# Please note these results may not be exactly reproducible due to an issue with BayesianOptimization package use for hyperparameter tuning. Setting the random state is not working correctly.
data_transformer = DataTransformer(missingIndicator=True,featureSelection=True)

model = imblearn.pipeline.make_pipeline(RandomUnderSampler(random_state=42),GradientBoostingClassifier(random_state=42))
exp = Experiment(model,data_transformer,{"gradientboostingclassifier__max_depth":(3,20),"gradientboostingclassifier__max_features":(3,40),"gradientboostingclassifier__max_leaf_nodes":(32,128)})

exp.run(df,hptuning=True)