## Telecom customer churn prediction

This data set consists of 100 variables and approximately 100 thousand records. The variables describe customer attributes and other factors that the telecom industry considers important when dealing with its customers. The target variable is `churn`, which indicates whether the customer churned. We can use this data set to predict, from the available variables, which customers will churn and which will not.

In [None]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats.mstats import winsorize

import re
import gc
import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")

%matplotlib inline

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
data = pd.read_csv('../input/telecom-customer/Telecom_customer churn.csv')
# data = pd.read_csv('Telecom_customer churn.zip')
telco= data.copy()
telco.head()

In [None]:
features=pd.read_csv("../input/description/description.csv", index_col = 0)
features

## FUNCTIONS

#### Description Functions

In [None]:
def summary(df, pred=None):
    """Print the shape and dtype counts of ``df`` and return a per-column overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    pred : str, optional
        Name of a numeric column of ``df``. When given, an extra column
        ``'corr <pred>'`` with each numeric column's correlation to ``pred``
        is appended to the result.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns Types, Counts, Uniques,
        Nulls, Min, Max, Mean, Median and Mode (plus the correlation column
        when ``pred`` is given). Mean and Median are computed for numeric
        columns only; other columns get NaN there.
    """
    print('Data shape:', df.shape)

    parts = [
        df.dtypes,                            # Types
        df.apply(lambda x: x.count()),        # Counts (non-null)
        df.apply(lambda x: x.unique().shape[0]),  # Uniques (NaN counts as a value)
        df.apply(lambda x: x.isnull().sum()),  # Nulls
        df.min(),                             # Min
        df.max(),                             # Max
        df.mean(numeric_only=True),           # Mean
        df.quantile(0.5, numeric_only=True),  # Median
        df.mode().loc[0],                     # Mode (first modal value)
    ]
    cols = ['Types', 'Counts', 'Uniques', 'Nulls', 'Min', 'Max', 'Mean', 'Median', 'Mode']

    if pred is not None:
        # Correlation is only defined for numeric data, so restrict to it first.
        numeric = df.select_dtypes(include=['number', 'bool'])
        parts.append(numeric.corr()[pred])
        cols = cols + ['corr ' + pred]

    # Named `table` rather than `str` so the builtin is not shadowed.
    table = pd.concat(parts, axis=1, sort=True)
    table.columns = cols

    print('___________________________\nData Types:')
    print(table.Types.value_counts())
    print('___________________________')
    return table

In [None]:
!pip install colorama
def column_details(regex, df):
  """Print dtype, unique count, null rate and sorted unique values per matching column.

  Columns of ``df`` whose name matches ``regex`` (``re.search``) are reported.
  The list of matched names is also stored in the module-level ``columns``
  variable. The dtype/count/null-rate part of each line is coloured with
  colorama: red for float64, blue for int64, green otherwise.
  """
  # Focus on each column in detail:
  # unique values, DTYPE, NUNIQUE, NULL_RATE
  global columns
  columns = [name for name in df.columns if re.search(regex, name)]

  from colorama import Fore, Back, Style

  print('Unique Values of the Features:\nfeature: DTYPE, NUNIQUE, NULL_RATE\n')
  for name in df[columns]:
    series = df[name]
    if series.dtype == 'float64':
      color = Fore.RED
    elif series.dtype == 'int64':
      color = Fore.BLUE
    else:
      color = Fore.GREEN
    dtype = series.dtype
    n_unique = series.nunique()
    null_rate = round(series.isna().sum()/len(series)*100,2)
    reset = Style.RESET_ALL
    unique_values = pd.Series(series.unique()).sort_values().values
    print(f'{name}: {color} {dtype}, {n_unique}, %{null_rate}\n{reset}{unique_values}\n')

In [None]:
def null_values(df, rate=0):
    """Return the columns whose share of nulls exceeds ``rate`` percent.

    The result has one row per qualifying column with the number of missing
    records and the percentage of missing records, sorted by percentage in
    descending order.
    """
    missing = df.isnull().sum()
    percentage = 100 * df.isnull().sum() / df.shape[0]
    table = pd.DataFrame({'Missing_Records': missing, 'Percentage (%)': percentage})
    above_rate = table[table['Percentage (%)'] > rate]
    return above_rate.sort_values('Percentage (%)', ascending=False)

#### Plot Functions

In [None]:
def labels(ax, df, xytext=(0, 0)):
    """Annotate each bar of ``ax`` with its share of ``len(df)`` and its height.

    The text is placed at the top centre of the bar, shifted by ``xytext``
    (in points). The first line is the bar height as a percentage of
    ``len(df)``, the second line the height itself.
    """
    for patch in ax.patches:
        height = patch.get_height()
        text = '%{:.2f}\n{:.0f}'.format(100*height/len(df), height)
        top_center = (patch.get_x() + patch.get_width() / 2, height)
        ax.annotate(text, top_center,
                    ha='center', va='center',
                    size=11, xytext=xytext,
                    textcoords='offset points')

def plot_col(col, df, target='Churn', figsize=(20,6)):
    """Show two count plots for ``col``: split by ``target`` (left) and overall (right).

    The left panel additionally gets a twin y-axis with a black point plot of
    the 'Churn' column of the row-normalised crosstab (in percent).

    NOTE(review): the crosstab columns are renamed from 0/1 to
    'NotChurn'/'Churn', so the point plot presumably requires ``target`` to be
    coded as 0/1 — confirm. The default ``target='Churn'`` differs in case
    from the 'churn' column used elsewhere in this notebook.
    """

    fig, ax = plt.subplots(1,2,figsize=figsize, sharey=True)

    plt.subplot(121)
    # Percentage of each target class within every level of `col`.
    tmp = pd.crosstab(df[col], df[target], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NotChurn', 1:'Churn'}, inplace=True)

    # Left panel: counts per level of `col`, split by target, levels in sorted order.
    ax[0] = sns.countplot(x=col, data=df, hue=target, 
                  order=np.sort(df[col].dropna().unique()),
                  )
    ax[0].tick_params(axis='x', rotation=90)
    # Percentages in the labels are relative to the number of non-null values of `col`.
    labels(ax[0],df[col].dropna(),(0, 0))
    
    ax_twin = ax[0].twinx()
    # sns.set(rc={"lines.linewidth": 0.7})
    # NOTE(review): no `ax=` is passed, so the point plot is drawn on whatever
    # pyplot considers the current axes — presumably the twin just created; confirm.
    ax_twin = sns.pointplot(x=col, y='Churn', data=tmp, color='black', legend=False, 
                  order = np.sort(df[col].dropna().unique()), 
                  linewidth=0.1)
    

    ax[0].grid()

    plt.subplot(122)
    # Right panel: overall counts per level of `col`, nulls excluded.
    ax[1] = sns.countplot(x=df[col].dropna(),
                  order= np.sort(df[col].dropna().unique()),
                  )
    ax[1].tick_params(axis='x', rotation=90)
    labels(ax[1],df[col].dropna())
    plt.show()


In [None]:
def plot_cols(regex, figsize, target, df):
  """Draw one count plot per column of ``df`` whose name matches ``regex``.

  Each plot shows the counts per level split by ``target`` and, on a twin
  y-axis, a black point plot of the 'Fraud' column of the row-normalised
  crosstab (in percent). Plots are stacked in a single column.

  Parameters:
      regex: pattern applied with ``re.search`` to the column names.
      figsize: size of the whole figure.
      target: name of the target column in ``df``.
      df: source DataFrame.

  Returns None. If no column matches, a message is printed and nothing is drawn.

  NOTE(review): the crosstab columns are renamed from 0/1 to
  'NoFraud'/'Fraud', so ``target`` presumably has to be coded as 0/1 — confirm.
  """
  columns=[col for col in df.columns if re.search(regex, col)]
  if not columns:
      # A zero-row subplot grid cannot be created, so report and stop here.
      print(f'No columns match {regex!r}; nothing to plot.')
      return
  nrow, ncolumn = len(columns),1

  # squeeze=False always yields a 2-D array of Axes, so indexing below also
  # works when only one column matches.
  fig, axes = plt.subplots(nrow, ncolumn,figsize=figsize, squeeze=False)
  ax = axes.ravel()

  for i,col in enumerate(columns):
      order = np.sort(df[col].dropna().unique())

      plt.subplot(nrow,ncolumn,i+1)
      ax[i] = sns.countplot(x=df[col], data= df, hue=target, order = order)
  #     labels(ax[i],df[col].dropna(),(0,0))

      # Percentage of each target class within every level of `col`.
      tmp = pd.crosstab(df[col], df[target], normalize='index') * 100
      tmp = tmp.reset_index()
      tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

      ax_twin = ax[i].twinx()
  #     sns.set(rc={"lines.linewidth": 1})
      ax_twin = sns.pointplot(x=tmp[col], y=tmp['Fraud'],color='black', order = order)
      ax[i].grid();

In [None]:
def hist_countplot(regex, figsize, nrow=10, ncolumn = 4, target='isFraud', df=pd.DataFrame()):
  """Plot every column of ``df`` matching ``regex`` in an ``nrow`` x ``ncolumn`` grid.

  Object-dtype columns are drawn with ``sns.countplot``, all others with
  ``sns.histplot``; both are coloured by ``target``.
  """
  plt.figure(figsize=figsize)
  matched = [name for name in df.columns if re.search(regex, name)]

  for position, name in enumerate(matched, start=1):
    plt.subplot(nrow, ncolumn, position)
    # Categorical (object) columns get bar counts, everything else a histogram.
    draw = sns.countplot if df[name].dtype == 'O' else sns.histplot
    draw(x=df[name], data=df, hue=target)

In [None]:
def box_countplot(regex, figsize, nrow=10, ncolumn = 4, target='isFraud', df=pd.DataFrame()):
  """Plot every column of ``df`` matching ``regex`` in an ``nrow`` x ``ncolumn`` grid.

  Object-dtype columns are drawn as a plain count plot; all other columns as
  a box plot of the column grouped by ``target``.
  """
  plt.figure(figsize=figsize)
  matched = [name for name in df.columns if re.search(regex, name)]

  for position, name in enumerate(matched, start=1):
    plt.subplot(nrow, ncolumn, position)
    if df[name].dtype == 'O':
      sns.countplot(x=df[name])
      continue
    sns.boxplot(y=df[name], data=df, x=target)

In [None]:
def box_labels(ax, df, col1,col2):
    """Write the per-group median of ``col2`` on ``ax`` at every x tick.

    Medians are computed per value of ``col1`` and rounded to 2 decimals; each
    label is placed slightly above its median (5% of the overall median of
    ``col2``). The x tick values are used as lookup keys into the medians.
    """
    group_medians = df.groupby([col1])[col2].median().round(2)
    # Lift the text a little above the median line so it stays readable.
    lift = 0.05 * df[col2].median()
    text_style = dict(horizontalalignment='center', size='small',
                      color='w', weight='semibold')

    for tick in ax.get_xticks():
        median = group_medians[tick]
        ax.text(tick, median + lift, median, **text_style)

In [None]:
def stripplot(regex, figsize, nrow=10, ncolumn = 4, target='churn', df=pd.DataFrame()):
  """Draw a strip plot against ``target`` for every column of ``df`` matching ``regex``.

  The plots are laid out in an ``nrow`` x ``ncolumn`` grid on one figure.
  """
  plt.figure(figsize=figsize)
  matched = [name for name in df.columns if re.search(regex, name)]

  for position, name in enumerate(matched, start=1):
    plt.subplot(nrow, ncolumn, position)
    sns.stripplot(y=df[name], data=df, x=target)

#### Correlation Functions

In [None]:
# Remove the highly collinear features from data
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Find collinear features in a dataframe, i.e. features whose absolute
        correlation coefficient with an earlier column reaches the threshold.
        Removing collinear features can help a model to generalize and
        improves the interpretability of the model.

    Inputs:
        x: features dataframe
        threshold: a column is flagged when its absolute correlation with any
            column positioned before it is greater than or equal to this value

    Output:
        set with the names of the columns to drop (for each correlated pair,
        the later column). ``x`` itself is not modified; the caller drops
        the returned columns.
    '''

    # Absolute correlation matrix
    corr_matrix = x.corr().abs()

    # Keep only the cells above the diagonal: each pair is then looked at once,
    # and a column is only ever compared with the columns before it.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    pair_corr = corr_matrix.where(above_diagonal)

    # NaN cells (masked or undefined correlations) compare as False.
    flagged = (pair_corr >= threshold).any(axis=0)

    return set(corr_matrix.columns[flagged.values])

In [None]:
def corrank(X, threshold=0):
    """Rank all column pairs of ``X`` by absolute correlation.

    Parameters:
        X: DataFrame whose columns are correlated with ``DataFrame.corr``.
        threshold: only pairs with an absolute correlation strictly greater
            than this value are returned.

    Returns:
        DataFrame with the columns 'Feature1', 'Feature2' and 'corr', sorted
        by 'corr' in descending order.
    """
    import itertools

    # Compute the matrix once instead of once per column pair.
    corr = X.corr().abs()
    pairs = [[i, j, corr.loc[i, j]] for i, j in itertools.combinations(corr, 2)]
    df = pd.DataFrame(pairs, columns=['Feature1','Feature2','corr'])
    df = df.sort_values(by='corr',ascending=False).reset_index(drop=True)
    return df[df['corr']>threshold]

In [None]:
import scipy.stats as sts

# References:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
# https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

def cramers_v(x, y):
    """ Calculate the bias-corrected Cramers V statistic for
        categorical-categorical association.
        Uses the correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    table = pd.crosstab(x, y)
    n_obs = table.sum().sum()
    n_rows, n_cols = table.shape

    # The chi-squared statistic is the first element of scipy's result.
    chi2_stat = sts.chi2_contingency(table)[0]
    phi2 = chi2_stat / n_obs

    # Bias correction of phi^2 and of the table dimensions.
    phi2_corrected = max(0, phi2 - ((n_cols-1)*(n_rows-1))/(n_obs-1))
    rows_corrected = n_rows - ((n_rows-1)**2)/(n_obs-1)
    cols_corrected = n_cols - ((n_cols-1)**2)/(n_obs-1)

    denominator = min((cols_corrected-1), (rows_corrected-1))
    return np.sqrt(phi2_corrected / denominator)

#### Outlier Functions

In [None]:
def outliers(s):
  """Summarise the IQR-based outliers of ``s``.

  Values above Q3 + 1.5*IQR or below Q1 - 1.5*IQR count as outliers.
  Returns a two-column DataFrame ('sub', 'values') listing the IQR, both
  bounds, the number of outliers and their percentage of ``len(s)``.
  """
  first_quartile = np.quantile(s, 0.25)
  third_quartile = np.quantile(s, 0.75)
  iqr = third_quartile - first_quartile
  upper_bound = third_quartile + (1.5*iqr)
  lower_bound = first_quartile - (1.5*iqr)

  flagged = [value for value in s if value > upper_bound or value < lower_bound]

  rows = [
      ('IQR', iqr),
      ('Upper Bound', upper_bound),
      ('Lower Bound', lower_bound),
      ('Sum outliers', len(flagged)),
      ('percentage outliers', len(flagged)/len(s)*100),
  ]
  return pd.DataFrame(rows, columns=['sub', 'values'])


In [None]:
def capping_outliers(col, whisker=1.5, df=None, plot=True):
  """Cap the outliers of ``df[col]`` at the IQR whisker bounds, in place.

  Values above Q3 + whisker*IQR are set to that upper bound and values below
  Q1 - whisker*IQR are set to that lower bound; all other values are kept.

  Parameters:
      col: name of the column to cap.
      whisker: IQR multiplier defining the bounds.
      df: DataFrame to modify. When omitted, the module-level variable
          ``df`` is used (the behaviour callers had before this parameter
          existed).
      plot: when True, draw a box plot of the capped column.

  Returns:
      The capped column (``df[col]``).
  """
  if df is None:
    # Backward compatibility: earlier versions always worked on a global `df`.
    df = globals()['df']

  first_quartile = np.quantile(df[col], 0.25)
  third_quartile = np.quantile(df[col], 0.75)
  iqr = third_quartile - first_quartile
  upper_bound = third_quartile + (whisker*iqr)
  lower_bound = first_quartile - (whisker*iqr)

  # Only values outside [lower_bound, upper_bound] are changed.
  df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

  if plot:
    plt.figure(figsize=(12,6))
    sns.boxplot(x= df[col]);
  return df[col]

In [None]:
def replace_outliers(col, replaced_value='median',whisker=1.5, df=None, plot=True):
  """Replace the IQR outliers of ``df[col]`` with a single value, in place.

  Outliers are values above Q3 + whisker*IQR or below Q1 - whisker*IQR.

  Parameters:
      col: name of the column to clean.
      replaced_value: 'median', 'mean' or 'mode' to use that statistic of the
          column, or any other value to use it literally.
      whisker: IQR multiplier defining the bounds.
      df: DataFrame to modify. When omitted, the module-level variable
          ``df`` is used (the behaviour callers had before this parameter
          existed).
      plot: when True, draw a box plot of the cleaned column.

  Returns:
      The cleaned column (``df[col]``).
  """
  if df is None:
    # Backward compatibility: earlier versions always worked on a global `df`.
    df = globals()['df']

  series = df[col]
  first_quartile = np.quantile(series, 0.25)
  third_quartile = np.quantile(series, 0.75)
  iqr = third_quartile - first_quartile
  upper_bound = third_quartile + (whisker*iqr)
  lower_bound = first_quartile - (whisker*iqr)

  # Compute the replacement once, from the untouched data, so that the upper
  # and the lower outliers receive the same value.
  if replaced_value=='median':
    fill = series.median()
  elif replaced_value=='mean':
    fill = series.mean()
  elif replaced_value=='mode':
    fill = series.mode()[0]
  else:
    fill = replaced_value

  is_outlier = (series > upper_bound) | (series < lower_bound)
  df[col] = series.mask(is_outlier, fill)

  if plot:
    plt.figure(figsize=(12,6))
    sns.boxplot(x= df[col]);

  return df[col]

In [None]:
def col_plot(df,col_name):
    """Show four diagnostic plots of ``df[col_name]`` side by side.

    Panels, left to right: histogram with red lines at mean ± 3 standard
    deviations; box plot of the raw values; box plot of the signed square
    root; box plot of the signed logarithm.
    """
    plt.figure(figsize=(15,6))

    # Signed transforms: values in [-1, 1] are left unchanged, larger
    # magnitudes are compressed while keeping their sign.
    f_sqrt=lambda x:(np.sqrt(x) if x>1 else -np.sqrt(-x) if x<-1 else x)
    f_log=lambda x:(np.log(x)+1 if x>1 else -np.log(-x)-1 if x<-1 else x)

    plt.subplot(141) # work in the 1st column of a grid of 1 row x 4 columns
    plt.hist(df[col_name], bins = 20)

    # The three-sigma range (expected to contain 99.7% of the data) lies
    # between the two red lines.
    plt.axvline(x=df[col_name].mean() + 3*df[col_name].std(),color='red')
    plt.axvline(x=df[col_name].mean() - 3*df[col_name].std(),color='red')
    plt.xlabel("Histogram ±3z")
    plt.ylabel(col_name)

    plt.subplot(142)
    plt.boxplot(df[col_name]) # IQR coefficient (whis), default is 1.5
    plt.xlabel("IQR=1.5")

    plt.subplot(143)
    plt.boxplot(df[col_name].apply(f_sqrt), whis = 1.5)
    plt.xlabel("ROOT SQUARE - IQR=1.5")

    plt.subplot(144)
    plt.boxplot(df[col_name].apply(f_log), whis = 1.5)
    plt.xlabel("LOGARITMIC - IQR=1.5")

    # Actually call tight_layout, once all four panels exist.
    plt.tight_layout()
    plt.show()

In [None]:
def plot_winsorize(df,col_name,down=0, up=0.1):
    """Show histograms and box plots of ``df[col_name]`` after winsorizing.

    ``down`` and ``up`` are the lower and upper limits passed to
    ``winsorize``. Panels, left to right: histogram and box plot of the
    winsorized values, then histogram and box plot of the winsorized signed
    square root. Histograms carry red lines at mean ± 3 standard deviations.
    """
    plt.figure(figsize = (15, 6))

    winsor=winsorize(df[col_name], (down,up))
    # Signed square root: values in [-1, 1] are left unchanged.
    f_sqrt=lambda x:(np.sqrt(x) if x>1 else -np.sqrt(-x) if x<-1 else x)
    root_winsor=winsorize(df[col_name].apply(f_sqrt), (down,up))

    plt.subplot(141)
    plt.hist(winsor, bins = 22)
    plt.axvline(x=winsor.mean()+3*winsor.std(),color='red')
    plt.axvline(x=winsor.mean()-3*winsor.std(),color='red')
    plt.xlabel('Winsorize_Histogram')
    plt.ylabel(col_name)

    plt.subplot(142)
    plt.boxplot(winsor, whis = 1.5)
    plt.xlabel('Winsorize - IQR:1.5')

    plt.subplot(143)
    plt.hist(root_winsor, bins=22)
    plt.axvline(x=root_winsor.mean()+3*root_winsor.std(),color='red')
    plt.axvline(x=root_winsor.mean()-3*root_winsor.std(),color='red')
    plt.xlabel('root_winsor_col_name')

    plt.subplot(144)
    plt.boxplot(root_winsor, whis = 1.5)
    plt.xlabel("Root & Winsorize - IQR=1.5")

    # Actually call tight_layout, once all four panels exist.
    plt.tight_layout()
    plt.show()

In [None]:
def plot_log_winsorize(df,col_name,up=0.1,down=0):
    """Show histograms and box plots of ``df[col_name]`` after winsorizing.

    ``down`` and ``up`` are the lower and upper limits passed to
    ``winsorize`` (note that ``up`` comes first in this signature). Panels,
    left to right: histogram and box plot of the winsorized values, then
    histogram and box plot of the winsorized signed logarithm. Histograms
    carry red lines at mean ± 3 standard deviations.
    """
    plt.figure(figsize = (15, 6))

    winsor=winsorize(df[col_name], (down,up))
    # Signed logarithm: values in [-1, 1] are left unchanged.
    f_log=lambda x:(np.log(x)+1 if x>1 else -np.log(-x)-1 if x<-1 else x)
    log_winsor=winsorize(df[col_name].apply(f_log), (down,up))

    plt.subplot(141)
    plt.hist(winsor, bins = 22)
    plt.axvline(x=winsor.mean()+3*winsor.std(),color='red')
    plt.axvline(x=winsor.mean()-3*winsor.std(),color='red')
    plt.xlabel('Winsorize_Histogram')
    plt.ylabel(col_name)

    plt.subplot(142)
    plt.boxplot(winsor, whis = 1.5)
    plt.xlabel('Winsorize - IQR:1.5')

    plt.subplot(143)
    plt.hist(log_winsor, bins=22)
    plt.axvline(x=log_winsor.mean()+3*log_winsor.std(),color='red')
    plt.axvline(x=log_winsor.mean()-3*log_winsor.std(),color='red')
    plt.xlabel('log_winsor_col_name')

    plt.subplot(144)
    plt.boxplot(log_winsor, whis = 1.5)
    plt.xlabel("Log & Winsorize - IQR=1.5")

    # Actually call tight_layout, once all four panels exist.
    plt.tight_layout()
    plt.show()

In [None]:
def simplify_column(col, df, threshold=0.005, value='mode'):
  """Collapse the rare categories of ``df[col]`` into one value, in place.

  A category is rare when its relative frequency among the non-null values is
  below ``threshold``. Rare categories are replaced by the column's mode when
  ``value == 'mode'``, otherwise by the literal string 'other'.
  Returns the updated column.
  """
  counts = df[col].value_counts(dropna=True)
  shares = df[col].value_counts(dropna=True, normalize=True)
  rare_categories = counts[shares < threshold].index

  if value == 'mode':
    replacement = df[col].mode()[0]
  else:
    replacement = 'other'

  df[col] = df[col].replace(rare_categories, replacement)
  return df[col]

### Memory Reduction Functions

In [None]:
# Memory Reduction
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest fitting dtype.

    Integer columns are tried against int8, int16, int32 and int64; other
    numeric columns against float16 and float32, falling back to float64.
    A dtype fits when the column's min and max lie strictly inside its range.
    When ``verbose`` is true the new memory usage and the reduction are
    printed. Returns the same (modified) DataFrame.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First (smallest) integer type that strictly contains the range;
            # the column is left untouched when none does.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

### Encoders

In [None]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder
# def label_encoder(df):
#   for col in df.columns:
#     if df[col].dtype=='object':
#       le = LabelEncoder()
#       df[col] = le.fit_transform(df[col])
#   return df


# from sklearn.preprocessing import LabelEncoder
def label_encoder(cat_cols, df):
  """Label-encode, in place, every column of ``cat_cols`` that exists in ``df``.

  Each column is cast to ``str`` and encoded with its own, freshly fitted
  ``LabelEncoder``. Names missing from ``df`` are skipped.
  Returns the same (modified) DataFrame.
  """
  for col in cat_cols:
    if col not in df.columns:
      continue
    encoder = LabelEncoder()
    as_text = list(df[col].astype(str).values)
    df[col] = encoder.fit_transform(as_text)
  return df

In [None]:
# Frequency Encoding

def frequency_encoder(cat_cols, df):
  """Append a ``<col>_freq`` column for every column of ``cat_cols`` found in ``df``.

  The new column holds, per row, the relative frequency of that row's value
  in the original column. The original columns are kept, and a new DataFrame
  is returned; the one passed in is not modified.
  """
  for col in cat_cols:
    if col not in df.columns:
      continue
    shares = df[col].value_counts(normalize=True)
    encoded = df[col].map(shares).to_frame().add_suffix('_freq')
    df = df.join(encoded)
  return df

In [None]:
# Frequency Encoding

def frequency_encoder(cat_cols, df):
  """Return ``df`` extended with relative-frequency encodings of ``cat_cols``.

  For each listed column present in ``df``, a column named ``<col>_freq`` is
  joined on; it maps every value to its share of the column's non-null
  entries. Columns not present in ``df`` are ignored.
  """
  for name in cat_cols:
    if name in df.columns:
      value_share = df[name].value_counts(normalize=True)
      freq_frame = df[name].map(value_share).to_frame()
      freq_frame = freq_frame.add_suffix('_freq')
      df = df.join(freq_frame)
  return df

### Modeling

In [None]:
def plot_feature_importances(model, num=10, figsize=(20,10)):
  """Bar-plot the ``num`` largest feature importances of a fitted ``model``.

  Feature names are taken from the columns of the module-level ``X``.
  """
  importances = pd.Series(model.feature_importances_, index=X.columns)
  feature_imp = importances.sort_values(ascending=False)[:num]

  plt.figure(figsize=figsize)
  sns.barplot(x=feature_imp, y=feature_imp.index)
  plt.title("Feature Importance")
  plt.show()

## EDA

### Understanding Data

In [None]:
telco.info()

In [None]:
null_values(telco)

In [None]:
telco.columns

In [None]:
telco = telco.drop('Customer_ID', axis=1)

In [None]:
cat_cols = [col for col in telco.columns if telco[col].dtype=='object']

In [None]:
columns=[]
column_details(regex='', df=telco[cat_cols])

### Feature Engineering

In [None]:
# # Total number of kids of a customer
# kid_cols = ['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
# telco['total_kid']=telco[kid_cols].replace({'Y':1,'U':0}).apply(sum, axis=1)

# # Average number of blocked (failed) voice calls / Average number of voice call attempts made 
# telco['vce_blk_rate'] = (telco['blck_vce_Mean'] / telco['plcd_vce_Mean']).fillna(0)

# # Average number of dropped (failed) voice calls / Average number of voice call attempts made
# telco['vce_drp_rate'] = (telco['drop_vce_Mean'] / telco['plcd_vce_Mean']).fillna(0)

# # Average number of blocked (failed) data calls / Average number of data call attempts made
# telco['dat_blk_rate'] = (telco['blck_dat_Mean'] / telco['plcd_dat_Mean']).fillna(0)

# # Average number of dropped (failed) data calls / Average number of data call attempts made
# telco['dat_drp_rate'] = (telco['drop_dat_Mean'] / telco['plcd_dat_Mean']).fillna(0)

# # Average number of completed voice calls / Average number of voice call attempts made 
# telco['vce_cmpt_rate'] = (telco['comp_vce_Mean'] / telco['plcd_vce_Mean']).fillna(0)

# # Average number of completed data calls / Average number of data call attempts made
# telco['dat_cmpt_rate'] = (telco['comp_dat_Mean'] / telco['plcd_dat_Mean']).fillna(0)

# # Average number of completed searches / Average number of attempted calls
# telco['tot_cmpt_rate'] = (telco['complete_Mean'] / telco['attempt_Mean']).fillna(0)

# # Average number of dropped or blocked calls / Average number of attempted calls
# telco['tot_drp_blk_rate'] = (telco['drop_blk_Mean'] / telco['attempt_Mean']).fillna(0)

# # Average number of voice call attempts made  / Average number of voice and data call attempts made
# telco['vce_dat_ratio'] = (telco['plcd_vce_Mean'] /  (telco['plcd_vce_Mean'] + telco['plcd_dat_Mean'])).fillna(0)

# # (Average monthly usage minutes in the previous three months - Average monthly usage minutes over the customer's lifetime)  / Average monthly usage minutes over the customer's lifetime
# telco['diff_3mon_overall_mou'] = ((telco['avg3mou'] - telco['avgmou']) / telco['avgmou']).fillna(0)

# # (Average monthly searches over the previous three months - Average monthly calls over the customer's lifetime)  / Average monthly calls over the customer's lifetime
# telco['diff_3mon_overall_qty'] = ((telco['avg3qty'] - telco['avgqty']) / telco['avgqty']).fillna(0)

# # (Average monthly income over the previous three months - Average monthly income over the customer's lifetime)  / Average monthly income over the customer's lifetime
# telco['diff_3mon_overall_rev'] = ((telco['avg3rev'] - telco['avgrev']) / telco['avgrev']).fillna(0)

# # (Average monthly usage minutes in the previous six months - Average monthly usage minutes over the customer's lifetime)  / Average monthly usage minutes over the customer's lifetime
# telco['diff_6mon_overall_mou'] = ((telco['avg6mou'] - telco['avgmou']) / telco['avgmou']).fillna(0)

# # (Average monthly searches over the previous six months - Average monthly calls over the customer's lifetime)  / Average monthly calls over the customer's lifetime
# telco['diff_6mon_overall_qty'] = ((telco['avg6qty'] - telco['avgqty']) / telco['avgqty']).fillna(0)

# # (Average monthly income over the previous six months - Average monthly income over the customer's lifetime)  / Average monthly income over the customer's lifetime
# telco['diff_6mon_overall_rev'] = ((telco['avg6rev'] - telco['avgrev']) / telco['avgrev']).fillna(0)

# # Number of missing values in every observation
# telco['total_nulls'] = data.isnull().sum(axis=1)

# # analog to digital transformation
# telco['eqpdays_digitized'] = np.digitize(telco['eqpdays'], bins=list(range(-30,901,30))+[telco['eqpdays'].max()+1])

# telco.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
cols_FE = ['vce_blk_rate','vce_drp_rate','dat_blk_rate','dat_drp_rate','vce_cmpt_rate','dat_cmpt_rate','tot_cmpt_rate',
          'tot_drp_blk_rate','vce_dat_ratio','diff_3mon_overall_mou','diff_3mon_overall_qty','diff_3mon_overall_rev',
          'diff_6mon_overall_mou','diff_6mon_overall_qty','diff_6mon_overall_rev']

In [None]:
num_cols = [col for col in telco.columns if telco[col].dtype!='object']
summary(telco[num_cols])

### Multicollinearity

In [None]:
import missingno as msno
msno.matrix(telco.sample(200));

In [None]:
drop_col = remove_collinear_features(telco[num_cols], 0.9)
print(drop_col)

In [None]:
telco = telco.drop(drop_col, axis=1)

In [None]:
plt.figure(figsize=(30,20))
sns.heatmap(telco.corr(), cmap='coolwarm',annot=False);

In [None]:
cramers_v(telco.dwlltype,telco.dwllsize)

In [None]:
summary(telco[['dwlltype','dwllsize']])

In [None]:
# plot_col('dwlltype', df=telco, target='churn')
# plot_col('dwllsize', df=telco, target='churn')

In [None]:
telco = telco.drop('dwlltype', axis=1)

### Frequency Encoding

In [None]:
cat_cols = [col for col in telco.columns if telco[col].dtype=='object']
num_cols = [col for col in telco.columns if telco[col].dtype!='object']

In [None]:
telco[cat_cols].describe(include=['O']).T

In [None]:
telco = frequency_encoder(cat_cols, telco)
telco = telco.drop(cat_cols, axis=1)

In [None]:
telco.shape

### Multivariate Imputation

In [None]:
import gc

telco = reduce_mem_usage(telco)
gc.collect()

In [None]:
# Iterative Imputer default=BayesianRidge()

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute on a copy; `telco` is only replaced by the imputed frame in a later cell.
idf = telco.copy()

# NOTE(review): idf still contains the 'churn' target column (it is only split
# off later, when X and y are built), so the imputer can use the target as a
# predictor for the other columns — confirm this is intended.
imp_median = IterativeImputer(missing_values=np.nan, initial_strategy='median', random_state=42)
# Re-wrap the imputer output with the original index and column labels.
df_imputed_bayesian = pd.DataFrame(imp_median.fit_transform(idf), index=idf.index, columns=idf.columns)
# With the default rate=0 this lists only the columns that still contain nulls.
null_values(df_imputed_bayesian)

In [None]:
df_imputed_bayesian.to_pickle('./clean_dataset_11a.pkl')

In [None]:
telco = df_imputed_bayesian.copy()

### PCA

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# PCA_columns = [col for col in cols_FE if col in telco.columns]
# sc = MinMaxScaler()
# telco[PCA_columns] = sc.fit_transform(telco[PCA_columns])

In [None]:
# plt.figure(figsize=(30,6))
# pca = PCA().fit(telco[PCA_columns])
# x = range(1,len(PCA_columns)+1)
# plt.plot(x,np.cumsum(pca.explained_variance_ratio_), "bo-")
# plt.xlabel("Component Count")
# plt.ylabel("Variance Ratio")
# plt.xticks(range(1,telco[PCA_columns].shape[1]+1))
# plt.grid()
# plt.show()

In [None]:
# pca = PCA(n_components = 3)
# pca.fit(telco[PCA_columns])
# pca_telco = pca.transform(telco[PCA_columns])

# np.cumsum(pca.explained_variance_ratio_)

In [None]:
# pca_telco = pd.DataFrame(data = pca_telco).add_prefix('pca_')
# telco = pd.concat([telco, pca_telco], ignore_index=False, sort=False, axis=1)
# telco.drop(PCA_columns, axis=1, inplace=True)

In [None]:
telco.to_pickle('./clean_dataset_11b.pkl')

### Handling Outliers

In [None]:
outliers_cols= ['drop_dat_Mean','blck_dat_Mean', 
               'unan_dat_Mean', 'plcd_dat_Mean','recv_sms_Mean', 
               'mou_cdat_Mean','mou_pead_Mean','callfwdv_Mean','churn']

stripplot('', figsize=(25,70), nrow=23, ncolumn = 4, target='churn', df=telco[outliers_cols])

In [None]:
# 'churn' was only in the list as the plotting target; it is not cleaned.
outliers_cols.remove('churn')
for col in outliers_cols:
  # Three passes: each pass replaces the column's current maximum with the
  # column's current median.
  for _ in range(3):
    current_max = telco[col].max()
    current_median = telco[col].median()
    telco[col] = telco[col].replace({current_max: current_median})

In [None]:
telco.to_pickle('./clean_dataset_11c.pkl')

## RFECV

In [None]:
# telco= pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/Telecom_customer/clean_dataset_11c.pkl')

In [None]:
import pandas as pd
import numpy as np
from numpy import percentile
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, scale, LabelEncoder
from sklearn.decomposition import PCA

from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import confusion_matrix, roc_curve, classification_report, plot_confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import multiprocessing
from IPython.core.pylabtools import figsize
font_title = {'family': 'times new roman', 'color': 'darkred', 
              'weight': 'bold', 'size': 14}
import warnings
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")

plt.rcParams['figure.dpi'] = 100

In [None]:
# Estimator whose feature importances drive the recursive elimination.
# NOTE(review): LightGBM's docs say `subsample` only takes effect together
# with a positive `subsample_freq`, which is not set here — confirm.
lgb = LGBMClassifier(learning_rate= 0.05, 
                    max_depth= 12, 
                    n_estimators= 1000, 
                    subsample= 0.1)

# Recursive feature elimination scored by accuracy on 5 unshuffled K-fold
# splits, with step=10 passed as the elimination step.
rfe = RFECV(estimator=lgb, step=10, cv=KFold(n_splits=5, shuffle=False), scoring='accuracy', verbose=2)

# Features are every column except the target.
X = telco.drop(['churn'], axis=1)
y = telco['churn']

rfe.fit(X, y)

In [None]:
print('Optimal number of features:', rfe.n_features_)

In [None]:
# Plot the cross-validation score stored for each entry of rfe.grid_scores_.
plt.figure(figsize=(6,4))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
# NOTE(review): x is the 1-based position in grid_scores_; because RFECV was
# built with step=10, a position presumably does not equal the number of
# selected features that the x label claims — confirm.
plt.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
plt.show()

In [None]:
rfecv_cols = [col for col in X.columns[rfe.ranking_ == 1]]
len(rfecv_cols)

In [None]:
rfecv_cols+=['churn']

In [None]:
telco[rfecv_cols].to_pickle('./clean_dataset_11d.pkl')

## Building Models

In [None]:
telco= pd.read_pickle('./clean_dataset_11d.pkl')

In [None]:
# The label ends with '%', so print the mean of `churn` scaled to percent
# rather than as a raw fraction.
# NOTE(review): assumes churn is coded 0/1 so that its mean is the churn rate — confirm.
print("Percentage of Churned Customer:%",
      round(100 * telco.churn.mean(), 3))

In [None]:
X = telco.drop(['churn'], axis=1)
y = telco['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state =42)

In [None]:
sc = RobustScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc= sc.transform(X_test)

In [None]:
cv_acc_train = {}
cv_acc_test = {}
cv_precision = {}
cv_recall = {}
cv_fallout = {}
cv_f1 = {}
cv_AUC = {}

In [None]:
def plot_result(model, name:str):
  """Fit ``model``, cross-validate it, record its metrics and print/plot a report.

  Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
  and stores the results under ``name`` in the module-level ``cv_*``
  dictionaries. For ``name == 'lr'`` the features are standardised first; the
  scaled data is kept local, so the module-level split is left unchanged for
  later models.

  Parameters:
      model: unfitted scikit-learn compatible classifier.
      name: key under which the metrics are stored; 'lr' enables scaling.

  NOTE(review): the "Test Set" scores come from a 10-fold cross-validation
  run on the test split itself (the model is refitted on test folds), not
  from scoring the model fitted on the training split — confirm intended.
  """
  X_tr, X_te = X_train, X_test
  if name=='lr':
    sc = StandardScaler()
    X_tr = sc.fit_transform(X_train)
    X_te = sc.transform(X_test)

  model.fit(X_tr, y_train)
  y_pred = model.predict(X_te)

  # Evaluation based on a 10-fold cross-validation
  scores = cross_validate(model, X_te, y_test, 
                        scoring=['balanced_accuracy','precision','recall','f1','roc_auc'], cv=10)

  # Average each metric over the folds and read it by name rather than by
  # position in the result dictionary.
  mean_scores = pd.DataFrame(scores, index = range(1,11)).mean()
  cv_acc_train[name] = cross_val_score(model, X_tr, y_train, cv=10, scoring = 'balanced_accuracy').mean()
  cv_acc_test[name] = mean_scores['test_balanced_accuracy']
  cv_precision[name] = mean_scores['test_precision']
  cv_recall[name] = mean_scores['test_recall']
  cv_f1[name] = mean_scores['test_f1']
  cv_AUC[name] = mean_scores['test_roc_auc']

  # Fall-out: second entry of the confusion matrix's first row divided by
  # the sum of that row.
  cm = confusion_matrix(y_test, y_pred)
  cv_fallout[name] = cm[0][1] / cm[0].sum()

  # accuracy scores
  print('Average Balanced Accuracy (CV=10), Test Set:', cv_acc_test[name])  
  print('Average Balanced Accuracy (CV=10), Training Set: ', cv_acc_train[name])

  # print classification report
  print(classification_report(y_test, y_pred, zero_division=0))

  # Plot Confusion Matrix
  plot_confusion_matrix(model, X_te, y_test, values_format='d')
  plt.grid(False)
  plt.show()

def get_metrics():
  """Collect the stored per-model metrics into one DataFrame, rounded to 3 decimals.

  Reads the module-level ``cv_*`` dictionaries; model names are the keys of
  ``cv_acc_test``.
  """
  metric_stores = [
      ('bal_acc_train', cv_acc_train),
      ('bal_acc_test', cv_acc_test),
      ('precision', cv_precision),
      ('recall', cv_recall),
      ('fallout', cv_fallout),
      ('f1', cv_f1),
      ('AUC', cv_AUC),
  ]
  data = {'model': list(cv_acc_test.keys())}
  for column, store in metric_stores:
    data[column] = list(store.values())

  df_eval = pd.DataFrame(data=data).round(3)
  return df_eval

### LightGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# lgb_params = {"n_estimators": [500,1000],
#              "subsample":[0.1],
#              "max_depth":[12,15],
#              "learning_rate":[0.1,0.05]}

In [None]:
# lgb_grid= GridSearchCV(lgb, lgb_params, cv = 5, 
#                             n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
# lgb_grid.best_params_

In [None]:
lgb = LGBMClassifier(learning_rate= 0.05, 
                    max_depth= 12, 
                    n_estimators= 1000, 
                    subsample= 0.1)

plot_result(lgb, "lgb")

In [None]:
# Cross Validation Kfold=10
get_metrics()

### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
xgb= XGBClassifier()
plot_result(xgb, "xgb")

In [None]:
get_metrics()

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
plot_result(rfc, "rfc")

In [None]:
get_metrics()

### Logistic Regression

In [None]:
lr=LogisticRegression()
plot_result(lr, "lr")

In [None]:
get_metrics()

### Feature Importance for XGBoost

In [None]:
feature_imp = pd.Series(xgb.feature_importances_,
                        index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(18,12))
sns.barplot(x=feature_imp[:50], y=feature_imp[:50].index)
plt.title("Feature Importance")
plt.show()

### Feature Importance for Random Forest

In [None]:
feature_imp = pd.Series(rfc.feature_importances_,
                        index=X.columns).sort_values(ascending=False)

plt.figure(figsize=(18,12))
sns.barplot(x=feature_imp[:50], y=feature_imp[:50].index)
plt.title("Feature Importance")
plt.show()

### Feature Importance for LightGBM

In [None]:
feature_imp = pd.Series(lgb.feature_importances_,
                        index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(18,12))
sns.barplot(x=feature_imp[:50], y=feature_imp[:50].index)
plt.title("Feature Importance")
plt.show()

## Compare Models

In [None]:
df_eval = get_metrics()
df_eval

In [None]:
def labels(ax):
    """Write each bar's width, formatted to 3 decimals, at the end of the bar.

    The text starts at x = bar width, is vertically centred on the bar and is
    left-aligned.
    """
    for bar in ax.patches:
        length = bar.get_width()                       # bar length along x
        y_middle = bar.get_y() + bar.get_height() / 2  # vertical centre of the bar
        ax.text(length, y_middle, '{:1.3f}'.format(length),
                ha = 'left', va = 'center')

# One horizontal bar chart per metric, with the models sorted from highest to
# lowest value of that metric.
for i,col in enumerate(['bal_acc_test','recall','f1','AUC']):
  plt.subplot(4,1,i+1)  # slot i+1 of a grid of 4 rows x 1 column
  ax = sns.barplot(x=col, y="model", data=df_eval.sort_values(by=col, ascending=False), palette="Blues_d")
  labels(ax)  # write each bar's value at its end
  # NOTE(review): show() is called inside the loop, so each metric is
  # presumably rendered as a separate figure — confirm this is intended.
  plt.show()

In [None]:
sns.relplot(x="f1", y="AUC", hue="model", size="bal_acc_test", sizes=(40, 400), 
            alpha=1, palette="bright", height=4, legend='full', data=get_metrics());

LightGBM is the best model. Let's save it.

### Saving Model

In [None]:
import pickle
import pandas as pd
# Use a context manager so the file handle is closed (and flushed) once the
# fitted model has been written.
with open("./LightGBM.pkl", "wb") as model_file:
    pickle.dump(lgb, model_file, protocol=4)