In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Let pandas render up to 200 rows so long per-column summaries are not truncated.
pd.options.display.max_rows = 200

In [3]:
# Read the churn CSV from the working directory and preview the first rows.
churn_csv_path = 'telecom_churn_data.csv'
df_churn = pd.read_csv(churn_csv_path)
df_churn.head()

Unnamed: 0,mobile_number,circle_id,loc_og_t2o_mou,std_og_t2o_mou,loc_ic_t2o_mou,last_date_of_month_6,last_date_of_month_7,last_date_of_month_8,last_date_of_month_9,arpu_6,...,sachet_3g_9,fb_user_6,fb_user_7,fb_user_8,fb_user_9,aon,aug_vbc_3g,jul_vbc_3g,jun_vbc_3g,sep_vbc_3g
0,7000842753,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,197.385,...,0,1.0,1.0,1.0,,968,30.4,0.0,101.2,3.58
1,7001865778,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,34.047,...,0,,1.0,1.0,,1006,0.0,0.0,0.0,0.0
2,7001625959,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,167.69,...,0,,,,1.0,1103,0.0,0.0,4.17,0.0
3,7001204172,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,221.338,...,0,,,,,2491,0.0,0.0,0.0,0.0
4,7000142493,109,0.0,0.0,0.0,6/30/2014,7/31/2014,8/31/2014,9/30/2014,261.636,...,0,0.0,,,,1526,0.0,0.0,0.0,0.0


In [4]:
# Per-column summary (dtype and non-null count) for every column.
# `show_counts` is the current name of the former `null_counts` keyword, which
# was deprecated in pandas 1.2 and removed in pandas 2.0.
df_churn.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 226 columns):
mobile_number               99999 non-null int64
circle_id                   99999 non-null int64
loc_og_t2o_mou              98981 non-null float64
std_og_t2o_mou              98981 non-null float64
loc_ic_t2o_mou              98981 non-null float64
last_date_of_month_6        99999 non-null object
last_date_of_month_7        99398 non-null object
last_date_of_month_8        98899 non-null object
last_date_of_month_9        98340 non-null object
arpu_6                      99999 non-null float64
arpu_7                      99999 non-null float64
arpu_8                      99999 non-null float64
arpu_9                      99999 non-null float64
onnet_mou_6                 96062 non-null float64
onnet_mou_7                 96140 non-null float64
onnet_mou_8                 94621 non-null float64
onnet_mou_9                 92254 non-null float64
offnet_mou_6                960

## Data Cleaning


In [5]:
def calc_null_per(df=None):
    """Print the percentage of missing values for each column that has any.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df_churn`` is
        used, so existing ``calc_null_per()`` calls behave as before.

    Returns
    -------
    None
        The percentages are printed rather than returned, so calling this as
        the last expression of a cell produces exactly one output.
    """
    # Fall back to the notebook's working frame when no frame is passed.
    frame = df_churn if df is None else df
    null_per = 100 * frame.isnull().sum() / frame.shape[0]
    # Only columns with at least one missing value are reported.
    print(null_per[null_per > 0])

In [6]:
# Baseline: print the null percentage of every column that has missing values.
calc_null_per()

loc_og_t2o_mou               1.018010
std_og_t2o_mou               1.018010
loc_ic_t2o_mou               1.018010
last_date_of_month_7         0.601006
last_date_of_month_8         1.100011
last_date_of_month_9         1.659017
onnet_mou_6                  3.937039
onnet_mou_7                  3.859039
onnet_mou_8                  5.378054
onnet_mou_9                  7.745077
offnet_mou_6                 3.937039
offnet_mou_7                 3.859039
offnet_mou_8                 5.378054
offnet_mou_9                 7.745077
roam_ic_mou_6                3.937039
roam_ic_mou_7                3.859039
roam_ic_mou_8                5.378054
roam_ic_mou_9                7.745077
roam_og_mou_6                3.937039
roam_og_mou_7                3.859039
roam_og_mou_8                5.378054
roam_og_mou_9                7.745077
loc_og_t2t_mou_6             3.937039
loc_og_t2t_mou_7             3.859039
loc_og_t2t_mou_8             5.378054
loc_og_t2t_mou_9             7.745077
loc_og_t2m_m

### Imputation

In [7]:
def impute_zero_based_on_col(col1, col2, df=None):
    """Set ``col1`` to 0 on every row where both ``col1`` and ``col2`` are null.

    Rows where ``col1`` is null but ``col2`` holds a value are left null, and
    rows where ``col1`` already holds a value are never changed.

    Parameters
    ----------
    col1 : str
        Name of the column to impute. It is overwritten in the frame.
    col2 : str
        Name of the column whose null-ness gates the imputation.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df_churn`` is
        used, so existing two-argument calls behave as before.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Fall back to the notebook's working frame when no frame is passed.
    frame = df_churn if df is None else df
    # Vectorised null test on the two columns; a row-wise ``apply(axis=1)``
    # would build a Series for every row of the whole frame to read two cells.
    both_missing = frame[col1].isnull() & frame[col2].isnull()
    frame[col1] = frame[col1].mask(both_missing, 0)

In [8]:
# Column-name prefixes that get a monthly zero-imputation; each one exists with
# a _6, _7, _8 and _9 suffix.
data_rech_col_prefixes = [
    'total_rech_data',
    'max_rech_data',
    'count_rech_2g',
    'count_rech_3g',
    'av_rech_amt_data',
    'arpu_3g',
    'arpu_2g',
    'night_pck_user',
]

# For every prefix and month, fill the column with 0 on rows where both it and
# that month's date_of_last_rech_data_<month> are null.
for prefix in data_rech_col_prefixes:
    for month in (6, 7, 8, 9):
        impute_zero_based_on_col(
            '{}_{}'.format(prefix, month),
            'date_of_last_rech_data_{}'.format(month),
        )

In [9]:
# Re-check the null percentages after the zero-imputation in the previous cell.
calc_null_per()

loc_og_t2o_mou               1.018010
std_og_t2o_mou               1.018010
loc_ic_t2o_mou               1.018010
last_date_of_month_7         0.601006
last_date_of_month_8         1.100011
last_date_of_month_9         1.659017
onnet_mou_6                  3.937039
onnet_mou_7                  3.859039
onnet_mou_8                  5.378054
onnet_mou_9                  7.745077
offnet_mou_6                 3.937039
offnet_mou_7                 3.859039
offnet_mou_8                 5.378054
offnet_mou_9                 7.745077
roam_ic_mou_6                3.937039
roam_ic_mou_7                3.859039
roam_ic_mou_8                5.378054
roam_ic_mou_9                7.745077
roam_og_mou_6                3.937039
roam_og_mou_7                3.859039
roam_og_mou_8                5.378054
roam_og_mou_9                7.745077
loc_og_t2t_mou_6             3.937039
loc_og_t2t_mou_7             3.859039
loc_og_t2t_mou_8             5.378054
loc_og_t2t_mou_9             7.745077
loc_og_t2m_m

In [10]:
# Remove the date_of_last_rech_data_<month> columns for months 6-9.
last_data_rech_date_cols = [
    'date_of_last_rech_data_{}'.format(month) for month in (6, 7, 8, 9)
]
df_churn = df_churn.drop(last_data_rech_date_cols, axis=1)

In [11]:
# Re-check the null percentages after dropping the date_of_last_rech_data_* columns.
calc_null_per()

loc_og_t2o_mou           1.018010
std_og_t2o_mou           1.018010
loc_ic_t2o_mou           1.018010
last_date_of_month_7     0.601006
last_date_of_month_8     1.100011
last_date_of_month_9     1.659017
onnet_mou_6              3.937039
onnet_mou_7              3.859039
onnet_mou_8              5.378054
onnet_mou_9              7.745077
offnet_mou_6             3.937039
offnet_mou_7             3.859039
offnet_mou_8             5.378054
offnet_mou_9             7.745077
roam_ic_mou_6            3.937039
roam_ic_mou_7            3.859039
roam_ic_mou_8            5.378054
roam_ic_mou_9            7.745077
roam_og_mou_6            3.937039
roam_og_mou_7            3.859039
roam_og_mou_8            5.378054
roam_og_mou_9            7.745077
loc_og_t2t_mou_6         3.937039
loc_og_t2t_mou_7         3.859039
loc_og_t2t_mou_8         5.378054
loc_og_t2t_mou_9         7.745077
loc_og_t2m_mou_6         3.937039
loc_og_t2m_mou_7         3.859039
loc_og_t2m_mou_8         5.378054
loc_og_t2m_mou

#### Drop columns with a high percentage of null values

In [12]:
# One fb_user_* column per month (6-9); each month must appear exactly once,
# otherwise the column for the missing month stays in the frame.
cols_to_drop = ['fb_user_6', 'fb_user_7', 'fb_user_8', 'fb_user_9']
df_churn = df_churn.drop(cols_to_drop, axis=1)

In [13]:
# Re-check the null percentages after the column drop in the previous cell.
calc_null_per()

loc_og_t2o_mou           1.018010
std_og_t2o_mou           1.018010
loc_ic_t2o_mou           1.018010
last_date_of_month_7     0.601006
last_date_of_month_8     1.100011
last_date_of_month_9     1.659017
onnet_mou_6              3.937039
onnet_mou_7              3.859039
onnet_mou_8              5.378054
onnet_mou_9              7.745077
offnet_mou_6             3.937039
offnet_mou_7             3.859039
offnet_mou_8             5.378054
offnet_mou_9             7.745077
roam_ic_mou_6            3.937039
roam_ic_mou_7            3.859039
roam_ic_mou_8            5.378054
roam_ic_mou_9            7.745077
roam_og_mou_6            3.937039
roam_og_mou_7            3.859039
roam_og_mou_8            5.378054
roam_og_mou_9            7.745077
loc_og_t2t_mou_6         3.937039
loc_og_t2t_mou_7         3.859039
loc_og_t2t_mou_8         5.378054
loc_og_t2t_mou_9         7.745077
loc_og_t2m_mou_6         3.937039
loc_og_t2m_mou_7         3.859039
loc_og_t2m_mou_8         5.378054
loc_og_t2m_mou

In [None]:
# TODO(review): unfinished placeholder; `col_to_impute_zero` is never defined.
# Presumably meant to list the columns whose nulls get filled with 0. Complete or delete this cell.
##col_to_impute_zero =

In [None]:
# TODO(review): draft list, commented out and not used anywhere in the visible cells.
# Presumably the columns intended for median imputation. Finish or delete this cell.
#col_to_impute_median = ['onnet_mou_6', 'onnet_mou_7', 'onnet_mou_8', 'onnet_mou_9', 'offnet_mou_6', 'offnet_mou_7',
#                        'offnet_mou_8', 'offnet_mou_9' ]