In [None]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
import warnings
warnings.filterwarnings('ignore')

# Step 1: Sanity checks on Data

In [None]:
df = pd.read_csv('telecom_churn_data.csv')
df.head()

In [None]:
df.info()

In [None]:
# let get the shape of the data frame before we start
df.shape

In [None]:
# let get the shape of the data frame before we start
# currently commenting this out as it is taking a long time for the 1 lakh records.
# df.describe  

# Step 2: Data cleaning and Preparation

## Let us start with Data cleaning and prepare data for analysis

### Start with the treatment of Null Values - more than 74% drop the column, less than 3%, replace null with 0, Rest impute with mean/median/mode according to the type of the column

In [None]:

#finding the null percentage value in the columns
columns = df.columns
percent_missing_Nulls = (df.isnull().sum() * 100) / df.shape[0]

missing_value_df = pd.DataFrame({'column_name': columns,
                                 'percent_missing': percent_missing_Nulls})

#dropping the columns whose missing values is more than 70%
threshold_percentage = 74

filtered_cols = list(missing_value_df[missing_value_df.percent_missing > threshold_percentage].column_name)

df = df.drop(filtered_cols, axis=1)

In [None]:
#checking the data shape
df.shape

In [None]:
df.isnull().sum()

In [None]:
100*df.isnull().sum()/df.shape[0]

In [None]:

    #finding the null percentage value in the columns
columns = df.columns
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': columns,
                                 'percent_missing': percent_missing})

#dropping the columns whose missing values is more than 70%
percentage = 3
filtered_cols = list(missing_value_df[(missing_value_df.percent_missing<percentage) & (missing_value_df.percent_missing>0) ].column_name)

filtered_cols

### <font color = "blue"> Inference : the first 3 are number value, rest are date values and we will have different treatment for dates </font>

In [None]:
### imputing the null value
num_cols = df.select_dtypes('number').columns
num_cols

In [None]:
#imputing the null values with 0
df['loc_og_t2o_mou'].fillna(0.0,inplace=True)
df['std_og_t2o_mou'].fillna(0.0,inplace=True)
df['loc_ic_t2o_mou'].fillna(0.0,inplace=True)


In [None]:
float_cols = df.select_dtypes('float').columns

In [None]:
#filling the last data of each month where null values are present.
df['last_date_of_month_7'].fillna('7/31/2014',inplace=True)
df['last_date_of_month_8'].fillna('8/31/2014',inplace=True)
df['last_date_of_month_9'].fillna('9/30/2014',inplace=True)

In [None]:
def impute_df(df, col):
    if df[col].dtype == "float":
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

In [None]:
##filling the nan values with median value

impute_df(df,'onnet_mou_6')
impute_df(df,'onnet_mou_7')
impute_df(df,'onnet_mou_8')
impute_df(df,'onnet_mou_9')

In [None]:
##filling the nan values with median values

impute_df(df,'offnet_mou_6')
impute_df(df,'offnet_mou_7')
impute_df(df,'offnet_mou_8')
impute_df(df,'offnet_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'roam_ic_mou_6')
impute_df(df,'roam_ic_mou_7')
impute_df(df,'roam_ic_mou_8')
impute_df(df,'roam_ic_mou_9')

In [None]:
##filling the nan values with median values

impute_df(df,'roam_og_mou_6')
impute_df(df,'roam_og_mou_7')
impute_df(df,'roam_og_mou_8')
impute_df(df,'roam_og_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2t_mou_6')
impute_df(df,'loc_og_t2t_mou_7')
impute_df(df,'loc_og_t2t_mou_8')
impute_df(df,'loc_og_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2t_mou_6')
impute_df(df,'loc_og_t2t_mou_7')
impute_df(df,'loc_og_t2t_mou_8')
impute_df(df,'loc_og_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2t_mou_6')
impute_df(df,'loc_og_t2t_mou_7')
impute_df(df,'loc_og_t2t_mou_8')
impute_df(df,'loc_og_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2m_mou_6')
impute_df(df,'loc_og_t2m_mou_7')
impute_df(df,'loc_og_t2m_mou_8')
impute_df(df,'loc_og_t2m_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2f_mou_6')
impute_df(df,'loc_og_t2f_mou_7')
impute_df(df,'loc_og_t2f_mou_8')
impute_df(df,'loc_og_t2f_mou_9')

In [None]:
##filling the nan values with median value
impute_df(df,'loc_og_t2f_mou_6')
impute_df(df,'loc_og_t2f_mou_7')
impute_df(df,'loc_og_t2f_mou_8')
impute_df(df,'loc_og_t2f_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_og_t2c_mou_6')
impute_df(df,'loc_og_t2c_mou_7')
impute_df(df,'loc_og_t2c_mou_8')
impute_df(df,'loc_og_t2c_mou_9')

In [None]:
##filling the nan values with median values

impute_df(df,'loc_og_mou_6')
impute_df(df,'loc_og_mou_7')
impute_df(df,'loc_og_mou_8')
impute_df(df,'loc_og_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_t2t_mou_6')
impute_df(df,'std_og_t2t_mou_7')
impute_df(df,'std_og_t2t_mou_8')
impute_df(df,'std_og_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_t2t_mou_6')
impute_df(df,'std_og_t2t_mou_7')
impute_df(df,'std_og_t2t_mou_8')
impute_df(df,'std_og_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_t2m_mou_6')
impute_df(df,'std_og_t2m_mou_7')
impute_df(df,'std_og_t2m_mou_8')
impute_df(df,'std_og_t2m_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_t2f_mou_6')
impute_df(df,'std_og_t2f_mou_7')
impute_df(df,'std_og_t2f_mou_8')
impute_df(df,'std_og_t2f_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_t2c_mou_6')
impute_df(df,'std_og_t2c_mou_7')
impute_df(df,'std_og_t2c_mou_8')
impute_df(df,'std_og_t2c_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_og_mou_6')
impute_df(df,'std_og_mou_7')
impute_df(df,'std_og_mou_8')
impute_df(df,'std_og_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'isd_og_mou_6')
impute_df(df,'isd_og_mou_7')
impute_df(df,'isd_og_mou_8')
impute_df(df,'isd_og_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'spl_og_mou_6')
impute_df(df,'spl_og_mou_7')
impute_df(df,'spl_og_mou_8')
impute_df(df,'spl_og_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'og_others_6')
impute_df(df,'og_others_7')
impute_df(df,'og_others_8')
impute_df(df,'og_others_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_ic_t2t_mou_6')
impute_df(df,'loc_ic_t2t_mou_7')
impute_df(df,'loc_ic_t2t_mou_8')
impute_df(df,'loc_ic_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_ic_t2m_mou_6')
impute_df(df,'loc_ic_t2m_mou_7')
impute_df(df,'loc_ic_t2m_mou_8')
impute_df(df,'loc_ic_t2m_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_ic_t2f_mou_6')
impute_df(df,'loc_ic_t2f_mou_7')
impute_df(df,'loc_ic_t2f_mou_8')
impute_df(df,'loc_ic_t2f_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'loc_ic_mou_6')
impute_df(df,'loc_ic_mou_7')
impute_df(df,'loc_ic_mou_8')
impute_df(df,'loc_ic_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_t2t_mou_6')
impute_df(df,'std_ic_t2t_mou_7')
impute_df(df,'std_ic_t2t_mou_8')
impute_df(df,'std_ic_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_t2m_mou_6')
impute_df(df,'std_ic_t2m_mou_7')
impute_df(df,'std_ic_t2m_mou_8')
impute_df(df,'std_ic_t2m_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_t2f_mou_6')
impute_df(df,'std_ic_t2f_mou_7')
impute_df(df,'std_ic_t2f_mou_8')
impute_df(df,'std_ic_t2f_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_t2t_mou_6')
impute_df(df,'std_ic_t2t_mou_7')
impute_df(df,'std_ic_t2t_mou_8')
impute_df(df,'std_ic_t2t_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_t2o_mou_6')
impute_df(df,'std_ic_t2o_mou_7')
impute_df(df,'std_ic_t2o_mou_8')
impute_df(df,'std_ic_t2o_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'std_ic_mou_6')
impute_df(df,'std_ic_mou_7')
impute_df(df,'std_ic_mou_8')
impute_df(df,'std_ic_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'spl_ic_mou_6')
impute_df(df,'spl_ic_mou_7')
impute_df(df,'spl_ic_mou_8')
impute_df(df,'spl_ic_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'isd_ic_mou_6')
impute_df(df,'isd_ic_mou_7')
impute_df(df,'isd_ic_mou_8')
impute_df(df,'isd_ic_mou_9')

In [None]:
##filling the nan values with median values
impute_df(df,'ic_others_6')
impute_df(df,'ic_others_7')
impute_df(df,'ic_others_8')
impute_df(df,'ic_others_9')

In [None]:
#computing the last days of each month
df['date_of_last_rech_6'].fillna('6/30/2014',inplace=True)
df['date_of_last_rech_7'].fillna('7/31/2014',inplace=True)
df['date_of_last_rech_8'].fillna('8/31/2014',inplace=True)
df['date_of_last_rech_9'].fillna('9/30/2014',inplace=True)

In [None]:
##dropping circle_id
df.drop(columns=['circle_id'],inplace=True)

In [None]:
##converting the column mobile_number to object
df['mobile_number'] = df['mobile_number'].astype('object')

### Deriving Variables

In [None]:
##finding the average recharge amount for the month 6 and 7
df['avg_rech_amt_6_7'] = (df['total_rech_amt_6']+df['total_rech_amt_7'])/2

In [None]:
### finding the average revenue generated per user for the month of 6 and 7

df['avg_arpu_6_7'] = round((df['arpu_6']+df['arpu_7'])/2,2)

In [None]:
cut_off = df['avg_rech_amt_6_7'].quantile(.7)

In [None]:
df['HVC'] = df['avg_rech_amt_6_7'].map(lambda x: 1 if x > cut_off else 0)


In [None]:
df.HVC.sum()#checkpoint reached

In [None]:
df_hvc = df[df['HVC'] == 1]

In [None]:
df_hvc.shape

In [None]:
def flag_df(df):
    if(df['total_ic_mou_9'] == 0) and (df['total_og_mou_9'] == 0) and (df['vol_2g_mb_9'] == 0) and (df['vol_3g_mb_9'] == 0):
        return 1 
    else:
        return 0

    
df_hvc['Churn'] = df_hvc.apply(flag_df,axis=1)

In [None]:
df_hvc['Churn'].value_counts()

#### Inference: the number of churn cases is 2539

## <font color = "red"> Inference: This seems to be a very imbalanced data. So, we first need to get a balanced data for deriving a model. In the next few steps we shall balance the record </font>

In [None]:
# get the churn counts
churn_count_0, churn_count_1 = df_hvc['Churn'].value_counts()
churn_count_0, churn_count_1 

# Separate class

churn_class_0 = df_hvc[df_hvc['Churn'] == 0]
churn_class_1 = df_hvc[df_hvc['Churn'] == 1] # print the shape of the class

print('class 0:', churn_class_0.shape)
print('class 1:', churn_class_1.shape)

# take a random szmple of the size of the number of churn records
churn_class_0_cut = churn_class_0.sample(churn_count_1)

df_hvc_balanced = pd.concat([churn_class_0_cut, churn_class_1], axis=0)

print("total class of 1 and0:",df_hvc_balanced['Churn'].value_counts())# plot the count after under-sampeling

df_hvc_balanced['Churn'].value_counts().plot(kind='bar', title='count (target)')



In [None]:
df_hvc_balanced.head()

In [None]:
#churn customers
df_hvc_balanced[['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9','Churn']]

In [None]:
###removing all the columns of the _9 month
df_hvc_balanced = df_hvc_balanced[df_hvc_balanced.columns.drop(list(df_hvc_balanced.filter(regex='_9')))]

In [None]:
df_hvc_balanced.shape

In [None]:
df_hvc_balanced.head()

### look for object type columns as they cannot be used in model building, and keep only numeric variables.

In [None]:
df_hvc_balanced.isnull().sum()

obj_cols = df_hvc_balanced.select_dtypes('object').columns
obj_cols

df_hvc_balanced.drop(columns=obj_cols,inplace=True)



### look for NaN values and replace by 0

In [None]:

df_hvc_balanced.isna().any()

In [None]:
df_hvc_balanced['total_rech_data_8'] = df_hvc_balanced['total_rech_data_8'].fillna(0)
df_hvc_balanced['max_rech_data_8'] = df_hvc_balanced['max_rech_data_8'].fillna(0)
df_hvc_balanced['count_rech_2g_8'] = df_hvc_balanced['count_rech_2g_8'].fillna(0)
df_hvc_balanced['count_rech_3g_8'] = df_hvc_balanced['count_rech_3g_8'].fillna(0)
df_hvc_balanced['av_rech_amt_data_8'] = df_hvc_balanced['av_rech_amt_data_8'].fillna(0)
df_hvc_balanced['arpu_3g_8'] = df_hvc_balanced['arpu_3g_8'].fillna(0)
df_hvc_balanced['night_pck_user_8'] = df_hvc_balanced['night_pck_user_8'].fillna(0)
df_hvc_balanced['arpu_2g_8'] = df_hvc_balanced['arpu_2g_8'].fillna(0)
df_hvc_balanced['fb_user_8'] = df_hvc_balanced['fb_user_8'].fillna(0)
df_hvc_balanced.isnull().sum().sum()


# Step 3:  EDA

In [None]:
# lets start analysis of the data
# let us check how many HV customers and also what is the relationship between churn and HVC
df["HVC"].value_counts().plot.barh()
df["HVC"].value_counts(normalize=True)


### <font color = "blue" > Inference : we need to concentrate on 30% of the customers who can be termed high value </font>

In [None]:
# next let us look at what is the relationship between churn and HVC
# let us check how many Churn customers and also what is the relationship between churn and HVC
df_hvc_balanced["Churn"].value_counts().plot.barh()
df_hvc_balanced["Churn"].value_counts()


### <font color = "blue" > Inference :  We have a balanced dataset for building a model. next lets check for outliers and other preliminary data which could give us some insights</font>

In [None]:
# Checking for outliers in the continuous variables
temp_columns = df_hvc_balanced[['avg_arpu_6_7','avg_rech_amt_6_7']]
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
temp_columns.describe(percentiles=[.25, .5, .75, .90, .95, .99])

### <font color = "blue"> Inference : Among the HVC itself there is a huge variation. there are chances of outliers - we will put a box plot to analyse this. </font>

In [None]:

sns.boxplot( y=df_hvc_balanced["avg_arpu_6_7"] );
plt.show()

In [None]:

sns.boxplot( y=df_hvc_balanced["avg_rech_amt_6_7"] );
plt.show()

### <font color = "blue"> Inference :There seems to some outliers and we really need to remove those records so that they do not skew the analysis</font>

In [None]:
df_hvc_balanced = df_hvc_balanced[df_hvc_balanced['avg_rech_amt_6_7']  <= 5000]  

In [None]:
# Checking for outliers in the continuous variables
temp_columns = df_hvc_balanced[['avg_arpu_6_7','avg_rech_amt_6_7']]
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
temp_columns.describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:

sns.boxplot( y=df_hvc_balanced["avg_rech_amt_6_7"] );
plt.show()

In [None]:

sns.histplot(data=df_hvc_balanced, x="avg_rech_amt_6_7")
plt.show()

### <font color = "blue" > Inference : there are very less records beyond 2000, so we will remove them too, we will try with the value counts by binning them, then take the call of deleting, if the records are very less in number we shall remove them </font>

In [None]:
df_hvc_balanced["avg_rech_amt_6_7"].value_counts(bins=20)

In [None]:
df_hvc_balanced = df_hvc_balanced[df_hvc_balanced['avg_rech_amt_6_7']  <= 2000]  

In [None]:
df_hvc_balanced["avg_rech_amt_6_7"].value_counts(bins=200)

In [None]:

sns.boxplot( y=df_hvc_balanced["avg_rech_amt_6_7"].value_counts(bins=200));
plt.show()

### <font color = "blue" > Inference : now the data looks good. there will be people who recharge for near 2000, so let us have these records for further processing </font>

In [None]:
plt.figure(figsize = (20,10))
sns.heatmap(df_hvc_balanced.corr(),annot = True)
plt.show()

### <font color = "blue"> Inference : Nothing is infereable from this - we need to reduce the number of variables. it is better we perfrom tis after we find the top 10-15 variables using the rfe model </font>
    

In [None]:
# Checking for outliers in the continuous variables
out_check = df_hvc_balanced[['avg_rech_amt_6_7','avg_arpu_6_7']]
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
out_check.describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:
# Data Cleaning - outlier, Imputation, null values removal - (rishab) 
# Data Preparation - derived variables(hvc, churn, arpu) - (rishab)
# EDA (univariate,bivariate, multivariate) (arthi)
# model - (Logistic(arthi) , decision trees(arthi), random forest (rishab))

### Step 4: Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variable to X
X = df_hvc_balanced.drop(['Churn'], axis=1)

X.head()

In [None]:
# Putting response variable to y
y = df_hvc_balanced['Churn']

y.head()

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Step 5: Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

### Step 6: Model Building

In [None]:
import statsmodels.api as sm

In [None]:
df_hvc_balanced.shape

In [None]:
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

### Step 7: Feature Selection Using RFE

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

from sklearn.feature_selection import RFE
rfe = RFE(logreg, 15)             # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)
rfe.support_

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
X_train.columns[~rfe.support_]

In [None]:
# assesing the stats model

X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
plt.figure(figsize = (20,10))
sns.heatmap(X_train_sm.corr(),annot = True)
plt.show()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Churn':y_train.values, 'Churn_Prob':y_train_pred})
y_train_pred_final['mobile_number'] = y_train.index
y_train_pred_final.head()

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
y_train_pred_final.head(50)

## Step 8 : checking the confusion matirx and the other scores of accuracy, sensitivity, specificity

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

### <font color="blue"> Inference : Thats a good level of accuracy. but we will still check th vif and see if some variables can be removed to get a better predictability </font>


#### Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

#### <font color="blue"> Inference:  There are a few variables with high VIF. It's best to drop these variables as they aren't helping much with prediction and unnecessarily making the model complex. So let's start by dropping that. </font>

In [None]:
#col = col.drop(['total_ic_mou_8','loc_ic_mou_8','total_og_mou_8','std_ic_mou_8','offnet_mou_8'], 1)
cols=[]

for i in range(0,vif.shape[0]):
    if vif['VIF'][i] > 20:
        cols.append(vif['Features'][i])
        

col = col.drop(cols, 1)

col


In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

y_train_pred[:10]




y_train_pred_final['Churn_Prob'] = y_train_pred
# Creating new column 'predicted' with 1 if Churn_Prob > 0.5 else 0
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


### there seems to be no more impacts and multicollinearity - so we can proceeed with this split.

In [None]:
# Let's check the overall accuracy.
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

### <font color ="blue">there seems to be no more impacts and multicollinearity - so we can proceeed with this split.</font>

### Metrics beyond simply accuracy

In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
print(" Sensitivity (Recall): " , TP / float(TP+FN))
# Calculate false postive rate - predicting churn when customer does not have churned
print(" False positive rate:" , FP / float(TN+FP))
# positive predictive value 
print (" Precision ", TP / float(TP+FP))
# Negative predictive value
print (" True negatives rate:",TN / float(TN+ FN))

## Step 9: Plotting ROC, Tradeoffs, and Threshold cutoff

### Plotting the ROC 
An ROC curve demonstrates several things:

- It shows the tradeoff between sensitivity and specificity (any increase in sensitivity will be accompanied by a decrease in specificity).
- The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.
- The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.


In [None]:
def draw_roc( actual, probs ):
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None

fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Churn, y_train_pred_final.Churn_Prob, drop_intermediate = False )

draw_roc(y_train_pred_final.Churn, y_train_pred_final.Churn_Prob)

### Step 10: Finding Optimal Cutoff Point
Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# TP = confusion[1,1] # true positive 
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final[i] )
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

### <font color="blue"> inference :  the best results can be seen at between .5 and .6 where the sensitivty and specificty is .83,.73 and .77 and .83 respectiviely. </font>

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### <font color="blue"> Inference  : From the curve above, ~.55 is the optimum point to take it as a cutoff probability. </font>

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Churn_Prob.map( lambda x: 1 if x > 0.55 else 0)

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.final_predicted)

confusion2 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.final_predicted )
confusion2

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

Recall = TP / float(TP+FN)
Precision = TP / float(TP+FP)

# Let's see the sensitivity of our logistic regression model
print(" Sensitivity (Recall): " , Recall)
# Calculate false postive rate - predicting churn when customer does not have churned
print(" False positive rate:" , FP / float(TN+FP))
# positive predictive value 
print (" Precision :", Precision)
# Negative predictive value
print (" True negatives rate:",TN / float(TN+ FN))

In [None]:
## calclulate the F1 score
F1 = 2 * (Precision * Recall)/(Precision + Recall)
F1

### <font color="blue"> Inference : Not a bad F-score ! looks like the model is working </font>

### Recall to Precision Tradeoff

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

In [None]:
print("Precision score: ", precision_score(y_train_pred_final.Churn, y_train_pred_final.predicted))
print("Recall score :", recall_score(y_train_pred_final.Churn, y_train_pred_final.predicted))
y_train_pred_final.Churn, y_train_pred_final.predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Churn, y_train_pred_final.Churn_Prob)

In [None]:
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.show()

### <font color="blue"> Inference :  the thresholds correlate to ~.55  </font>

## Step 10: Making predictions on the test set

In [None]:
X_test_temp = X_test[col]
X_test_temp.head()

In [None]:
X_test_sm = sm.add_constant(X_test_temp)
y_test_pred = res.predict(X_test_sm)
y_test_pred[:10]

In [None]:

# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Putting CustID to index
y_test_df['mobile_number'] = y_test_df.index

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Churn_Prob'})

In [None]:
# Rearranging the columns
y_pred_final = y_pred_final.reindex(['mobile_number','Churn','Churn_Prob'], axis=1)

y_pred_final.head()

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.42 else 0)
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.Churn, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Churn, y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
print(" Sensitivity (Recall): " , TP / float(TP+FN))
# Calculate false postive rate - predicting churn when customer does not have churned
print(" False positive rate:" , FP / float(TN+FP))
# positive predictive value 
print (" Precision ", TP / float(TP+FP))
# Negative predictive value
print (" True negatives rate:",TN / float(TN+ FN))

### <font color="blue"> Inference : the current model is has a good prediction stats of 88% Sensitivity and True negatives detection as 85%  and a precision of 72%  and the variables that make a difference are  </font>
    

total_rech_num_8	
total_rech_num_7	
total_ic_mou_8	
std_ic_mou_8	
isd_ic_mou_8	
roam_og_mou_8	
ic_others_8	
sep_vbc_3g	
    
And with this model applied to the records and the prob threshold value of more than .55.

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(max_depth=3)


In [None]:
dt.fit(X_train, y_train)

In [None]:
# Importing required packages for visualization
import os

os.environ['PATH'] = os.environ['PATH']

from IPython.display import Image  
#from sklearn.externals.six import StringIO  
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus, graphviz


In [None]:
# plotting tree with max_depth=3
dot_data = StringIO()  

export_graphviz(dt, out_file=dot_data, filled=True, rounded=True,
                feature_names=X.columns, 
                class_names=['Churn', "No Churn"])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
#Image(graph.create_png(),width=800,height=900)
#graph.write_pdf("dt_heartdisease.pdf")

## INference:
1. the main variables that helps us decide are:
- total_ic_mou_8
- last_day_rch_amt_8
- total_og_mou_8
- total_rech_amt_8
- loc_og_mou_8
- roam_og_mou_8
- arpu_2g_8

####  paths to take

if we look at this tree, to find the cases for churn the path to be followed are:
- total_ic_mou_8 > 39.45, roam_og_mou_8<=0.015, total_rech_amt_8 > 267

and for non churns
- total_ic_mou_8 <=39.45, arpu_8 <=289.605, total_ic_mou_8 <= 3.74

### <font color = "blue"> these are based on the fact gini gains between the nodes </font>

### Let us now check on the feature importances 

In [None]:
dt.feature_importances_

In [None]:
imp_df = pd.DataFrame({
    "Var name":X_train.columns,
    "Imp":dt.feature_importances_
})

imp_df.sort_values(by="Imp",ascending=False)

### <font color = "blue"> Inference: the top features for the model can be considered from this list </font>

## let us move on to find the accuracy and the outcome of the confusion matrix

In [None]:
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
print(accuracy_score(y_train, y_train_pred))
confusion = confusion_matrix(y_train, y_train_pred)



In [None]:
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

Recall = TP / float(TP+FN)
Precision = TP / float(TP+FP)

# Let's see the sensitivity of our logistic regression model
print(" Sensitivity (Recall): " , Recall)
# Calculate false postive rate - predicting churn when customer does not have churned
print(" False positive rate:" , FP / float(TN+FP))
# positive predictive value 
print (" Precision :", Precision)
# Negative predictive value
print (" True negatives rate:",TN / float(TN+ FN))

### <font color="blue"> Inference : This model has a good sensitivity and precision and able to predict true positives, te=rue negative very well. We shall check on thr test data perfromance too and then move on to checking if the tuning of hyper parameters give even better results. </font>

In [None]:
print(accuracy_score(y_test, y_test_pred))
confusion = confusion_matrix(y_test, y_test_pred)

confusion 
TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

Recall = TP / float(TP+FN)
Precision = TP / float(TP+FP)

# Let's see the sensitivity of our logistic regression model
print(" Sensitivity (Recall): " , Recall)
# Calculate false postive rate - predicting churn when customer does not have churned
print(" False positive rate:" , FP / float(TN+FP))
# positive predictive value 
print (" Precision :", Precision)
# Negative predictive value
print (" True negatives rate:",TN / float(TN+ FN))

### <font color = "blue"> Inference : this model is good in predicting the churn vs no churns for the test data too very well</font>

In [None]:
y_train.value_counts()

### <font color="blue"> Let us calculate the impurity measures </font>

Let us find the classifcation of errors
Churn = 0    no of cases - 18930
Churn = 1    no of cases - 1745



p(0) = 18930/20675 = .91
p(1) = 1745/20675 = .08

Since P(max) here is .91 : we can say that classification error could possibly become (1-.91) = .09. the chance of wrongly predicting a wrong no churn is .09.. this means incase we are saying all of them will not churn, we could possibly be wrong .09 % of times


## Let us start playing with the hyperparameters. 

In [None]:
def get_dt_graph(dt_classifier):
    dot_data = StringIO()
    export_graphviz(dt_classifier, out_file=dot_data, filled=True,rounded=True,
                    feature_names=X.columns, 
                    class_names=['Churn', "No Churn"])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph

In [None]:
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

### <font color = "violet"> Case 1 : all default parameters </font>

In [None]:
dt_default = DecisionTreeClassifier(random_state=42)
dt_default.fit(X_train, y_train)

In [None]:
gph = get_dt_graph(dt_default)
Image(gph.create_png())

In [None]:
evaluate_model(dt_default)

### <font color="blue"> Inference : this is a overfit. iwth train accuracy as 1. but test accuracy is good enough at .78. we will try the other parameters now</font>

## <font color="violet"> Case 2 - max_depth = 3 </font>

In [None]:
dt_depth = DecisionTreeClassifier(max_depth=3)
dt_depth.fit(X_train, y_train)

In [None]:
gph = get_dt_graph(dt_depth) 
Image(gph.create_png())

In [None]:
evaluate_model(dt_depth)

## <font color="violet"> Case 3 - max_samples_split = 20 </font>

In [None]:
dt_min_split = DecisionTreeClassifier(min_samples_split=20)
dt_min_split.fit(X_train, y_train)

In [None]:
gph = get_dt_graph(dt_min_split) 
Image(gph.create_png())

In [None]:
evaluate_model(dt_min_split)

## <font color="violet"> Case 4 - max_samples_leaf = 20 </font>

In [None]:
dt_min_leaf = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt_min_leaf.fit(X_train, y_train)

In [None]:
gph = get_dt_graph(dt_min_leaf)
Image(gph.create_png())

In [None]:
evaluate_model(dt_min_leaf)

### Using Entropy instead of Gini

## <font color="violet"> Case 5 - for the impurity checks, instead of gini let us play with the entropy values </font>

In [None]:
dt_min_leaf_entropy = DecisionTreeClassifier(min_samples_leaf=20, random_state=42, criterion="entropy")
dt_min_leaf_entropy.fit(X_train, y_train)

In [None]:
gph = get_dt_graph(dt_min_leaf_entropy)
Image(gph.create_png())

In [None]:
evaluate_model(dt_min_leaf_entropy)

## <font color="violet"> Case 6 -- enough of trying one by one, let us try all the posible values for the hyperparamter vaiables and check on the results once for all using the gridsearch </font>


In [None]:
dt = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [None]:
%%time
grid_search.fit(X_train, y_train)

In [None]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

In [None]:
score_df.nlargest(5,"mean_test_score")

In [None]:
grid_search.best_estimator_

In [None]:
dt_best = grid_search.best_estimator_
dt_best

In [None]:
evaluate_model(dt_best)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, dt_best.predict(X_test)))

In [None]:
gph = get_dt_graph(dt_best)
Image(gph.create_png())

## <font color="blue"> Final inference: this tree with depth 5 and sample_leaf of 10 gives the best results. let us now again see which are the varibles that matter </font>

the variables that now matter are 
- total_ic_mou_8
- arpu_8
- vol_2g_mb_8
- Onnet_mou_8
- aon
- last_day_rch_amt_8
- total_rech_amt_8
- roam_og_mou_8

for the non churns the best path to take:
total_ic_mou_8 <=39.475 and arp_8<= 289.685 and total_ic_mou_8<=3.74 and vol_2g_mb_8 <= . 395 and onnet_mou_8 >.036

for churns
total_ic_mou_8 > 39.475 and roam_og_mou_8 <= .05 and total_rech_amt_8 > 267 and last_day_rch_amt_8 <=3.5 and aon > 417.5

# Random Forest


In [None]:

#importing libraries

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=3,oob_score=True)


In [None]:
X_train.head()

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf.estimators_[0]

In [None]:
from sklearn.metrics import plot_roc_curve
plot_roc_curve(rf,X_train,y_train)

In [None]:
sample_tree = rf.estimators_[4]

In [None]:
gph = get_dt_graph(sample_tree)
Image(gph.create_png(), width=700, height=700)

In [None]:
gph = get_dt_graph(rf.estimators_[2])
Image(gph.create_png(), width=700, height=700)

In [None]:
evaluate_model(rf)

#### Grid search for hyper-parameter tuning

In [None]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1,oob_score=True)

In [None]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [5,10,15,20,25]
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [None]:
%%time
grid_search.fit(X,y)

In [None]:
rf_best = grid_search.best_estimator_
rf_best

In [None]:
rf_best.oob_score_

In [None]:
plot_roc_curve(rf_best,X_train,y_train)

In [None]:
evaluate_model(rf_best)

In [None]:
sample_tree = rf_best.estimators_[0]
sample_tree

In [None]:
gph = get_dt_graph(sample_tree)
Image(gph.create_png())

In [None]:
gph = get_dt_graph(rf_best.estimators_[0])
Image(gph.create_png(), height=600, width=600)

In [None]:
gph = get_dt_graph(rf_best.estimators_[10])
Image(gph.create_png(), height=600, width=600)

In [None]:
print(classification_report(y_test, rf_best.predict(X_test)))

### Variable importance in RandomForest and Decision trees

In [None]:
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_
})

In [None]:
imp_df.sort_values(by="Imp", ascending=False,inplace=True)
imp_df.head(15)

## inference :  the following are the most important features that can help us predict the churn
107	max_rech_amt_8	0.058130
32	loc_og_mou_8	0.054670
110	last_day_rch_amt_8	0.047455
101	total_rech_num_8	0.043871
14	roam_ic_mou_8	0.040802
71	loc_ic_mou_8	0.035608
20	loc_og_t2t_mou_8	0.033673
86	std_ic_mou_8	0.033327
65	loc_ic_t2m_mou_8	0.033213

## <font color="Green"> Final word: With the 3 models we notice that the Random forests are much more reliable and has the best prediction as compared to the Decision tree or logistics regression. </font>
    
   ## Also, the balancing of the data was very important for getting the balanced results.