## Group Case Study : Telecom Churn

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
pd.options.display.max_columns = None


In [None]:
raw_data= pd.read_csv('../input/telecom-churn-data-set-for-the-south-asian-market/telecom_churn_data.csv')

In [None]:
raw_data.head()

In [None]:
print('Data has :{} Rows and :{} Columns'.format(raw_data.shape[0],raw_data.shape[1]))
num_col=raw_data.select_dtypes(exclude='O').columns
cat_col=raw_data.select_dtypes('O').columns

In [None]:
raw_data.describe()

In [None]:
def df_info(df,sort_key,flag=False):
    """Build a per-column summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    sort_key : str
        Name of the summary column to sort by ('Total', 'Percent',
        'Unique' or 'DataType').
    flag : bool, default False
        Forwarded to ``sort_values(ascending=...)``; the default sorts
        in descending order.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with:
        Total (null count), Percent (null percentage), Unique (distinct
        non-null values), DataType (dtype) and Var_Type ('Categorical'
        when fewer than 20 distinct values or object dtype, otherwise
        'Continious').
    """
    count_null=df.isnull().sum()
    null_per=100*(count_null/len(df))
    n_unique=df.nunique()
    data_types=df.dtypes
    # One key per concatenated Series; Var_Type is derived afterwards, so it
    # must not appear in ``keys``.
    res=pd.concat([count_null,null_per,n_unique,data_types],axis=1,keys=['Total','Percent','Unique','DataType']).sort_values(by=sort_key,ascending=flag)
    res['Var_Type']=np.where(res['Unique']<20,"Categorical",np.where(res['DataType']=='object',"Categorical","Continious"))
    return res
def df_thrs(df,col,thrs):
    """Return the rows of ``df`` whose value in ``col`` is at least ``thrs``."""
    keep_mask=df[col].ge(thrs)
    return df.loc[keep_mask]

def bar_chart(df,features):
    """Plot the mean of each feature for churned vs. retained customers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``churn`` column holding 1 (churn) / 0 (no churn).
    features : list-like of str
        Columns of ``df`` to average and plot.

    Draws a grouped bar chart with one group per feature and one bar per
    class ('Churn', 'No Churn').
    """
    churn_means=df.loc[df.churn==1,features].mean()
    nochurn_means=df.loc[df.churn==0,features].mean()
    # Label the two classes with an ordered list of keys: a set literal is
    # unordered and is not accepted as an index by recent pandas.
    mean_by_class=pd.concat([churn_means,nochurn_means],axis=1,keys=['Churn','No Churn'])
    mean_by_class.plot.bar(figsize=(10,8),rot=90)
    
def uni_anaysis(df,col,hist=True):
    """Plot the distribution of ``col`` split by churn status.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and a ``churn`` column holding 1 / 0.
    col : str
        Column to visualise.
    hist : bool, default True
        When True a three-panel figure is drawn (density, raw histogram,
        box plot); when False the raw-histogram panel is omitted.
    """
    n_panels=3 if hist else 2
    plt.figure(figsize=(16,6) if hist else (12,6))

    # Panel 1: overlaid density plots, green = not churned, red = churned.
    plt.subplot(1,n_panels,1)
    sns.distplot(df.loc[df['churn']==0,col],hist=True, color = 'g')
    sns.distplot(df.loc[df['churn']==1,col],hist=True, color='r')
    plt.legend(['Not Churn','Churn'])

    if hist:
        # Panel 2 (optional): raw count histograms per class.
        plt.subplot(1,3,2)
        plt.hist(df.loc[df['churn']==0,col],color = 'g')
        plt.hist(df.loc[df['churn']==1,col],color = 'r')
        plt.legend(['Not Churn','Churn'])

    # Last panel: box plot per class.
    plt.subplot(1,n_panels,n_panels)
    sns.boxplot(y=col, data=df, x='churn',palette =['g','r'])

    # tight_layout has to be *called*; the bare attribute access was a no-op.
    plt.tight_layout()
    plt.show()
    


In [None]:
res=df_info(raw_data,'Percent')
res

In [None]:
pd.set_option('display.max_rows', 500)
df_thrs(res,'Percent',1)

### Analysis of columns holding recharge information

In [None]:
recharge_cols=raw_data.columns[raw_data.columns.str.contains('rech_data|rech_amt')]
print(recharge_cols)

##### Excluding dates from Recharge Columns

In [None]:
recharge_cols=[cols for cols in recharge_cols if 'date' not in cols]

In [None]:
print(recharge_cols)

#### Analysis of recharge amount and data columns having missing values

In [None]:
res=df_info(raw_data[recharge_cols],'Percent')
temp=df_thrs(res,'Percent',1)
impute_cols=temp.index
raw_data[impute_cols].describe()

#### The minimum values are 1 or less, which implies that a missing value means the subscriber had not recharged; these missing values shall be replaced with 0

##### Missing values in total_rech_data_x mean the customer has not recharged in that period. We can analyse:

- Total Rechage Data
- Total Recharge Amount
- Average and Maximum Recharge amount for Data
- Last Date of Recharge

##### Replacing nulls in the recharge columns with 0

In [None]:
raw_data[impute_cols]=raw_data[impute_cols].fillna(0.0)

In [None]:
res=df_info(raw_data[recharge_cols],'Percent')
df_thrs(res,'Percent',1)

##### There is no missing value in recharge features

#### In churn prediction,  there are three phases of customer lifecycle :


- ‘good’ phase -Month 6 and 7
- ‘action’ phase Month 8
- ‘churn’ phase Month 9


##### Let's create new features:

- Total Data Recharge Amount by multiplying Total_data_recharge and Avg_data_recharge
- Average of Total Recharge (Total Reacharge Amt + Total Data Recharge) for good Phase (month 6 and 7)

In [None]:
for i in [6,7,8]:
    raw_data['total_rech_data_amt_'+str(i)]=raw_data['total_rech_data_'+str(i)]*raw_data['av_rech_amt_data_'+str(i)]

In [None]:
raw_data['total_avg_rech_amt_good_phase']=(raw_data['total_rech_data_amt_6']+raw_data['total_rech_amt_6']+raw_data['total_rech_data_amt_7']+raw_data['total_rech_amt_7'])/2

In [None]:
raw_data.head()

In [None]:
raw_data['total_rech_data_amt_6'].isnull().sum()

In [None]:
res=df_info(raw_data,'Percent')
temp=df_thrs(res,'Percent',70)

##### Dropping Features with more than 70% missing values

In [None]:
drop_col=temp.index

In [None]:
drop_col

In [None]:
# axis is passed by keyword: current pandas no longer accepts it positionally.
raw_data.drop(drop_col,axis=1,inplace=True)

##### Identify high-value customers as per the definition provided in the business problem:
    
 - Those who have recharged with an amount more than or equal to X, where X is the 70th percentile of the average recharge amount in the first two months (the good phase).

In [None]:
high_val_cutoff=raw_data['total_avg_rech_amt_good_phase'].quantile(0.7)
print(high_val_cutoff)

##### Subscribers with Total average recharge <478.0 will be excluded and High Value customers will extracted

In [None]:
# ">=" keeps customers sitting exactly on the cutoff, matching the business
# definition ("more than or equal to" the 70th percentile). .copy() makes
# df_high_val an independent frame so later column assignments do not act on
# a slice of raw_data.
df_high_val=raw_data[raw_data['total_avg_rech_amt_good_phase']>=high_val_cutoff].copy()
print('High Value Customer data has {} records and {} columns'.format(df_high_val.shape[0],df_high_val.shape[1]))

##### Tagging the churned customers (churn=1, Not churned=0) based on the churn phase(month 9) usages : Those who have not made any calls (either incoming or outgoing) AND have not used mobile internet even once in the churn phase

##### The attributes will be used  to tag churners are:

total_ic_mou_9

total_og_mou_9

vol_2g_mb_9

vol_3g_mb_9


In [None]:
churn_phase_usage_cols=['total_ic_mou_9','total_og_mou_9','vol_2g_mb_9','vol_3g_mb_9']

In [None]:
df_high_val['churn']=np.where(df_high_val['total_ic_mou_9']+df_high_val['total_og_mou_9']+df_high_val['vol_2g_mb_9']+df_high_val['vol_3g_mb_9']==0,1,0)

In [None]:
churn_per=round(100*df_high_val.churn.value_counts(normalize=True),2)

In [None]:
print('There are {}% Customers marked as Churn and {}% Customer as No churn'.format(churn_per[1],churn_per[0]))

##### Removing all the columns used to create label Churn/Not Churn

In [None]:
# axis is passed by keyword: current pandas no longer accepts it positionally.
df_high_val.drop(churn_phase_usage_cols,axis=1,inplace=True)

##### Dropping all the columns represnts month 9

In [None]:
# Every remaining column whose name contains "_9" belongs to the churn month.
col_month_9=df_high_val.columns[df_high_val.columns.str.contains('_9',regex=True)]
print(col_month_9)
# axis is passed by keyword: current pandas no longer accepts it positionally.
df_high_val.drop(col_month_9,axis=1,inplace=True)

##### Analyzing High value data frame 

In [None]:
res=df_info(df_high_val,'Percent')

In [None]:
df_analysis=res.copy()


In [None]:
drop_cat_col=df_analysis[(df_analysis.Var_Type=="Categorical") & (df_analysis.Unique==1)].index

In [None]:
drop_cat_col

In [None]:
# Single-valued columns carry no information; axis is passed by keyword
# because current pandas no longer accepts it positionally.
df_high_val.drop(drop_cat_col,axis=1,inplace=True)

In [None]:
df_analysis=df_info(df_high_val,'Percent')

In [None]:
cat_col=df_analysis[df_analysis.Var_Type=='Categorical'].index

In [None]:
for col in cat_col:
    # value_counts() sorts by frequency, so .iloc[0] is the share of the most
    # common value. Plain [0] is a *label* lookup for the value 0 and raises
    # KeyError for a column that never takes the value 0.
    var=100*df_high_val[col].value_counts(normalize=True).iloc[0]
    print("Col {} variance is : {}".format(col,round(100-var,2)))

In [None]:
df_high_val.shape

In [None]:
date_col=df_high_val.columns[df_high_val.columns.str.contains('date')]

In [None]:
date_col

##### Dropping Date columns as all information is available columns labled as Month

In [None]:
# axis is passed by keyword: current pandas no longer accepts it positionally.
df_high_val.drop(date_col,axis=1,inplace=True)

In [None]:
# df_analysis is indexed by column name, so the date columns are removed as
# rows (axis=0); axis is passed by keyword for current pandas.
df_analysis.drop(date_col,axis=0,inplace=True)

In [None]:
df_high_val.head(3)

In [None]:
df_high_val.info(verbose=True)

In [None]:
df_analysis_cat=df_analysis[(df_analysis.Var_Type=="Categorical") & (df_analysis.Percent>=1) ]

In [None]:
df_analysis[df_analysis.Var_Type=="Continious"]

In [None]:
col_conti_null=df_analysis[df_analysis.Total>=1].index

##### replacing Null with 0 considering no usages as null

In [None]:
df_high_val[col_conti_null]=df_high_val[col_conti_null].fillna(0)

In [None]:
df_analysis=df_info(df_high_val,'Percent')

###### There are no Nulls

In [None]:
df_high_val.shape

In [None]:
df_high_val.describe(include='all')

##### arpu_6, arpu_7 and arpu_8 contain negative values, which may indicate that a subscriber has defaulted. Adding new features defaulter_6, defaulter_7 and defaulter_8 to analyse the impact on churn

In [None]:
df_high_val['defaulter_6']=np.where(df_high_val.arpu_6<0,abs(df_high_val.arpu_6),0)
df_high_val['defaulter_7']=np.where(df_high_val.arpu_7<0,abs(df_high_val.arpu_7),0)
df_high_val['defaulter_8']=np.where(df_high_val.arpu_8<0,abs(df_high_val.arpu_8),0)
df_high_val['arpu_6']=np.where(df_high_val.arpu_6<0,0,df_high_val.arpu_6)
df_high_val['arpu_7']=np.where(df_high_val.arpu_7<0,0,df_high_val.arpu_7)
df_high_val['arpu_8']=np.where(df_high_val.arpu_8<0,0,df_high_val.arpu_8)

In [None]:
df_high_val.head()

In [None]:
df_high_val.defaulter_6.max(),df_high_val.defaulter_7.max(),df_high_val.defaulter_8.max()

###### identifying VBC Columns and renaming all with month number as suffix

In [None]:
vbc_col=df_high_val.columns[df_high_val.columns.str.contains('vbc',regex=True)]

In [None]:
vbc_col

In [None]:
df_high_val.rename(columns={'jun_vbc_3g':'vbc_3g_6','jul_vbc_3g':'vbc_3g_7','aug_vbc_3g':'vbc_3g_8','sep_vbc_3g':'vbc_3g_9'},inplace=True)

In [None]:
df_high_val.head(2)

###### droping VBC for 9th month as 9th monh data would not be required

In [None]:
# axis is passed by keyword: current pandas no longer accepts it positionally.
df_high_val.drop('vbc_3g_9',axis=1,inplace=True)

In [None]:
df_high_val.shape

In [None]:
og_cols_7=df_high_val.columns[df_high_val.columns.str.contains('.*_og.*mou_7')]

In [None]:
plt.figure(figsize=(16,12))
sns.heatmap(df_high_val[og_cols_7].corr(),cmap='Greens',annot=True)

###### There is high correlation between other fields and fields shown below:
    - total_og_mou 
    - std_og_mou
    - loc_og_mou
    


In [None]:
check=df_high_val.index[0:5]

##### let's check for five rows if Total Out going Mou and Total Incoimg Mou are sum of other Incoming and outgoing columns

In [None]:
for i in check:
    for j in [6,7,8]:
        print(df_high_val.loc[i,'total_og_mou_'+str(j)],":",df_high_val.loc[i,['loc_og_mou_'+str(j),'std_og_mou_'+str(j),'spl_og_mou_'+str(j),'isd_og_mou_'+str(j),'og_others_'+str(j)]].sum())

##### We can see that Total OG MOU is sum of Loc_og, std_og,spl_og,isd_og and og_other

#####  As the example above shows, the individual mou columns hold values that are already available as aggregates in other columns; to avoid collinearity let's drop the mou columns carrying the individual breakdown

In [None]:
mou_to_drop=df_high_val.columns[df_high_val.columns.str.contains('.*t2.*mou_',regex=True)]

In [None]:
mou_to_drop

In [None]:
print ('Total Mou columns to drop is : {}'.format(len(mou_to_drop)))

In [None]:
# axis is passed by keyword: current pandas no longer accepts it positionally.
df_high_val.drop(mou_to_drop,axis=1,inplace=True)

In [None]:
df_high_val.shape

In [None]:
df_high_val.head()

In [None]:
mou_cols=df_high_val.columns[df_high_val.columns.str.contains('mou')]

In [None]:
mou_cols

##### As per problem statement the 6 and 7 month is Good Phase and 8th month is Action Phase let's transform data accordingly

In [None]:
actionPhase_cols=df_high_val.columns[df_high_val.columns.str.contains('_6|_7',regex=True)]

In [None]:
actionPhase_cols_unique=actionPhase_cols.str[:-2].unique()
actionPhase_cols_unique

In [None]:
for col in actionPhase_cols_unique:
    df_high_val["Avg_"+col+'_GoodPhase']=(df_high_val[col+'_6']+df_high_val[col+'_7'])/2

In [None]:
df_high_val.head()

##### Dropping columns where the we have taken average for Good Phase

In [None]:
# The month-6/7 columns are now represented by their Avg_*_GoodPhase
# counterparts. axis is passed by keyword for current pandas.
df_high_val.drop('total_avg_rech_amt_good_phase',axis=1,inplace=True)
df_high_val.drop(actionPhase_cols,axis=1,inplace=True)

In [None]:
df_high_val.shape

In [None]:
df_high_val.head()

In [None]:
print("Duplicates :",df_high_val.duplicated().sum())
print('Nulls:',(df_high_val.isnull().sum()).sum())

##### there are no duplicates and no Null values

#### Exploratory Data Analysis

##### for Exploratory Data analysis the features will be comapred in groups to analyse relationship between Good Phase and Action Phase

In [None]:
print(df_high_val.columns)

In [None]:
def bar_chart(df,features):
    """Plot the mean of each feature for churned vs. retained customers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``churn`` column holding 1 (churn) / 0 (no churn).
    features : list-like of str
        Columns of ``df`` to average and plot.

    Draws a grouped bar chart with one group per feature and one bar per
    class ('Churn', 'No Churn').
    """
    churn_means=df.loc[df.churn==1,features].mean()
    nochurn_means=df.loc[df.churn==0,features].mean()
    # Label the two classes with an ordered list of keys: a set literal is
    # unordered and is not accepted as an index by recent pandas.
    mean_by_class=pd.concat([churn_means,nochurn_means],axis=1,keys=['Churn','No Churn'])
    mean_by_class.plot.bar(figsize=(10,8),rot=90)
   

In [None]:
def plot_box_chart(attribute):
    """Box-plot an attribute by churn status for the good phase and month 8.

    Reads the notebook-level ``df_high_val`` frame and draws two panels
    (outliers hidden): the ``Avg_<attribute>_GoodPhase`` column and the
    ``<attribute>_8`` column, each split by ``churn``.
    """
    plt.figure(figsize=(20,16))
    panel_columns=('Avg_'+attribute+"_GoodPhase",attribute+"_8")
    for position,column in enumerate(panel_columns,start=1):
        plt.subplot(2,2,position)
        sns.boxplot(data=df_high_val,y=column,x="churn",hue="churn",
                    showfliers=False,palette=("plasma"))
    plt.show()

In [None]:
df_high_val.columns

In [None]:
df_analysis=df_info(df_high_val,'Total')

In [None]:
# Group feature names by theme so each group can be charted together.
arpu_col=df_analysis.index[df_analysis.index.str.contains('arpu')]
onnet_col=df_analysis.index[df_analysis.index.str.contains('onnet')]
offnet_col=df_analysis.index[df_analysis.index.str.contains('offnet')]
rech_amt_col=df_analysis.index[df_analysis.index.str.contains('rech_amt')]
rech_data_col=df_analysis.index[df_analysis.index.str.contains('rech_data')]
monthly_2g_3g=df_high_val.columns[df_high_val.columns.str.contains('monthly_2g|monthly_3g')]
ic_mou=df_high_val.columns[df_high_val.columns.str.contains('.*ic.*mou_')]
og_mou=df_high_val.columns[df_high_val.columns.str.contains('.*og.*mou_')]
vbc_col=df_analysis.index[df_analysis.index.str.contains('vbc')]
sachet_col=df_analysis.index[df_analysis.index.str.contains('sachet')]
# Assigned on its own: the previous chained assignment
# (defaulter_col=sachet_col=...) overwrote sachet_col with the defaulter columns.
defaulter_col=df_analysis.index[df_analysis.index.str.contains('defaulter')]

##### Ploting Bar charts to analyze business bewteen Good Phase and Action Phase

In [None]:
bar_chart(df_high_val,arpu_col)

###### ARPU for potential churn customers drops significantly in the action phase but remains roughly constant for non-churn customers, although the data indicates that ARPU for churn customers is slightly higher in the good phase

In [None]:
bar_chart(df_high_val,onnet_col)

In [None]:
bar_chart(df_high_val,offnet_col)

##### Onnet and Offnet usages drops for Churn Customers but remain constant for Non Chrun

In [None]:
bar_chart(df_high_val,rech_amt_col)

In [None]:
bar_chart(df_high_val,rech_data_col)

###### Reachrage Data and Reharge Amount goes down in Action Phase for potential churn customers

In [None]:
bar_chart(df_high_val,monthly_2g_3g)

###### There is a sharp decline in monthly 2G and 3G data in the action phase; the graph shows a slight drop for non-churn customers as well, but churn customers drop significantly

In [None]:
bar_chart(df_high_val,ic_mou)

###### There is a decline in incoming  in Action Phase , Significant drops in loc_ic,std_ic and total_ic in Action Phase

In [None]:
bar_chart(df_high_val,og_mou)

###### STD outgoing and total outgoing show significant drops for churn customers. An interesting fact is that potential churn customers use total outgoing and STD outgoing more in the good phase than in the action phase, and then the usage suddenly drops to a much lower level

In [None]:
bar_chart(df_high_val,vbc_col)

In [None]:
bar_chart(df_high_val,sachet_col)

###### There are clear indications on drop in VBC and 2G and 3G sachets usages

In [None]:
bar_chart(df_high_val,defaulter_col)

##### Interesting insight: in the good phase non-churners show higher default values, but in the action phase the default value for non-churners comes down while churners default more. In the good phase this may be due to payment delays

#### Age on Network's impact on Churn need to be analysed

In [None]:
df_tenure=df_high_val.copy()

In [None]:
df_tenure['Tenure_Months']=round(df_tenure['aon']/30,2)

In [None]:
df_tenure.Tenure_Months.value_counts()

In [None]:
sns.distplot(df_tenure.Tenure_Months,hist=True,kde=False)

In [None]:
# Bin edges in months. The last edge is open-ended so that every tenure above
# 60 months lands in '5 years +'; an upper edge of 61 left anything longer
# than 61 months unbinned (NaN).
tenure=[0,6,12,24,36,48,60,np.inf]
tenure_range=['New','0.5-1 Year','1-2 Years','2-3 Years','3-4 Years','4-5 Years','5 years +']
df_tenure['Age_on_network']=pd.cut(df_tenure['Tenure_Months'],tenure,labels=tenure_range)
df_tenure.head()

In [None]:
100*df_tenure.Age_on_network.value_counts(normalize=True)

In [None]:
plt.figure(figsize=(8,6))
sns.countplot(x='Age_on_network',hue='churn',data=df_tenure)

###### Churn count is highest in the 1-2 year age-on-network bucket, which suggests that most customers who churn become dissatisfied with the service within a span of 1-2 years

##### Let's analyse all numerical columns for Outliers

In [None]:
check_outliers=list(df_high_val.columns)


In [None]:
check_outliers.remove('churn')


In [None]:
check_outliers.remove('mobile_number')

In [None]:
check_outliers

In [None]:
for col in check_outliers:
    uni_anaysis(df_high_val,col)

### The group of charts shows that most numeric features contain outliers, are skewed, and show little separation between churn and no churn

In [None]:
from scipy import stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
def skew_check(df,cols):
    """Report the noticeably right-skewed columns among ``cols``.

    Computes ``scipy.stats.skew`` for every column in ``cols`` and returns a
    one-column DataFrame ('Skew') holding only the columns whose skewness is
    above 0.2, ordered from most to least skewed.
    """
    col_skew=df[cols].apply(lambda values:skew(values)).sort_values(ascending=False)
    return pd.DataFrame({'Skew' :col_skew[col_skew>0.2]})

In [None]:
skew_check(df_high_val,check_outliers)

In [None]:
df_high_val.describe()

##### Trying for different methods to remove Skew

In [None]:
df_log_transform=df_high_val.copy()
df_log_transform[check_outliers]=np.log1p(df_log_transform[check_outliers])

In [None]:
df_log_transform.head()

In [None]:
#  for i in check_outliers:
#         uni_anaysis(df_log_transform,i,False)

In [None]:
skew_check(df_log_transform,check_outliers)

In [None]:
df_box_cox=df_high_val.copy()

In [None]:
for col in check_outliers:
    df_box_cox[col] = boxcox1p(df_box_cox[col], boxcox_normmax(df_box_cox[col] + 1))

In [None]:
skew_check(df_box_cox,check_outliers)

In [None]:
 for i in check_outliers:
        uni_anaysis(df_box_cox,i,False)

###### Box Cox Transformation works out to be better than Log transformation

In [None]:
df_high_val[check_outliers]=df_box_cox[check_outliers]

## Model Building

#### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split
# Features: everything except the target and the customer identifier.
# axis is passed by keyword: current pandas no longer accepts it positionally.
X=df_high_val.drop(['churn','mobile_number'],axis=1)
y=df_high_val[['churn']]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=100)

In [None]:
print("X_train Shape",X_train.shape)
print("y_train Shape",y_train.shape)
print("X_test Shape",X_test.shape)
print("y_test Shape",y_test.shape)

In [None]:
X_train.head()


###### Data is having various scales which need to be fitted within an uniform Scale, Using Standard Scaler for the same

In [None]:
feature_col=X_train.columns
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train[feature_col]=scaler.fit_transform(X_train[feature_col])
X_test[feature_col]=scaler.transform(X_test[feature_col])

In [None]:
X_train.head(2)

In [None]:
X_test.head(2)

##### As we have seen earlier data was imbalanced, lets check for Imbalance in target 

In [None]:
dist=100*y.churn.value_counts(normalize=True)

In [None]:
dist

In [None]:
dist.plot.pie(autopct='%.2f%%',labels=['No Churn','Churn'],figsize=(6,6))
plt.show()

###### Using Sampling techniques to correct Imbalance in target

In [None]:
def imbalance_check(X_train,y_train):
    """Print the shapes and class counts of a resampled training set.

    ``y_train`` must expose a ``churn`` column holding 1 / 0 labels.
    """
    churn_count=int((y_train.churn == 1).sum())
    no_churn_count=int((y_train.churn == 0).sum())

    print('After Imbalance Correction, the shape of train_X: {}'.format(X_train.shape))
    print('After Imbalance Correction, the shape of train_y: {} \n'.format(y_train.shape))

    print("After Imbalance Correction, counts of label '1': {}".format(churn_count))
    print("After Imbalance Correction, counts of label '0': {}".format(no_churn_count))

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
def sampling_fit(model):
    """Resample the notebook-level training set to correct class imbalance.

    Parameters
    ----------
    model : str
        Sampler to use: "SMOTE", "SMOTEEN" (SMOTE + Edited Nearest
        Neighbours; "SMOTEENN" is accepted too), "ROS" (random
        over-sampling) or "RUS" (random under-sampling).

    Returns
    -------
    tuple
        ``(X_train_sampled, y_train_sampled)`` produced from the global
        ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys.
    """
    samplers={
        "SMOTE":SMOTE,
        "SMOTEEN":SMOTEENN,   # historical spelling, kept so existing calls keep working
        "SMOTEENN":SMOTEENN,
        "ROS":RandomOverSampler,
        "RUS":RandomUnderSampler,
    }
    if model not in samplers:
        # Previously an unknown key fell through every branch and failed later
        # on an unbound ``sampler`` variable.
        raise ValueError("Unknown sampler '{}'; expected one of {}".format(model,sorted(samplers)))

    print("Before Imbalance Correction, counts of label '1': {}".format(sum(y_train.churn == 1)))
    print("Before Imbalance Correction, counts of label '0': {} \n".format(sum(y_train.churn == 0)))

    # Fixed seed so the resampled set is reproducible.
    sampler=samplers[model](random_state=512)
    # fit_resample is the current imbalanced-learn API; fit_sample was removed.
    X_train_sampled, y_train_sampled = sampler.fit_resample(X_train, y_train)
    imbalance_check(X_train_sampled,y_train_sampled)
    return X_train_sampled,y_train_sampled
   

In [None]:
X_train_smote,y_train_smote=sampling_fit("SMOTE")

In [None]:
from sklearn.decomposition import PCA,IncrementalPCA
pca = PCA(svd_solver='randomized', random_state=512)
pca.fit(X_train_smote)
fig = plt.figure(figsize = (8,6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()



In [None]:
np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)

###### around 30 compents explains 90% variance , Considering 30 components for model

In [None]:
pca = IncrementalPCA(n_components=30)    
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test)
print("Shape of train datatset after PCA : "+str(X_train_pca.shape))

### Logistic Regression Model

In [None]:
results=pd.DataFrame(columns=['Accuracy','Precision','Recall','F1_score'])

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
def model_fit(model,X,y,predict_on):
    """Fit ``model`` on ``(X, y)`` and return its predictions for ``predict_on``.

    The estimator is refitted on every call, so passing the same object twice
    overwrites the earlier fit.
    """
    model.fit(X,y)
    predictions=model.predict(predict_on)
    return predictions
def metrics_cal(y_actual,y_predict):
    """Score binary predictions and draw the confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1_score)`` for ``y_predict``
        against ``y_actual``. A labelled confusion-matrix heatmap is shown
        as a side effect.
    """
    scores=(
        metrics.accuracy_score(y_actual,y_predict),
        metrics.precision_score(y_actual,y_predict),
        metrics.recall_score(y_actual,y_predict),
        metrics.f1_score(y_actual,y_predict),
    )

    matrix=metrics.confusion_matrix(y_actual,y_predict)
    sns.heatmap(matrix,annot=True,fmt='d',xticklabels=['No Churn','Churn'],yticklabels=['No Churn','Churn'],cmap='Greens')
    plt.ylabel('True labels')
    plt.xlabel('Predicted labels')
    plt.show()

    return scores
def confusion_matrix(cm):
    """Show a pre-computed 2x2 confusion matrix ``cm`` as an annotated heatmap."""
    heatmap_options=dict(annot=True,fmt='d',xticklabels=['No Churn','Churn'],yticklabels=['No Churn','Churn'],cmap='Greens')
    sns.heatmap(cm,**heatmap_options)
    plt.show()
    

##### Simple Logistic Regression on Train

In [None]:
lr_model=LogisticRegression()
lr_predict_train=model_fit(lr_model,X_train_smote,y_train_smote,X_train_smote)

In [None]:
results.loc['Logistic Regression Train']=metrics_cal(y_train_smote,lr_predict_train)

In [None]:
results

##### Simple Logistic Regression on Test

In [None]:
lr_predict_test=model_fit(lr_model,X_train_smote,y_train_smote,X_test)
results.loc['Logistic Regression Test']=metrics_cal(y_test,lr_predict_test)

In [None]:
results

##### Simple Logistic Regression on Train with PCA

In [None]:
lr_predict_PCA_train=model_fit(lr_model,X_train_pca,y_train_smote,X_train_pca)
results.loc['Logistic Regression PCA Train']=metrics_cal(y_train_smote,lr_predict_PCA_train)

In [None]:
results

##### Logistic Regression with PCA on Test

In [None]:
lr_predict_PCA_test=model_fit(lr_model,X_train_pca,y_train_smote,X_test_pca)
results.loc['Logistic Regression PCA Test']=metrics_cal(y_test,lr_predict_PCA_test)

In [None]:
results

In [None]:
from sklearn.model_selection import GridSearchCV,KFold

In [None]:
#Comment

# lr=LogisticRegression(n_jobs=-1,random_state=512)
# param_l=['l1','l2']
# param_c=[1,10,100,1000]
# lrGrid=GridSearchCV(estimator=lr,param_grid=dict(C=param_c,penalty=param_l),scoring='roc_auc',n_jobs=10,cv=5,verbose=2)
# lrGrid=lrGrid.fit(X_train_smote,y_train_smote)
# lrGrid.best_params_

In [None]:
#Comment
# cv_results = pd.DataFrame(lrGrid.cv_results_)
# col_C = 'param_C'
# col_l = 'param_penalty'
# fig, ax = plt.subplots(1, 1, figsize=(11, 8))
# sns.pointplot(x=col_C, y='mean_test_score',  data=cv_results, ci=99, n_boot=64, ax=ax)
# ax.set_title("CV Grid Search Results")
# ax.set_xlabel(col_C)
# ax.set_ylabel('mean_test_score')


In [None]:
lr=LogisticRegression(penalty='l2',C=10,random_state=512)
lr_predict_hyper_train=model_fit(lr,X_train_smote,y_train_smote,X_train_smote)
results.loc['Logistic Regression Hyper Train']=metrics_cal(y_train_smote,lr_predict_hyper_train)

In [None]:
results

In [None]:
lr_predict_hyper_test=model_fit(lr,X_train_smote,y_train_smote,X_test)
results.loc['Logistic Regression Hyper Test']=metrics_cal(y_test,lr_predict_hyper_test)

In [None]:
results

In [None]:
lr_predict_hyperPCA_train=model_fit(lr,X_train_pca,y_train_smote,X_train_pca)
results.loc['Logistic Regression Hyper PCA Train']=metrics_cal(y_train_smote,lr_predict_hyperPCA_train)

In [None]:
lr_predict_hyperPCA_test=model_fit(lr,X_train_pca,y_train_smote,X_test_pca)
results.loc['Logistic Regression Hyper PCA Test']=metrics_cal(y_test,lr_predict_hyperPCA_test)

In [None]:
results

In [None]:
print(metrics.classification_report(y_test,lr_predict_hyperPCA_test))

##### Logistic Regression using RFE for feature reduction

In [None]:
from sklearn.feature_selection import RFE
lr=LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn.
rfe=RFE(lr,n_features_to_select=30)
rfe=rfe.fit(X_train_smote,y_train_smote)

In [None]:
list(zip(X_train_smote.columns,rfe.support_,rfe.ranking_))

In [None]:
col=X_train_smote.columns[rfe.support_]
col

In [None]:
X_train_rfe=X_train_smote[col]
X_test_rfe=X_test[col]

In [None]:
lr_RFE=LogisticRegression(penalty='l2',C=10,random_state=512)
lr_predict_RFE_train=model_fit(lr_RFE,X_train_rfe,y_train_smote,X_train_rfe)
results.loc['Logistic Regression RFE 30 Train']=metrics_cal(y_train_smote,lr_predict_RFE_train)

In [None]:
# Fit on the (resampled) training data and predict on the held-out test set.
# Fitting on X_test_rfe / y_test, as before, scored the model on the very data
# it was trained on and so overstated test performance.
lr_predict_RFE_test=model_fit(lr_RFE,X_train_rfe,y_train_smote,X_test_rfe)
results.loc['Logistic Regression RFE 30 Test']=metrics_cal(y_test,lr_predict_RFE_test)

In [None]:
results


##### RFE has improved overall accuracy to 94.5% and Precision to 71%, but Recall is down to 49%. For this problem false negatives (churners predicted as non-churners) are undesirable, which means we need a good Recall score

##### Now we will try some non linear models to evaluate how we can get a better recall score 

##### Random Forrest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(random_state=512,n_estimators=100,n_jobs=-1)
rf_predict_train=model_fit(rf,X_train_smote,y_train_smote,X_train_smote)
results.loc['Random Forest  Train']=metrics_cal(y_train_smote,rf_predict_train)

In [None]:
rf_predict_test=model_fit(rf,X_train_smote,y_train_smote,X_test)
results.loc['Random Forest  Test']=metrics_cal(y_test,rf_predict_test)

In [None]:
print(metrics.classification_report(y_test,rf_predict_test))

In [None]:
results

In [None]:
# plot_roc_curve was removed from scikit-learn; RocCurveDisplay.from_estimator
# is its replacement.
metrics.RocCurveDisplay.from_estimator(rf,X_train_smote,y_train_smote)

##### Train in Random Forest is overfitting

In [None]:
# plot_roc_curve was removed from scikit-learn; RocCurveDisplay.from_estimator
# is its replacement.
metrics.RocCurveDisplay.from_estimator(rf,X_test,y_test)

##### Random forest with Hyper Parameter Tunning

In [None]:
#Comment
# params = {'max_features': ['auto','sqrt'] ,
#          'max_depth' : [20,30] , 
#          'min_samples_leaf':[50,100],
#          'min_samples_split':[100,150]
         
#         }
# grid_rf = GridSearchCV(estimator = RandomForestClassifier(random_state = 100,n_jobs = -1), scoring = 'recall', cv=3,
#                                                           param_grid = params,verbose=2)
# grid_rf.fit(X_train_smote,y_train_smote)
# print("Random Forest Best Score : " ,grid_rf.best_score_)
# print("Random Forest Best Params : " ,grid_rf.best_params_)

In [None]:
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by current scikit-learn.
rf=RandomForestClassifier(max_depth= 30, max_features= 'sqrt', min_samples_leaf= 50, min_samples_split= 100,random_state=512,n_jobs=-1)

In [None]:
# #Comment
# params = {'n_estimators':[100,300,500]}
# grid_rf = GridSearchCV(estimator = RandomForestClassifier(random_state = 100,n_jobs = -1), scoring = 'recall', cv=3,
#                                                           param_grid = params,verbose=2)
# grid_rf.fit(X_train_smote,y_train_smote)
# print("Random Forest Best Score : " ,grid_rf.best_score_)
# print("Random Forest Best Params : " ,grid_rf.best_params_)

In [None]:
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by current scikit-learn.
rf=RandomForestClassifier(n_estimators=300,max_depth= 30, max_features= 'sqrt', min_samples_leaf= 50, min_samples_split= 100,random_state=512,n_jobs=-1)

In [None]:
# #Comment
# cv_results = pd.DataFrame(lrGrid.cv_results_)
# col_C = 'param_C'
# col_l = 'param_penalty'
# fig, ax = plt.subplots(1, 1, figsize=(11, 8))
# sns.pointplot(x=col_C, y='mean_test_score',  data=cv_results, ci=99, n_boot=64, ax=ax)
# ax.set_title("CV Grid Search Results")
# ax.set_xlabel(col_C)
# ax.set_ylabel('mean_test_score')


In [None]:
rf_predict_Hyper_train=model_fit(rf,X_train_smote,y_train_smote,X_train_smote)
results.loc['Random Forest Hyper Train']=metrics_cal(y_train_smote,rf_predict_Hyper_train)

In [None]:
results

In [None]:

rf_predict_Hyper_test=model_fit(rf,X_train_smote,y_train_smote,X_test)
results.loc['Random Forest Hyper Test']=metrics_cal(y_test,rf_predict_Hyper_test)

In [None]:
results

In [None]:
# plot_roc_curve was removed from scikit-learn; RocCurveDisplay.from_estimator
# is its replacement.
metrics.RocCurveDisplay.from_estimator(rf,X_test,y_test)

In [None]:
print(metrics.classification_report(y_test,rf_predict_Hyper_test))

In [None]:
# #Comment
# from sklearn.ensemble import AdaBoostClassifier
# params = {'n_estimators':[100,300,500]}
# grid_rf = GridSearchCV(estimator = AdaBoostClassifier(random_state= 100,learning_rate=1), scoring = 'recall', cv=3,
#                                                           param_grid = params,verbose=2)
# grid_rf.fit(X_train_smote,y_train_smote)
# print("AdaBoost Best Score : " ,grid_rf.best_score_)
# print("AdaBoost Best Params : " ,grid_rf.best_params_)

In [None]:
# #Comment
# params = {'learning_rate':[0.05,.1,0.5,1]}
# grid_rf = GridSearchCV(estimator = AdaBoostClassifier(random_state= 100,n_estimators=500), scoring = 'recall', cv=3,
#                                                           param_grid = params,verbose=2)
# grid_rf.fit(X_train_smote,y_train_smote)
# print("AdaBoost Best Score : " ,grid_rf.best_score_)
# print("AdaBoost Best Params : " ,grid_rf.best_params_)

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# ada=AdaBoostClassifier(n_estimators=500,learning_rate=1,random_state=512)
# ada_predict_Hyper_train=model_fit(ada,X_train_smote,y_train_smote,X_train_smote)
# results.loc['ADA Hyper train']=metrics_cal(y_train_smote,ada_predict_Hyper_train)

In [None]:
# ada_predict_Hyper_test=model_fit(ada,X_train_smote,y_train_smote,X_test)
# results.loc['ADA Hyper test']=metrics_cal(y_test,ada_predict_Hyper_test)

##### XGB Boost with Hyper Tuned Parameters

In [None]:
from xgboost import XGBClassifier
# Starting-point XGBoost estimator; n_jobs=-1 asks for all available cores.
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=300,
    n_jobs=-1,
)

In [None]:
# Call the method: the bare attribute only displays the bound-method repr,
# whereas get_params() returns the dict of the estimator's current parameters.
xgb.get_params()

In [None]:
# param_xgb={'learning_rate':[0.01,0.05,0.1],
#           'reg_alpha':[1,5,10],
#           'reg_lambda':[1,3,5],
#           'gamma':[0.1,0.5,0.7]}

# xgb = XGBClassifier(learning_rate=0.01, 
#                           n_estimators=300, 
#                           n_jobs=-1)
                            
# grid_xgb=GridSearchCV(xgb,param_grid=param_xgb,scoring='recall',cv=3,verbose=2,n_jobs=-1)
# grid_xgb.fit(X_train_smote,y_train_smote)
# print("Best Score :XGB",grid_xgb.best_score_)
# print("Best Parameters :XGB",grid_xgb.best_params_)

In [None]:

# XGBoost estimator with the chosen hyper-parameters, reused by the sampling
# experiments below.
# `n_jobs` replaces the deprecated `nthread` alias (same thread count of 4),
# matching the `n_jobs` spelling used when `xgb` was first created.
xgb = XGBClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=4,
    min_child_weight=10,
    gamma=0.1,
    n_jobs=4,
    subsample=0.4,
    colsample_bytree=0.6,
    reg_alpha=1,
    reg_lambda=3,
)


In [None]:
# In-sample check for XGBoost: predict the SMOTE training features and score
# them against the SMOTE training labels.
XGB_predict_Hyper_train = model_fit(
    xgb,
    X_train_smote,
    y_train_smote,
    X_train_smote,
)
results.loc['XGB Hyper train'] = metrics_cal(
    y_train_smote,
    XGB_predict_Hyper_train,
)

In [None]:

# Out-of-sample check for XGBoost: predictions for X_test scored against y_test.
XGB_predict_Hyper_test = model_fit(
    xgb,
    X_train_smote,
    y_train_smote,
    X_test,
)
results.loc['XGB Hyper test'] = metrics_cal(
    y_test,
    XGB_predict_Hyper_test,
)

In [None]:
# Show the metrics table, now including the 'XGB Hyper train' / 'XGB Hyper test' rows.
results

#### let's analyse Test Results only 

In [None]:
# Labels of every `results` row whose name contains 'Train' or 'train'.
is_train_row = results.index.str.contains('Train|train')
drop_index = results.index[is_train_row]

In [None]:
# Remove the training-set rows so only test-set metrics remain.
# The axis is named explicitly via `index=`: passing it positionally
# (`drop(labels, 0)`) was deprecated in pandas 1.x and raises TypeError in 2.0+.
# Rebinding the name replaces the previous in-place mutation.
results = results.drop(index=drop_index)

In [None]:
# Rank the remaining rows: highest Recall first, ties broken by Accuracy.
# The sorted view is only displayed; `results` itself keeps its order.
results.sort_values(
    by=['Recall', 'Accuracy'],
    ascending=False,
)

#### The best accuracy comes from the hyper-tuned Random Forest on SMOTE data, with 0.78 Recall and 0.44 Precision. The business problem is to identify churn, so a higher Recall is better, but Precision should also be reasonable because the company may not want to spend a large amount on False Positives. The goal is to find a model that balances high Recall, Accuracy and Precision.

We will try other Sampling methods like SMOTEEN, Random Over/Under Sampling to see if any of the model shows better results based on criteria stated above

##### Trying Random Under Sampling (RUS) with Random Forest, XGB, RFE30 and AdaBoost

In [None]:
# Build the "RUS" training set via sampling_fit (defined earlier in the notebook).
# NOTE(review): "RUS" presumably selects random under-sampling — confirm in sampling_fit.
X_train_RUS,y_train_RUS=sampling_fit("RUS")

In [None]:
# Logistic regression (RFE features) with the RUS training set, scored on the test set.
# Note: `lr_RFE_predict_test` is overwritten by the later sampling sections.
lr_RFE_predict_test = model_fit(
    lr_RFE,
    X_train_RUS,
    y_train_RUS,
    X_test,
)
results.loc['LR RFE 30 RUS Test'] = metrics_cal(y_test, lr_RFE_predict_test)

In [None]:
# Random forest with the RUS training set, scored on the test set.
rf_predict_RUS_test = model_fit(
    rf,
    X_train_RUS,
    y_train_RUS,
    X_test,
)
results.loc['RF RUS Test'] = metrics_cal(y_test, rf_predict_RUS_test)

In [None]:
# ada_predict_RUS_test=model_fit(ada,X_train_RUS,y_train_RUS,X_test)
# results.loc['ADA RUS Test']=metrics_cal(y_test,ada_predict_RUS_test)

In [None]:
# XGBoost with the RUS training set, scored on the test set.
xgb_predict_RUS_test = model_fit(
    xgb,
    X_train_RUS,
    y_train_RUS,
    X_test,
)
results.loc['XGB RUS Test'] = metrics_cal(y_test, xgb_predict_RUS_test)

In [None]:
# Show the metrics table, now including the '... RUS Test' rows.
results

##### Performing the analysis with ROS

In [None]:
# Build the "ROS" training set, then evaluate logistic regression (RFE features)
# with it on the test set. `lr_RFE_predict_test` is reused from the previous section.
X_train_ROS, y_train_ROS = sampling_fit("ROS")
lr_RFE_predict_test = model_fit(
    lr_RFE,
    X_train_ROS,
    y_train_ROS,
    X_test,
)
results.loc['LR RFE 30 ROS Test'] = metrics_cal(y_test, lr_RFE_predict_test)

In [None]:
# Random forest with the ROS training set, scored on the test set.
rf_predict_ROS_test = model_fit(
    rf,
    X_train_ROS,
    y_train_ROS,
    X_test,
)
results.loc['RF ROS Test'] = metrics_cal(y_test, rf_predict_ROS_test)

In [None]:
# ada_predict_ROS_test=model_fit(ada,X_train_ROS,y_train_ROS,X_test)
# results.loc['ADA ROS Test']=metrics_cal(y_test,ada_predict_ROS_test)

In [None]:
# XGBoost with the ROS training set, scored on the test set.
xgb_predict_ROS_test = model_fit(
    xgb,
    X_train_ROS,
    y_train_ROS,
    X_test,
)
results.loc['XGB ROS Test'] = metrics_cal(y_test, xgb_predict_ROS_test)

In [None]:
# Show the metrics table, now including the '... ROS Test' rows.
results

##### Trying with SMOTEEN Sampling

In [None]:
# Build the "SMOTEEN" training set via sampling_fit (defined earlier in the notebook).
# NOTE(review): presumably imblearn's SMOTEENN combined over/under-sampling — confirm in sampling_fit.
X_train_smoteen,y_train_smoteen=sampling_fit("SMOTEEN")

In [None]:
# Logistic regression (RFE features) with the SMOTEEN training set, scored on the test set.
# `lr_RFE_predict_test` is reused from the earlier sampling sections.
lr_RFE_predict_test = model_fit(
    lr_RFE,
    X_train_smoteen,
    y_train_smoteen,
    X_test,
)
results.loc['LR RFE 30 Smoteen Test'] = metrics_cal(y_test, lr_RFE_predict_test)

In [None]:
# Random forest with the SMOTEEN training set, scored on the test set.
rf_predict_SMOTEEN_test = model_fit(
    rf,
    X_train_smoteen,
    y_train_smoteen,
    X_test,
)
results.loc['RF SMOTEEN Test'] = metrics_cal(y_test, rf_predict_SMOTEEN_test)

In [None]:
# ada_predict_smoteen_test=model_fit(ada,X_train_smoteen,y_train_smoteen,X_test)
# results.loc['ADA Smoteen Test']=metrics_cal(y_test,ada_predict_smoteen_test)

In [None]:
# XGBoost with the SMOTEEN training set, scored on the test set.
xgb_predict_smoteen_test = model_fit(
    xgb,
    X_train_smoteen,
    y_train_smoteen,
    X_test,
)
results.loc['XGB Smoteen Test'] = metrics_cal(y_test, xgb_predict_smoteen_test)

In [None]:
# Final ranking across all sampling strategies: Recall first, then Accuracy,
# then Precision, all descending. Displayed only; `results` is not reordered.
results.sort_values(
    by=["Recall", "Accuracy", "Precision"],
    ascending=False,
)

#### The business problem requires that False Negatives be kept to a minimum, but Precision should also be reasonably balanced so that the company avoids spending too much on schemes rolled out to False Positives. After considering the balance of Accuracy, Recall and Precision, the final model selected is the hyper-tuned XGB model with Random Over Sampling.

                   - Accuracy	Precision	Recall	F1_score
    XGB ROS Test   - 0.903183	0.427697	0.820852	0.562374

In [None]:
# Final model: a fresh XGBoost estimator with the same hyper-parameters as the
# tuned `xgb` used in the experiments above.
# `n_jobs` replaces the deprecated `nthread` alias (same thread count of 4).
xgb_final = XGBClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=4,
    min_child_weight=10,
    gamma=0.1,
    n_jobs=4,
    subsample=0.4,
    colsample_bytree=0.6,
    reg_alpha=1,
    reg_lambda=3,
)


In [None]:
# Run the final model through model_fit with the ROS training set and keep its
# predictions for X_test. The feature-importance cell below reads
# `xgb_final.feature_importances_`, so this cell must run first.
xgb_predict_final = model_fit(
    xgb_final,
    X_train_ROS,
    y_train_ROS,
    X_test,
)


In [None]:
# Pair each importance score of the final XGBoost model with its column name.
# Assumes X_test is a DataFrame whose column order matches the training features.
feature_imp = pd.DataFrame(
    sorted(zip(xgb_final.feature_importances_, X_test.columns)),
    columns=['Value', 'Feature'],
)
# Keep only the ten most important features for the chart.
top_feature_imp = feature_imp.sort_values(by="Value", ascending=False).head(10)

plt.figure(figsize=(15, 10))
sns.barplot(x="Value", y="Feature", data=top_feature_imp)
# The importances come from `xgb_final` (XGBoost), so the previous
# 'Random Forest Features' title mislabelled the chart.
plt.title('XGBoost Features')
plt.tight_layout()
plt.show()