# import preprocessed data

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_preprocessed_logit = pd.read_csv('TelecomChurn_logistic.csv')

In [3]:
data_preprocessed_logit.head()

Unnamed: 0,Churn,RetentionCalls,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,...,Crafts,Homemaker,Other,Professional,Retired,Self,Student,AdjustmentInCreditRating,CustomerMakesRef,AcceptedRetentionOffer
0,0,0,-0.542005,-0.124806,-0.130676,-0.069489,-0.417536,-0.150615,1.017855,-0.029904,...,0,0,1,0,0,0,0,0,1,0
1,1,0,-0.707455,-0.962258,-0.783331,-0.380916,-0.417536,-0.150615,0.056912,0.026012,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0.088292,-0.204927,0.086876,-0.069489,-0.034775,0.187043,-0.215949,-0.255998,...,0,0,0,0,0,0,0,0,0,0
3,1,0,2.356091,2.229215,0.521979,1.597069,2.97695,-0.150615,-0.635126,-0.52099,...,0,0,0,1,0,0,0,0,0,0
4,1,0,2.734813,0.28152,-0.783331,-0.380916,1.617141,8.881717,0.274409,-0.212237,...,0,0,1,0,0,0,0,0,0,0


# Segerate target and input data

In [4]:
data_targets = pd.DataFrame(data_preprocessed_logit['Churn'],columns=['Churn'])
data_inputs = data_preprocessed_logit.iloc[:,1:]

# Split data into training and test

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train, x_test,y_train,y_test = train_test_split(data_inputs,data_targets,test_size = 0.2, random_state = 20)

# Import sklearn library for logit regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix


In [8]:
reg = LogisticRegression()
result_log=reg.fit(x_train,y_train)
reg.score(x_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7135202372978662

In [9]:
summary_table = pd.DataFrame (columns=['Feature name'], data = data_inputs.columns.values)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,RetentionCalls,0.289466
1,MonthlyRevenue,-0.261504
2,MonthlyMinutes,-0.086079
3,TotalRecurringCharge,0.171213
4,DirectorAssistedCalls,-0.009129
...,...,...
60,Self,-0.047542
61,Student,0.133245
62,AdjustmentInCreditRating,-0.140051
63,CustomerMakesRef,0.015698


In [10]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
pd.set_option('display.max_rows', summary_table.shape[0]+1)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,0.283309
1,RetentionCalls,0.289466
2,MonthlyRevenue,-0.261504
3,MonthlyMinutes,-0.086079
4,TotalRecurringCharge,0.171213
5,DirectorAssistedCalls,-0.009129
6,OverageMinutes,0.287695
7,RoamingCalls,0.079541
8,PercChangeMinutes,-0.163844
9,PercChangeRevenues,0.142546


In [11]:
reg.score(x_test,y_test)

0.7154611557596632

# Run model on test dataset 

In [12]:
predicted_proba = reg.predict_proba(x_test)
pred =predicted_proba[ :,1]


# Change probability cutoff to increase hit ratio. 
Increasing hit ratio will affect precision. We will need to tradeoff based on business requirement

In [13]:
for i in range(pred.shape[0]):
    if pred[i] >= 0.50:
        pred[i] = int(1)
    else:
        pred[i] = int(0)

In [14]:
pred

array([0., 0., 0., ..., 0., 1., 1.])

In [15]:
y_test['Prediction'] = pred
y_test['Actual'] = y_test['Churn']
df = y_test.iloc[:,1:]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Prediction,Actual
19807,0.0,0
9601,0.0,0
18725,0.0,0
3953,0.0,0
24502,0.0,1
...,...,...
10626,1.0,1
13018,1.0,1
19429,0.0,0
12062,1.0,1


In [16]:
confusion_matrix = pd.crosstab(df['Actual'], df['Prediction'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted   0.0   1.0
Actual               
0          1891   704
1           783  1848


In [17]:
confusion_matrix

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1891,704
1,783,1848


# Recall
Out of total actual churn what percent of churn we identify correctly 

In [18]:
recall = confusion_matrix.loc[1,1]/(confusion_matrix.loc[1,1]+confusion_matrix.loc[1,0])
print('\n Recall: {0:.2f}%'.format(recall*100))


 Recall: 70.24%


# Precision
Out of total predicted churn what percent of customers actually churned

In [19]:
Precision = confusion_matrix.loc[1,1]/(confusion_matrix.loc[1,1]+confusion_matrix.loc[0,1])
print('\n Precision: {0:.2f}%'.format(Precision*100))


 Precision: 72.41%
