# Prediction Using Logistic Regression

#Problem Statement:The dataset contains information like personal information of customers,transaction information,and the bank information belonging to a bank.It is often necessary to predict when the customers are going to withdraw their money from the bank account and stay dormant.Being able to predict this,the bank can take necessary action to prevent customers from withdrawing huge sums and stay an active/loyal customer.Our task is to be able to predict the customers who are going to churn based on the information given.

In [101]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as lr
classifier=lr(class_weight='balanced')

In [102]:
pwd

'C:\\Users\\P . D YALMAR'

In [103]:
#importing data
data=pd.read_csv('churn_prediction.csv')

In [104]:
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,Male,0.0,self_employed,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,2019-05-21
1,2,2348,35,Male,0.0,self_employed,,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,2019-11-01
2,4,2194,31,Male,0.0,salaried,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,NaT
3,5,2329,90,,,self_employed,1020.0,2,582,2291.91,...,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1,2019-08-06
4,6,1579,42,Male,2.0,self_employed,1494.0,3,388,927.72,...,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1,2019-11-03


In [105]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28382 entries, 0 to 28381
Data columns (total 21 columns):
customer_id                       28382 non-null int64
vintage                           28382 non-null int64
age                               28382 non-null int64
gender                            27857 non-null object
dependents                        25919 non-null float64
occupation                        28302 non-null object
city                              27579 non-null float64
customer_nw_category              28382 non-null int64
branch_code                       28382 non-null int64
current_balance                   28382 non-null float64
previous_month_end_balance        28382 non-null float64
average_monthly_balance_prevQ     28382 non-null float64
average_monthly_balance_prevQ2    28382 non-null float64
current_month_credit              28382 non-null float64
previous_month_credit             28382 non-null float64
current_month_debit               28382 non-null

In [106]:
#checking null values present or not
data.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
last_transaction                     0
dtype: int64

In [107]:
#Handling categorical features
data["gender"] = data["gender"].map({'Male':0,'Female':1})
data["occupation"] = data["occupation"].map({'self_employed':0,'salaried':1})

In [108]:
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,0.0,0.0,0.0,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,2019-05-21
1,2,2348,35,0.0,0.0,0.0,,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,2019-11-01
2,4,2194,31,0.0,0.0,1.0,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,NaT
3,5,2329,90,,,0.0,1020.0,2,582,2291.91,...,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1,2019-08-06
4,6,1579,42,0.0,2.0,0.0,1494.0,3,388,927.72,...,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1,2019-11-03


In [109]:
#filling null values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
data['gender'] = imputer.fit_transform(data['gender'].values.reshape(-1,1))
data['occupation'] = imputer.fit_transform(data['occupation'].values.reshape(-1,1))
data['city'] = imputer.fit_transform(data['city'].values.reshape(-1,1))
data['dependents'] = imputer.fit_transform(data['dependents'].values.reshape(-1,1))

In [110]:
data.isnull().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
last_transaction                  0
dtype: int64

### Datetime Data Type

In [111]:
## creating an instance(date) of DatetimeIndex class using "last_transaction"
date = pd.DatetimeIndex(data['last_transaction'])

In [112]:
# extracting new columns from "last_transaction"

# last day of year when transaction was done
data['day'] = date.dayofyear

# week of year when last transaction was done
data['week_of_year'] = date.weekofyear

# month of year when last transaction was done
data['month'] = date.month

# day of week when last transaction was done
data['day_of_week'] = date.dayofweek

In [113]:
data[['last_transaction','day','week_of_year','month','day_of_week']].head()

Unnamed: 0,last_transaction,day,week_of_year,month,day_of_week
0,2019-05-21,141.0,21.0,5.0,1.0
1,2019-11-01,305.0,44.0,11.0,4.0
2,NaT,,,,
3,2019-08-06,218.0,32.0,8.0,1.0
4,2019-11-03,307.0,44.0,11.0,6.0


In [114]:
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction,day,week_of_year,month,day_of_week
0,1,2101,66,0.0,0.0,0.0,187.0,2,755,1458.71,...,0.2,0.2,1458.71,1458.71,0,2019-05-21,141.0,21.0,5.0,1.0
1,2,2348,35,0.0,0.0,0.0,1020.0,2,3214,5390.37,...,5486.27,100.56,6496.78,8787.61,0,2019-11-01,305.0,44.0,11.0,4.0
2,4,2194,31,0.0,0.0,1.0,146.0,2,41,3913.16,...,6046.73,259.23,5006.28,5070.14,0,NaT,,,,
3,5,2329,90,0.0,0.0,0.0,1020.0,2,582,2291.91,...,0.47,2143.33,2291.91,1669.79,1,2019-08-06,218.0,32.0,8.0,1.0
4,6,1579,42,0.0,2.0,0.0,1494.0,3,388,927.72,...,588.62,1538.06,1157.15,1677.16,1,2019-11-03,307.0,44.0,11.0,6.0


In [115]:
data['day'] = imputer.fit_transform(data['day'].values.reshape(-1,1))
data['week_of_year'] = imputer.fit_transform(data['week_of_year'].values.reshape(-1,1))
data['month'] = imputer.fit_transform(data['month'].values.reshape(-1,1))
data['day_of_week'] = imputer.fit_transform(data['day_of_week'].values.reshape(-1,1))

In [116]:
data = data.drop(columns = ['last_transaction'])

In [117]:
X = data.drop(columns=['churn'])
Y = data['churn']

In [118]:
#Splitting the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=101)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((19867, 23), (8515, 23), (19867,), (8515,))

In [119]:
#If getting error like,ValueError: Input contains NaN, infinity or a value too large for dtype('float32') or dtype('float64')
a=np.nan_to_num(x_train)
b=np.nan_to_num(y_train)

In [120]:
classifier.fit(a,b)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight='balanced')

In [121]:
classifier.coef_

array([[-1.68835067e-06, -1.23748963e-04, -2.05140406e-05,
        -3.79843916e-07,  9.26353614e-07, -3.18249529e-07,
         4.61760713e-05,  3.09226255e-07,  1.04658871e-04,
        -1.26577328e-04,  6.36351586e-06,  1.62399288e-04,
        -8.71880272e-06, -6.15923766e-06, -8.17078270e-07,
         1.64356178e-05,  1.29637460e-05, -2.09553578e-06,
        -4.51388054e-05,  6.87980016e-05,  1.66399700e-05,
         2.67793905e-06,  1.88247020e-06]])

In [122]:
x=np.nan_to_num(x_test)
y=np.nan_to_num(y_test)

In [123]:
#Making Predictions
prediction=classifier.predict(x)
classifier.score(x,y)
prediction

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [124]:
#Checking accuracy
classifier.score(x, y)

0.7929536112742219