# Implementing Logistic Regression 

### Importing all of the important libraries 

In [563]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score , precision_score , f1_score , recall_score


### Extracting data and converting it to a dataframe 

In [564]:
data = pd.read_csv('customer_behavior_train.csv')
data_test = pd.read_csv('customer_behavior_test.csv')

## Exploratory Data Analysis (EDA)

In [565]:
print(data.shape) # checking the shape of the data 
print(data.isnull().sum()) # checking all of the columns for null data 
print(data.head())
data.describe()

(1800, 8)
Time_on_site            0
Pages_viewed            0
Clicked_ad              0
Cart_value              0
Referral                0
Browser_Refresh_Rate    0
Last_Ad_Seen            0
Purchase                0
dtype: int64
   Time_on_site  Pages_viewed  Clicked_ad  Cart_value   Referral  \
0          2.29          5.37           1       10.00   Facebook   
1          7.49         16.36           0       25.52  Instagram   
2          5.71         13.69           0       10.00     Google   
3          1.23          3.22           1       41.49     Google   
4          3.55         10.72           0       21.57     Direct   

   Browser_Refresh_Rate Last_Ad_Seen  Purchase  
0                143.14            D         1  
1                136.66            A         1  
2                 67.13            C         0  
3                 92.12            C         0  
4                 70.31            B         0  


Unnamed: 0,Time_on_site,Pages_viewed,Clicked_ad,Cart_value,Browser_Refresh_Rate,Purchase
count,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0
mean,4.962244,12.429483,0.301111,50.352933,100.6621,0.292222
std,2.336373,6.381941,0.458868,47.44339,51.645694,0.45491
min,0.5,0.76,0.0,10.0,-65.8,0.0
25%,3.3,7.86,0.0,13.8625,65.43,0.0
50%,4.935,12.02,0.0,34.69,99.735,0.0
75%,6.61,16.5,1.0,67.63,134.755,1.0
max,10.0,35.34,1.0,300.0,303.4,1.0


### Using one hot encoding for making referral and last ad seen binary types 

In [566]:
data_encoded = pd.get_dummies(data , columns=['Referral','Last_Ad_Seen'])
data_test_encoded = pd.get_dummies(data_test , columns=['Referral','Last_Ad_Seen'])
data_encoded.head()
data_encoded.info()
data_encoded.corr(numeric_only=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Time_on_site          1800 non-null   float64
 1   Pages_viewed          1800 non-null   float64
 2   Clicked_ad            1800 non-null   int64  
 3   Cart_value            1800 non-null   float64
 4   Browser_Refresh_Rate  1800 non-null   float64
 5   Purchase              1800 non-null   int64  
 6   Referral_Direct       1800 non-null   bool   
 7   Referral_Facebook     1800 non-null   bool   
 8   Referral_Google       1800 non-null   bool   
 9   Referral_Instagram    1800 non-null   bool   
 10  Last_Ad_Seen_A        1800 non-null   bool   
 11  Last_Ad_Seen_B        1800 non-null   bool   
 12  Last_Ad_Seen_C        1800 non-null   bool   
 13  Last_Ad_Seen_D        1800 non-null   bool   
dtypes: bool(8), float64(4), int64(2)
memory usage: 98.6 KB


Unnamed: 0,Time_on_site,Pages_viewed,Clicked_ad,Cart_value,Browser_Refresh_Rate,Purchase,Referral_Direct,Referral_Facebook,Referral_Google,Referral_Instagram,Last_Ad_Seen_A,Last_Ad_Seen_B,Last_Ad_Seen_C,Last_Ad_Seen_D
Time_on_site,1.0,0.906873,-0.019618,0.02156,0.01126,0.344128,-0.01286,0.011067,-0.003858,0.006417,0.017545,0.000294,-0.017985,0.000936
Pages_viewed,0.906873,1.0,-0.011645,0.005392,0.019562,0.358996,-0.001863,0.004423,-0.002036,-8.1e-05,0.000703,0.002575,-0.014221,0.011301
Clicked_ad,-0.019618,-0.011645,1.0,0.048734,0.008641,0.283907,-0.02298,0.013927,0.036608,-0.036613,-0.007918,-0.000285,-0.010494,0.018807
Cart_value,0.02156,0.005392,0.048734,1.0,-0.042083,0.137851,-0.011145,0.015416,0.028862,-0.040274,0.02614,-0.025513,0.019684,-0.020179
Browser_Refresh_Rate,0.01126,0.019562,0.008641,-0.042083,1.0,-0.005686,0.026053,0.030925,-0.064246,0.022617,0.002145,-0.0048,-0.026753,0.03014
Purchase,0.344128,0.358996,0.283907,0.137851,-0.005686,1.0,0.017338,0.017713,-0.028636,0.000424,-0.008731,0.016664,-0.026388,0.01887
Referral_Direct,-0.01286,-0.001863,-0.02298,-0.011145,0.026053,0.017338,1.0,-0.244344,-0.408783,-0.240946,-0.007184,-0.00044,-0.011859,0.019642
Referral_Facebook,0.011067,0.004423,0.013927,0.015416,0.030925,0.017713,-0.244344,1.0,-0.418245,-0.246523,0.009187,0.014733,-0.010044,-0.013512
Referral_Google,-0.003858,-0.002036,0.036608,0.028862,-0.064246,-0.028636,-0.408783,-0.418245,1.0,-0.412428,0.004502,0.010222,-0.012785,-0.001578
Referral_Instagram,0.006417,-8.1e-05,-0.036613,-0.040274,0.022617,0.000424,-0.240946,-0.246523,-0.412428,1.0,-0.007706,-0.027103,0.037787,-0.003951


In [567]:
print(data_encoded.shape) # checking the shape of the data 
print(data_encoded.isnull().sum()) # checking all of the columns for null data 
print(data_encoded.head())
data_encoded.describe()

(1800, 14)
Time_on_site            0
Pages_viewed            0
Clicked_ad              0
Cart_value              0
Browser_Refresh_Rate    0
Purchase                0
Referral_Direct         0
Referral_Facebook       0
Referral_Google         0
Referral_Instagram      0
Last_Ad_Seen_A          0
Last_Ad_Seen_B          0
Last_Ad_Seen_C          0
Last_Ad_Seen_D          0
dtype: int64
   Time_on_site  Pages_viewed  Clicked_ad  Cart_value  Browser_Refresh_Rate  \
0          2.29          5.37           1       10.00                143.14   
1          7.49         16.36           0       25.52                136.66   
2          5.71         13.69           0       10.00                 67.13   
3          1.23          3.22           1       41.49                 92.12   
4          3.55         10.72           0       21.57                 70.31   

   Purchase  Referral_Direct  Referral_Facebook  Referral_Google  \
0         1            False               True            False   
1

Unnamed: 0,Time_on_site,Pages_viewed,Clicked_ad,Cart_value,Browser_Refresh_Rate,Purchase
count,1800.0,1800.0,1800.0,1800.0,1800.0,1800.0
mean,4.962244,12.429483,0.301111,50.352933,100.6621,0.292222
std,2.336373,6.381941,0.458868,47.44339,51.645694,0.45491
min,0.5,0.76,0.0,10.0,-65.8,0.0
25%,3.3,7.86,0.0,13.8625,65.43,0.0
50%,4.935,12.02,0.0,34.69,99.735,0.0
75%,6.61,16.5,1.0,67.63,134.755,1.0
max,10.0,35.34,1.0,300.0,303.4,1.0


In [568]:

data['Referral'].value_counts()

Referral
Google       741
Facebook     360
Instagram    352
Direct       347
Name: count, dtype: int64

### Processing the data and dividing it to train and crossvalidation data and also refining the test data

In [None]:
# browser refresh rate was removed as it was of no use 
cols = ['Time_on_site','Pages_viewed','Clicked_ad','Cart_value','Referral_Direct','Referral_Facebook','Referral_Google','Referral_Instagram','Last_Ad_Seen_A','Last_Ad_Seen_B','Last_Ad_Seen_C','Last_Ad_Seen_D']
x = data_encoded[cols]
y = data_encoded['Purchase']

features_to_scale = ['Time_on_site','Pages_viewed','Cart_value']
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), features_to_scale)
], remainder='passthrough')

x_processed = preprocessor.fit_transform(x)

poly = PolynomialFeatures(degree=2, include_bias=False)
x_expanded = poly.fit_transform(x_processed)
x_expanded_pd = pd.DataFrame(x_expanded)





x_train , x_cv , y_train , y_cv = train_test_split(x_expanded_pd,y,test_size=0.2,random_state=1)
x_unprocessed_test = data_test_encoded[cols]
x_processed_test = preprocessor.fit_transform(x_unprocessed_test)
x_expanded_test = poly.fit_transform(x_processed_test)
x_test = pd.DataFrame(x_expanded_test)

y_test = data_test_encoded['Purchase']
x.head()

Unnamed: 0,Time_on_site,Pages_viewed,Clicked_ad,Cart_value,Referral_Direct,Referral_Facebook,Referral_Google,Last_Ad_Seen_A,Last_Ad_Seen_B,Last_Ad_Seen_C,Last_Ad_Seen_D
0,2.29,5.37,1,10.0,False,True,False,False,False,False,True
1,7.49,16.36,0,25.52,False,False,False,True,False,False,False
2,5.71,13.69,0,10.0,False,False,True,False,False,True,False
3,1.23,3.22,1,41.49,False,False,True,False,False,True,False
4,3.55,10.72,0,21.57,True,False,False,False,True,False,False


## Making a model Along with regularization

In [570]:

model = LogisticRegression(penalty='l2', C=100 )
model.fit(x_train,y_train)

### Finding all of the predicted values lowering the threshold 

In [571]:
threshold = 0.4
y_proba_train = model.predict_proba(x_train)[:,1]
y_pred_train = (y_proba_train >= threshold).astype(int)
y_proba_cv = model.predict_proba(x_cv)[:,1]
y_pred_cv = (y_proba_cv >= threshold).astype(int)
y_proba_test = model.predict_proba(x_test)[:,1]
y_test_pred = (y_proba_test >= threshold).astype(int)


### Chekcing the Performance metrics of Cross-Validation and Train set 

In [572]:
# Calculate performance metrics 
print("Accuracy train:", accuracy_score(y_train, y_pred_train))
print("Precision train:", precision_score(y_train, y_pred_train))
print("Recall train:", recall_score(y_train, y_pred_train))
print("F1 Score train:", f1_score(y_train, y_pred_train))

# For cross Validation set
# Calculate performance metrics 
print("Accuracy cross - validation :", accuracy_score(y_cv, y_pred_cv))
print("Precision cross - validation :", precision_score(y_cv, y_pred_cv))
print("Recall cross - validation :", recall_score(y_cv, y_pred_cv))
print("F1 Score cross - validation :", f1_score(y_cv, y_pred_cv))

Accuracy train: 0.7722222222222223
Precision train: 0.6081081081081081
Recall train: 0.6367924528301887
F1 Score train: 0.6221198156682027
Accuracy cross - validation : 0.7416666666666667
Precision cross - validation : 0.5384615384615384
Recall cross - validation : 0.6176470588235294
F1 Score cross - validation : 0.5753424657534246


### Chekcing the Performance Metrics of Test set 

In [573]:

# Calculate performance metrics 
print("Accuracy test:", accuracy_score(y_test, y_test_pred))
print("Precision test:", precision_score(y_test, y_test_pred))
print("Recall test:", recall_score(y_test, y_test_pred))
print("F1 Score test:", f1_score(y_test, y_test_pred))

Accuracy test: 0.74
Precision test: 0.5605095541401274
Recall test: 0.6470588235294118
F1 Score test: 0.6006825938566553


### Printing all of the coefficients of out model 

In [574]:
print(model.coef_)

[[ 0.61169661  0.52060385  0.1590805   0.95203941  0.06346179 -0.04797688
  -0.05588886 -0.19901131 -0.34515039 -0.50135932 -0.18522995 -0.13181039
   0.53801655  0.07775941 -1.00425366  0.0853557   0.12476786  0.16739863
   0.27293074  0.36125588 -0.34192166  0.31943165 -0.31762744 -0.21522696
   0.28788083 -0.44039077  0.42092623 -0.11562922  0.16817674  0.07304574
   0.46014209 -0.18076072 -0.00186492 -0.04030668  0.42866455  0.03031302
   0.40171861  0.09874622 -0.08120526 -0.01645329  0.15799283  0.95203941
  -0.20827226 -0.14257427 -0.49776249 -0.35586934  0.34671217  0.48219882
   0.47899775  0.06346179  0.          0.         -0.02412546  0.0853329
   0.76823604 -0.76598168 -0.04797688  0.          0.39889108 -0.20872219
   0.05080458 -0.28895035 -0.05588886 -0.18810849  0.10109626  0.30235547
  -0.27123211 -0.19901131  0.          0.          0.         -0.34515039
   0.          0.         -0.50135932  0.         -0.18522995]]
