In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score

## Read data

In [3]:
df = pd.read_csv('data/Social_Network_Ads.csv')

In [4]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [11]:
df_dummies = pd.get_dummies(df,columns=["Gender"])
df_dummies

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,0,1
1,15810944,35,20000,0,0,1
2,15668575,26,43000,0,1,0
3,15603246,27,57000,0,1,0
4,15804002,19,76000,0,0,1
...,...,...,...,...,...,...
395,15691863,46,41000,1,1,0
396,15706071,51,23000,1,0,1
397,15654296,50,20000,1,1,0
398,15755018,36,33000,0,0,1


In [29]:
X = df_dummies[["Age","EstimatedSalary","Gender_Female","Gender_Male"]]
Y = df_dummies[["Purchased"]]

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size=0.3, random_state=0)
print(X_train.shape)
print(X_test.shape)

(280, 4)
(120, 4)


In [25]:
sc = StandardScaler()
sc_x = sc.fit(X_train)
X_train_sc = sc_x.transform(X_train)
X_test_sc = sc_x.transform(X_test)

In [26]:
X_train_sc

array([[-1.1631724 , -1.5849703 , -0.99288247,  0.99288247],
       [ 2.17018137,  0.93098672, -0.99288247,  0.99288247],
       [ 0.0133054 ,  1.22017719,  1.00716855, -1.00716855],
       ...,
       [-0.18277423, -0.51496559, -0.99288247,  0.99288247],
       [-1.06513258, -0.45712749,  1.00716855, -1.00716855],
       [-1.1631724 ,  1.39369146,  1.00716855, -1.00716855]])

## Create model

In [28]:
model_logistic = LogisticRegression()
model_logistic.fit(X_train_sc,y_train)

  y = column_or_1d(y, warn=True)


In [32]:
model_logistic.coef_

array([[ 2.10175832,  1.07208739, -0.05714272,  0.05714272]])

In [33]:
model_logistic.intercept_

array([-1.01019503])

## Test Predict

In [35]:
y_pred = model_logistic.predict(X_test_sc)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

## Evaluation

In [37]:
confusion_mat = confusion_matrix(y_test,y_pred)
confusion_mat

array([[74,  5],
       [ 8, 33]], dtype=int64)

In [39]:
precision = precision_score(y_test,y_pred)
precision

0.868421052631579

In [40]:
xt = X_test_sc[0,:] # x1,x2,x3,x4
xt

array([-0.77101313,  0.49720103, -0.99288247,  0.99288247])

In [43]:
model_logistic.coef_ #b1,b2,b3,b4

array([[ 2.10175832,  1.07208739, -0.05714272,  0.05714272]])

In [45]:
model_logistic.intercept_ # b0

array([-1.01019503])

In [46]:
2.108*(-0.77) + 1.07*0.50 + (-0.06)*(-0.99) + 0.06*0.99 + (-1.01)

-1.9793600000000002