# In thise code we will solve a Linear Classification (Logistic Regression) model, using two approaches:
1. Direct optimization. In this approach we create the Cross Entropy Loss function, and solve it using Python's optimization packages.
2. Using a Python library to build a classification model.

In [25]:
import numpy as np
import pandas as pd

In [3]:
# this step is to read data on Google Drive
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Data shared on eLearning
income_data=pd.read_csv("drive/My Drive/Income_Data.csv")

In [8]:
# remove missing values - see Linear Model code for data processing
income_data = income_data[(income_data.Salary.notnull()) & (income_data.Age.notnull()) & (income_data["Years of Experience"].notnull())]

In [9]:
# check
income_data.shape

(6699, 6)

In [15]:
# Since we intend to run a binary classification model, we need binary target variable.
# To do so, we define a binary variable (Purchase) that shows whether customer made a purchase or not.
# Model's output would be the Probability of purchase conditional on Xs (here Age and Years of Experience).
# We use Salary column to simulate Purchase, and assume Purchase is 1 when Salary is higher than or equal to $180,000, and 0 otherwise.

income_data["Purchase"] = np.where(income_data.Salary >= 180000, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_data["Purchase"] = np.where(income_data.Salary >= 180000, 1, 0)


In [16]:
# check
income_data.tail(5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Purchase
6697,49,Female,PhD,Director of Marketing,20.0,200000.0,1
6698,32,Male,High School,Sales Associate,3.0,50000.0,0
6699,30,Female,Bachelor's Degree,Financial Manager,4.0,55000.0,0
6700,46,Male,Master's Degree,Marketing Manager,14.0,140000.0,0
6701,26,Female,High School,Sales Executive,1.0,35000.0,0


In [17]:
# Calculate Response (Purchase) rate
income_data[income_data.Purchase == 1].shape[0]/income_data.shape[0]

0.1767427974324526

In [19]:
# define matrix of features, and add a column of 1s for intercept
Xs = income_data[["Age", "Years of Experience"]]
new_column = np.ones(income_data.shape[0])
Xs.insert(loc=0, column='Intercept', value=new_column)

# define vector of target variable
Y = income_data.Purchase

In [27]:
# method 1: direct optimization

from scipy.optimize import least_squares as lsq
from scipy.optimize import minimize


# define Cross Entropy function to be minimized
def Cross_Entropy (Beta):
    exp_x_beta = np.exp(np.dot(Xs,Beta))
    pi = exp_x_beta/(1+exp_x_beta)
    each_row = Y*np.log(pi)+(1-Y)*np.log(1-pi)
    return np.sum(each_row)*(-1)

# use two optimization methods
lr_us_1 = minimize(Cross_Entropy , [0,0,0],method='BFGS')
lr_us_2 = lsq(Cross_Entropy , [0,0,0],method='dogbox')

# print coefficents
print(lr_us_1.x)
print(lr_us_2.x)

[-4.66801194  0.01281013  0.25276673]
[-0.05362522 -0.16406311  0.44008653]


In [28]:
# method 2 - SkLearn with 'lbfgs' solver
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs')
sk_lr1 = lr.fit(Xs, Y)

# print coefficents
sk_lr1.coef_

array([[1.57813939e-04, 1.28542096e-02, 2.52695169e-01]])

In [29]:
# method 2 - SkLearn with 'newton-cg' solver
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='newton-cg')
sk_lr2 = lr.fit(Xs,Y)

# print coefficents
sk_lr2.coef_

array([[-1.08738160e-05,  1.28540662e-02,  2.52697255e-01]])

In [50]:
# compare four methods; seems like lsq gives the lowest Loss

print (Cross_Entropy(sk_lr1.coef_[0]))
print (Cross_Entropy(sk_lr2.coef_[0]))
print (Cross_Entropy(lr_us_1.x))
print (Cross_Entropy(lr_us_2.x))

12465.853364795887
12465.129418855482
2126.6323886871414
2247.4956601030535
