# Lab 04-2: Logistic Regression
## Exercise: Predicting Iris Species

### Prepare IRIS Dataset

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

iris = load_iris()

# Features (iris.data) come as four columns:
#   sepal length (cm) / sepal width (cm) / petal length (cm) / petal width (cm)
# Labels (iris.target) come as a single column:
#   species code 0, 1, 2 = setosa, versicolor, virginica
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_tf = pd.DataFrame(iris.target, columns=['species'])

# Switch controlling whether the virginica samples are discarded.
# What will happen if we don't remove virginica data?
remove_virginica = True

if remove_virginica:
    # Index labels 100..149 are dropped from the features and the labels alike,
    # so the two frames stay aligned row for row.
    iris_df = iris_df.drop(index=range(100, 150))
    iris_tf = iris_tf.drop(index=range(100, 150))

# Species codes: 0 = setosa, 1 = versicolor, 2 = virginica.
# Versicolor is the positive class: code 1 maps to 1, every other code maps to 0.
def converter(species):
    """Return 1 when `species` equals 1 (versicolor), otherwise 0."""
    return 1 if species == 1 else 0

# Replace the species codes with the 0/1 target produced by converter().
iris_tf = iris_tf.assign(species=iris_tf['species'].apply(converter))

# Plain NumPy views of the data: vX is the 2-D feature matrix,
# vY is the label column flattened to 1-D.
vX = iris_df.to_numpy()
vY = iris_tf.to_numpy().reshape(-1)

### Presenting Dataset Samples

In [2]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max)
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,100.0,100.0,100.0,100.0
mean,5.471,3.099,2.861,0.786
std,0.641698,0.478739,1.449549,0.565153
min,4.3,2.0,1.0,0.1
25%,5.0,2.8,1.5,0.2
50%,5.4,3.05,2.45,0.8
75%,5.9,3.4,4.325,1.3
max,7.0,4.4,5.1,1.8


In [3]:
# Show the flattened 0/1 label vector (1 = versicolor)
print(vY)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


### Splitting Data for Training and Testing

In [4]:
# We can use train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    vX,
    vY,
    test_size=0.20,
    random_state=101,
)

### Logistic Regression

$$h(x^{(i)}) = \text{sigmoid} (Wx^{(i)} + b), \qquad
J = -{1 \over n} \sum_{i=1}^{n} \left(y^{(i)} (Wx^{(i)} + b) - \log(1+e^{Wx^{(i)} + b}) \right)$$

$${\partial J \over \partial W_j} = {1 \over n} \sum_{i=1}^{n} \left(\left(h(x^{(i)}) - y^{(i)}\right) \cdot x_j^{(i)}\right), \qquad
{\partial J \over \partial b} = {1 \over n} \sum_{i=1}^{n} \left(h(x^{(i)}) - y^{(i)}\right)$$

### Training Model with Logistic Regression

In [5]:
class myLogisticRegression:
    """Parameter container for the logistic regression model.

    Stores four scalar weights (wgt0 .. wgt3) and one bias term; all of them
    start at 0.0 and are meant to be overwritten during training.
    """

    def __init__(self):
        # All weights share the same starting value of zero.
        self.wgt0 = self.wgt1 = self.wgt2 = self.wgt3 = 0.0
        self.bias = 0.0

In [None]:
# define functions
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The textbook form computes exp(-x) directly, which overflows (and emits a
    RuntimeWarning) once x drops below roughly -709. Here the value is obtained
    as exp(-log(1 + exp(-x))), with the inner term computed by np.logaddexp,
    which stays finite for any finite x.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Linear score(s) to squash.

    Returns
    -------
    Same shape as `x`, with every value in [0, 1].
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0, -x))
    
# weight for 4 input variables: sepal length, sepal width, petal length, petal width
m = myLogisticRegression()

# define learning rate & number of epochs
alpha = 0.001
n_epochs = 500

# Full-batch gradient descent: each epoch uses every training sample once.
for epoch in range(n_epochs):
    ### Training Weights
    ### START CODE HERE ###

    y_lin  = m.wgt0 * X_train[:,0] + m.wgt1 * X_train[:,1] + m.wgt2 * X_train[:,2] + m.wgt3 * X_train[:,3] + m.bias    # Linear Prediction
    y_hat  = sigmoid(y_lin)    # Take Logistic Probability
    ydiff  = y_hat - y_train    # Find Differences (prediction minus label)
    # Every update below reuses the same ydiff, so all parameters move
    # simultaneously based on the model as it was at the start of the epoch.
    m.wgt0 = m.wgt0 - alpha * np.mean(ydiff * X_train[:,0])    # Update Weights with Gradients
    m.wgt1 = m.wgt1 - alpha * np.mean(ydiff * X_train[:,1])
    m.wgt2 = m.wgt2 - alpha * np.mean(ydiff * X_train[:,2])
    m.wgt3 = m.wgt3 - alpha * np.mean(ydiff * X_train[:,3])
    m.bias = m.bias - alpha * np.mean(ydiff)     # Update Bias

    ### END CODE HERE ###

    # Print Loss Value (every 100th epoch, measured after that epoch's update)
    if((epoch+1)%100==0):
        ### START CODE HERE ###

        y_lin  = m.wgt0 * X_train[:,0] + m.wgt1 * X_train[:,1] + m.wgt2 * X_train[:,2] + m.wgt3 * X_train[:,3] + m.bias    # Linear Prediction
        # np.logaddexp(0, z) equals log(1 + exp(z)) but does not overflow to inf
        # when z is large, so the reported loss stays finite.
        loss_J = -np.mean(y_train * y_lin - np.logaddexp(0, y_lin))    # Calculate Loss J

        ### END CODE HERE ###
        print('Epoch: %5d,  loss: %10.8f' % (epoch+1, loss_J))

Epoch:   100,  loss: 0.63201186
Epoch:   200,  loss: 0.62505847
Epoch:   300,  loss: 0.62114083
Epoch:   400,  loss: 0.61769903
Epoch:   500,  loss: 0.61455430


### Evaluate Model Performance

In [None]:
def my_predict(m, X_test):
    """Return the model's predicted probability for each row of `X_test`.

    Parameters
    ----------
    m : object exposing wgt0, wgt1, wgt2, wgt3 and bias
        The trained parameters.
    X_test : numpy.ndarray
        2-D array whose columns 0..3 are multiplied by wgt0..wgt3.

    Returns
    -------
    The sigmoid of the linear score, one value per row.
    """
    ### START CODE HERE ###

    # Accumulate the weighted columns left to right, then add the bias.
    weights = (m.wgt0, m.wgt1, m.wgt2, m.wgt3)
    score = weights[0] * X_test[:, 0]
    for col in (1, 2, 3):
        score = score + weights[col] * X_test[:, col]
    score = score + m.bias    # Linear Prediction

    y_pred = sigmoid(score)    # Find Probability

    ### END CODE HERE ###
    return y_pred

from sklearn.metrics import accuracy_score

def decision(x):
    """Turn probabilities into hard 0/1 class labels using a 0.5 threshold.

    Values greater than or equal to 0.5 become 1, everything else becomes 0.
    The input is left untouched: a new int64 array is returned, so the caller
    can keep using the original probabilities afterwards.

    Parameters
    ----------
    x : array-like of float
        Predicted probabilities.

    Returns
    -------
    numpy.ndarray of int64 with the same shape as `x`.
    """
    # The boolean comparison allocates a fresh array, so `x` is never modified.
    return (np.asarray(x) >= 0.5).astype(np.int64)
    
# Probabilities for the held-out samples, then thresholded into 0/1 labels.
y_prob = my_predict(m, X_test)
y_pred = decision(y_prob)
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
accuracy_score(y_test, y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


0.6

### Logistic Regression from scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression

# Reference model with scikit-learn's default settings, for comparison.
lr = LogisticRegression()

# Training/Fitting the Model
lr.fit(X_train, y_train)

# Making Predictions
s_pred = lr.predict(X_test)

# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
accuracy_score(y_test, s_pred)

0.6333333333333333

### Test Model with a random sample


In [None]:
# Pick one held-out sample at random (unseeded, so it differs between runs) and
# keep a leading axis of length 1 so it is still a 2-D batch for both models.
idx = np.random.randint(X_test.shape[0])
test_in = np.expand_dims(X_test[idx], axis=0)

# Label 1 is always versicolor. Label 0 is "everything else": that is setosa
# only when the virginica rows were removed; otherwise it also covers virginica,
# so calling it 'setosa' would be wrong.
species = ['setosa' if remove_virginica else 'not versicolor', 'versicolor']

y_pred = decision(my_predict(m, test_in))
s_pred = lr.predict(test_in)

print('My prediction for Iris Species:', species[y_pred[0]])
print('SK prediction for Iris Species:', species[s_pred[0]])
print('Actual Iris Species:', species[y_test[idx]])

My prediction for Iris Species: setosa
SK prediction for Iris Species: setosa
Actual Iris Species: setosa
