# Lab 04-1: Linear Regression
## Exercise: Predicting Sepal Length

### Prepare IRIS Dataset

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

iris = load_iris()

# iris.data holds four feature columns:
#   sepal length (cm) / sepal width (cm) / petal length (cm) / petal width (cm)
# iris.target holds one column of class codes:
#   species of (0,1,2) = (setosa, versicolor, virginica)
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_tf = pd.DataFrame(iris.target, columns=['species'])

# Append the species column to the right of the feature columns.
# pd.concat returns a new DataFrame, so iris_df is rebound to the combined table.
iris_df = pd.concat((iris_df, iris_tf), axis='columns')
# Replace class number with Iris name, just for information
def converter(species):
    """Translate a numeric Iris class code into its species name.

    Codes 0, 1 and 2 map to 'setosa', 'versicolor' and 'virginica'.
    Any other value yields the string 'Error'.
    """
    # The position in this tuple is the class code.
    names = ('setosa', 'versicolor', 'virginica')
    for code, label in enumerate(names):
        if species == code:
            return label
    return 'Error'
# Keep a human-readable copy in iris_tf. DataFrame.copy() is a deep copy by
# default, so relabelling species here leaves the numeric iris_df untouched.
iris_tf = iris_df.copy()
iris_tf['species'] = iris_tf['species'].map(converter)

# Sepal length is the regression target; every remaining column is an input.
# Both are converted from pandas to NumPy in the same step.
vX = iris_df.drop(columns='sepal length (cm)').to_numpy()
vY = iris_df['sepal length (cm)'].to_numpy()

### Presenting Dataset Samples

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the numeric table
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [None]:
# Show the copy whose species column holds names instead of class codes
print(iris_tf)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

       species  
0       se

### Splitting Data into Train and Test

In [None]:
# We can use train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Or we can define numpy version
def np_train_test_split(X_in, y_in, test_size=0.2, shuffle=True, random_state=42):
    """Split features and targets into train and test partitions using NumPy.

    Parameters
    ----------
    X_in : np.ndarray
        Feature array with samples along axis 0. It may be 1-D (a single
        feature) or have any number of trailing axes.
    y_in : np.ndarray
        Targets, with the same number of samples as ``X_in``.
    test_size : float
        Fraction of samples, in [0, 1], placed in the test partition. The
        test count is ``int(n_samples * test_size)`` (truncated).
    shuffle : bool
        When True, rows are permuted before splitting.
    random_state : int or None
        Seed handed to ``np.random.seed`` before the permutation is drawn.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The leading ``n_samples - test_num`` rows and the trailing
        ``test_num`` rows of the (optionally shuffled) inputs.

    Raises
    ------
    ValueError
        If ``X_in`` and ``y_in`` disagree on the number of samples, or if
        ``test_size`` lies outside [0, 1].
    """
    n_samples = X_in.shape[0]
    if len(y_in) != n_samples:
        raise ValueError(
            'X_in and y_in must have the same number of samples (%d != %d)'
            % (n_samples, len(y_in))
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError('test_size must be within [0, 1], got %r' % (test_size,))

    test_num = int(n_samples * test_size)
    train_num = n_samples - test_num
    if shuffle:
        # The global generator is seeded on purpose: the permutation (and any
        # later np.random call in the notebook) stays exactly as before.
        np.random.seed(random_state)
        shuffled = np.random.permutation(n_samples)
        # Index along axis 0 only, so 1-D feature arrays are handled as well.
        X_in = X_in[shuffled]
        y_in = y_in[shuffled]

    X_train = X_in[:train_num]
    X_test = X_in[train_num:]
    y_train = y_in[:train_num]
    y_test = y_in[train_num:]

    return X_train, X_test, y_train, y_test

# Split the NumPy arrays vX (inputs) and vY (target) into train and test sets, 20% held out for testing
X_train, X_test, y_train, y_test = np_train_test_split(vX, vY, test_size= 0.20, random_state= 101)

### Linear Regression with MSE

$$\hat{y}^{(i)} = Wx^{(i)} + b, \qquad
J = {1 \over n} \sum_{i=1}^{n} (\hat{y}^{(i)} - y^{(i)})^2$$

$${\partial J \over \partial W} = {2 \over n} \sum_{i=1}^{n} (\hat{y}^{(i)} - y^{(i)}) \cdot x^{(i)}, \qquad
{\partial J \over \partial b} = {2 \over n} \sum_{i=1}^{n} (\hat{y}^{(i)} - y^{(i)})$$

### Train Model with Linear Regression

In [None]:
class myLinearRegression:
    """Parameter container for a linear model with four inputs.

    Holds one weight per input (wgt0 .. wgt3) plus a bias term; every
    parameter starts at 0.0.
    """

    def __init__(self):
        # Zero-initialise the four weights, then the bias.
        self.wgt0 = self.wgt1 = self.wgt2 = self.wgt3 = 0.0
        self.bias = 0.0

In [27]:
# One weight per input column of X_train: sepal width, petal length, petal width, species
m = myLinearRegression()

# Hyper-parameters: gradient-descent step size and number of passes over the training set
alpha = 0.001
n_epochs = 10000

for epoch in range(n_epochs):
    ### START CODE HERE ###

    # Linear prediction for every training row at once
    y_hat = (m.wgt0 * X_train[:, 0] + m.wgt1 * X_train[:, 1]
             + m.wgt2 * X_train[:, 2] + m.wgt3 * X_train[:, 3] + m.bias)
    # Residuals; every update below uses this same pre-update error
    error = y_hat - y_train
    # Gradient step: dJ/dW_k = 2 * mean(error * x_k)
    m.wgt0 -= 2 * alpha * np.mean(error * X_train[:, 0])
    m.wgt1 -= 2 * alpha * np.mean(error * X_train[:, 1])
    m.wgt2 -= 2 * alpha * np.mean(error * X_train[:, 2])
    m.wgt3 -= 2 * alpha * np.mean(error * X_train[:, 3])
    # Gradient step for the bias: dJ/db = 2 * mean(error)
    m.bias -= 2 * alpha * np.mean(error)

    ### END CODE HERE ###

    # Report the training MSE every 1000 epochs (from the residuals computed above)
    if (epoch + 1) % 1000 == 0:
        mean_sq_er = np.mean(error**2)
        print('Epoch: %5d,  MSE: %10.8f' % (epoch+1, mean_sq_er))

Epoch:  1000,  MSE: 0.14676027
Epoch:  2000,  MSE: 0.13882586
Epoch:  3000,  MSE: 0.13250673
Epoch:  4000,  MSE: 0.12729516
Epoch:  5000,  MSE: 0.12297630
Epoch:  6000,  MSE: 0.11937853
Epoch:  7000,  MSE: 0.11636458
Epoch:  8000,  MSE: 0.11382460
Epoch:  9000,  MSE: 0.11167050
Epoch: 10000,  MSE: 0.10983163


### Evaluate Model Performance

In [28]:
def my_predict(m, X_test):
    """Predict targets with the four-weight linear model ``m``.

    Parameters
    ----------
    m : object
        Model exposing the attributes wgt0, wgt1, wgt2, wgt3 and bias.
    X_test : array-like
        Samples of shape (n_samples, 4), given as a NumPy array or nested
        sequences. A single 1-D sample of four values is also accepted and
        treated as one row.

    Returns
    -------
    np.ndarray
        1-D array with one prediction per sample (empty for empty input).
    """
    ### START CODE HERE ###

    X = np.asarray(X_test)
    if X.size == 0:
        # No samples: return an empty array, as the row-by-row version did.
        return np.array([])
    if X.ndim == 1:
        # A lone sample: promote it to a single-row matrix.
        X = X[np.newaxis, :]

    # Same vectorised expression, in the same term order, as the training loop.
    y_pred = m.wgt0 * X[:, 0] + m.wgt1 * X[:, 1] + m.wgt2 * X[:, 2] + m.wgt3 * X[:, 3] + m.bias

    ### END CODE HERE ###
    return y_pred

# Score the hand-written model on the held-out rows with mean squared error
y_pred = my_predict(m, X_test)
p_err = y_pred - y_test
mean_sq_err = np.mean(np.square(p_err))

print('MSE: %10.8f' % mean_sq_err)

MSE: 0.12616479


### Linear Regression with LSE from scikit-learn

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit the scikit-learn model on the training split (fit returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out split and report its mean squared error
s_pred = lr.predict(X_test)

print('MSE: %10.8f' % mean_squared_error(y_test, s_pred))

MSE: 0.10749214


### Test Model with a random sample


In [30]:
# Draw one test row at random and keep it 2-D, shape (1, n_features), for both predictors
idx = np.random.randint(X_test.shape[0])
test_in = X_test[idx].reshape(1, -1)

# Predict the same sample with the hand-written model and with scikit-learn
y_pred = my_predict(m, test_in)
s_pred = lr.predict(test_in)

print('My prediction for Sepal Length (cm):', y_pred[0])
print('SK prediction for Sepal Length (cm):', s_pred[0])
print('Actual Sepal Length (cm):', y_test[idx])

My prediction for Sepal Length (cm): 4.842170259661631
SK prediction for Sepal Length (cm): 4.884739394466205
Actual Sepal Length (cm): 5.0
