# Linear Regression

Implementation of single variable linear regression from scratch

**Problem Statement**
Predict the price of a house given its size.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

In [2]:
# Read the raw housing records. Column names are supplied here, so the first
# line of the file is treated as data rather than as a header row.
data_file = 'data.txt'
df = pd.read_csv(
    data_file,
    sep=',',
    header=None,
    names=['size', 'no. of bedrooms', 'price'],
)

In [None]:
# Report how many records were loaded, then preview the first rows.
print(f"data size: {df.shape[0]}")
df.head()

In [None]:
def split_train_test(df, train_frac: float, seed=200):
    """
    Partition ``df`` into a random training subset and the remaining test rows.

    ``train_frac`` is the fraction of rows sampled for training and ``seed``
    fixes the random state so the same split is produced on every call.
    Returns ``(train, test)``, where ``test`` is ``df`` with the sampled
    index labels dropped.
    """
    sampled = df.sample(frac=train_frac, random_state=seed)
    remainder = df.drop(index=sampled.index)
    return sampled, remainder

In [None]:
# Hold out 20% of the rows for testing and show the size and head of each part
train_data, test_data = split_train_test(df, train_frac=0.8)
print(f"training data - {len(train_data)}", train_data.head(), sep="\n")
print(f"\ntest data - {len(test_data)}", test_data.head(), sep="\n")

## Cost function

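The cost function gives a sense of "how well our model is performing": the smaller the cost, the closer the predictions are to the actual values. <br>
It is defined as:

<font size=4> $J(\mathbf{w},b) = \frac{1}{2m} \sum_{i=1}^{m} \left(f_{\mathbf{w},b}(\mathbf{x}^{(i)}) - y^{(i)}\right)^2$</font>

where $f_{\mathbf{w},b}(\mathbf{x}) = \mathbf{w} \cdot \mathbf{x} + b$ is the model's prediction and $m$ is the number of training examples.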

In [None]:
def compute_cost(X, y, w, b):
    """
    Squared-error cost J(w, b) = 1 / (2m) * sum_i (w . x_i + b - y_i)^2.

    Args:
        X: array of shape (m, n); one row per example.
        y: array of shape (m,); target value for each row of X.
        w: array of shape (n,); model weights.
        b: scalar bias.

    Returns:
        The scalar cost.

    Raises:
        ValueError: if X contains no rows (the cost is undefined for m == 0).
    """
    m = X.shape[0]
    if m == 0:
        raise ValueError("compute_cost requires at least one example")

    # Residuals for every example at once; this function is called once per
    # gradient-descent iteration, so avoid a Python-level loop over the rows.
    residuals = np.dot(X, w) + b - y
    return np.sum(residuals ** 2) / (2 * m)

In [None]:
# Training arrays, both rescaled by 1/1000; X is a single-column 2-D matrix
X = train_data[['size']].to_numpy() / 1000
y = train_data['price'].to_numpy() / 1000

# Start from an all-zero model and report the cost it yields
w_initial = np.zeros(1)
b_initial = 0
cost = compute_cost(X, y, w_initial, b_initial)
print(f"with w={w_initial} b={b_initial} \n{cost=:.4f}")

In [None]:
# Scatter the training examples, with both columns divided by 1000
train_df = train_data[['size','price']] / 1000

plt.scatter(
    train_df['size'],
    train_df['price'],
    marker='x',
    c='r',
    label='Actual Values',
)

# Axis labels, title and legend
plt.xlabel('Size (1000 sqft)')
plt.ylabel('Price (in 1000s of dollars)')
plt.title("Housing Prices")
plt.legend()
plt.show()

## Gradient Descent

In [None]:
def compute_gradient(X, y, w, b):
    """
    Gradient of the squared-error cost with respect to the bias and weights.

    Args:
        X: array of shape (m, n); one row per example.
        y: array of shape (m,); target value for each row of X.
        w: array of shape (n,); model weights.
        b: scalar bias.

    Returns:
        (dj_db, dj_dw): the scalar derivative with respect to b and the
        shape-(n,) derivative with respect to w, in that order.

    Raises:
        ValueError: if X is not 2-D or contains no rows.
    """
    m, _ = X.shape  # unpacking also enforces that X is 2-D
    if m == 0:
        raise ValueError("compute_gradient requires at least one example")

    # Prediction error for every example at once.
    err = np.dot(X, w) + b - y

    # X.T @ err sums err_i * x_ij over the examples for each feature j,
    # replacing the nested Python loops over rows and features.
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m

    return dj_db, dj_dw

In [None]:
def gradient_descent(X, y, w_initial, b_initial, cost_function, gradient_function, alpha, num_iter):
    """
    Fit w and b by batch gradient descent.

    Args:
        X, y: training inputs and targets, passed through unchanged to
            cost_function and gradient_function.
        w_initial: starting weights (copied, never modified).
        b_initial: starting bias.
        cost_function: callable (X, y, w, b) -> scalar cost.
        gradient_function: callable (X, y, w, b) -> (dj_db, dj_dw).
        alpha: learning rate.
        num_iter: number of update steps to run.

    Returns:
        (w, b, j_history): final weights, final bias, and the cost recorded
        after each update step.
    """
    # Upper bound on recorded costs so the history list cannot grow unbounded.
    max_history = 10000000
    # Print progress about ten times per run. Clamp to 1 so that num_iter < 10
    # does not make the modulus below a division by zero.
    report_every = max(1, num_iter // 10)

    # Copy so the returned weights never alias the caller's array
    # (relevant when num_iter == 0 and no update is ever applied).
    w = copy.deepcopy(w_initial)
    b = b_initial
    j_history = []

    for i in range(num_iter):
        dj_db, dj_dw = gradient_function(X, y, w, b)

        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        if i < max_history:
            j_history.append(cost_function(X, y, w, b))

        if i % report_every == 0:
            print(f"Iteration {i:4d}: Cost {j_history[-1]:8.2f}")

    return w, b, j_history

In [None]:
# Fit the model on the scaled training arrays with batch gradient descent
w, b, j_history = gradient_descent(
    X,
    y,
    w_initial,
    b_initial,
    cost_function=compute_cost,
    gradient_function=compute_gradient,
    alpha=5.0e-6,
    num_iter=1000000,
)

In [None]:
def predict(x, w, b):
    """
    Evaluate the linear model: the dot product of ``x`` and ``w``, plus ``b``.
    """
    weighted = np.dot(x, w)
    return weighted + b

In [None]:
# Cost history in two panels: the first 100 steps, and everything from step 1000 on
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4), constrained_layout=True)

# Left panel: start of training
ax1.plot(j_history[:100])
ax1.set_title("Cost vs. iteration(start)")
ax1.set_xlabel('iteration step')
ax1.set_ylabel('Cost')

# Right panel: x values start at 1000 so the ticks show the true step number
ax2.plot(np.arange(1000, 1000 + len(j_history[1000:])), j_history[1000:])
ax2.set_title("Cost vs. iteration (end)")
ax2.set_xlabel('iteration step')
ax2.set_ylabel('Cost')

plt.show()

In [None]:
# Overlay the fitted line on the training examples (both columns divided by 1000)
train_df = train_data[['size','price']] / 1000

# Model output for each training size, using the learned w and b
plt.plot(
    train_df['size'],
    train_df['size'] * w[0] + b,
    label='Predicted Values',
)
# Observed prices
plt.scatter(
    train_df['size'],
    train_df['price'],
    marker='x',
    c='r',
    label='Actual Values',
)

# Axis labels, title and legend
plt.xlabel('Size (1000 sqft)')
plt.ylabel('Price (in 1000s of dollars)')
plt.title("Housing Prices")
plt.legend()
plt.show()

In [None]:
def compute_accuracy(X, y, w, b):
    """
    Mean of ``1 - |prediction - target| / |target|`` over all examples.

    A perfect fit scores 1.0. The relative error is measured against the
    magnitude of the target, so negative targets are penalised the same way
    as positive ones (dividing by the signed target would turn the error into
    a bonus). A target equal to zero leaves the relative error undefined;
    NumPy then produces inf/nan for that example.

    Args:
        X: array of shape (m, n); one row per example.
        y: array of shape (m,); target value for each row of X.
        w: array of shape (n,); model weights.
        b: scalar bias.

    Returns:
        The scalar mean score.

    Raises:
        ValueError: if X contains no rows.
    """
    m = X.shape[0]
    if m == 0:
        raise ValueError("compute_accuracy requires at least one example")

    predictions = np.dot(X, w) + b
    relative_error = np.abs(predictions - y) / np.abs(y)
    return np.mean(1 - relative_error)

In [None]:
# Score the fitted model on the held-out rows, scaled by 1/1000 like the training arrays.
# Note: this rebinds X and y to the test set.
X = test_data[['size']].to_numpy() / 1000
y = test_data['price'].to_numpy() / 1000
print(f'accuracy={compute_accuracy(X, y, w, b) * 100:.2f}%')