Two approaches for linear regression: 1. Gradient Descent (Batch, Stochastic, Mini-Batch), used in many ML algorithms; 2. Normal Equation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# create a testing data pair: y = x + uniform noise in [0, 2)
np.random.seed(0)  # fixed seed so the synthetic data (and every plot built on it) is reproducible
x = 4 * np.random.rand(20, 1)
y = x + 2 * np.random.rand(20, 1)
# seaborn >= 0.12 accepts x and y only as keywords; pass the single column of
# each (20, 1) array as a flat 1-D vector instead of a list of 1-element arrays.
sns.scatterplot(x=x[:, 0], y=y[:, 0]);

#### Gradient Descent approach
- hypothesis $h_\theta$: $h_\theta = \theta_0 + \theta_1 \cdot x$ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;      *linear regression assumes the data fits a straight line $y = a\cdot x + b$*
- cost function $J(\theta_0, \theta_1)$: $J(\theta_0, \theta_1) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x_i) - y_i)^2$  &nbsp; &nbsp;     *it is half the mean of the squared errors, and has the same form as the total sum of squares $SST = \sum_{i=1}^{m}(y_i - \bar{y})^2$* **the difference is that $h_\theta(x_i)$ replaces $\bar{y}$**; thus minimizing the cost function is the same as minimizing the error, leading to the best fit
- gradient descent: $\theta_j = \theta_j - \alpha\cdot\frac{\partial}{\partial \theta_j}J(\theta_0, \theta_1)$
    - $\theta_j$ is the $j^{th}$ parameter
    - $\alpha$ is the learning rate
    - $\frac{\partial}{\partial \theta_j}J(\theta_0, \theta_1)$ is the partial derivative of the cost function (error) over the $j^{th}$ parameter, this term will become zero when all parameters are optimized.
    - each iteration we adjust $\theta_j$ according to the derivative by the scale of the learning rate $\alpha$
        - when $\alpha$ is too small, it can take a long time for the cost function value to decrease to the minimum
        - when $\alpha$ is too big, the cost function value can overshoot the local minimum, oscillate around it, and fail to converge
- gradient descent after taking the derivative: $\theta_j = \theta_j - \frac{\alpha}{m} \sum_{i=1}^{m} [(h_\theta (x_i) - y_i) x_i]$ &nbsp; (for $\theta_0$ the factor $x_i$ is replaced by $1$)
- gradient descent broken down for each parameter: \begin{align}
 \theta_0 & = \theta_0 - \alpha \frac{1}{m} \sum_{i=1}^{m} (h_\theta(x_i) - y_i) \\
\theta_1 & = \theta_1 - \alpha \frac{1}{m} \sum_{i=1}^{m} ((h_\theta(x_i) - y_i) \cdot x_i)
\end{align}


In [None]:
# %%timeit
# iterative implementation

theta_0 = theta_1 = 0  # both parameters start at zero


def h(x):
    """Hypothesis: the line defined by the current global theta_0 and theta_1."""
    return theta_0 + theta_1 * x


lr = 0.05    # learning rate
epochs = 20  # number of gradient-descent steps
costs = []   # cost recorded after each step
paras = []   # (theta_0, theta_1) recorded after each step

def cal_sum(h, x, y):
    """Accumulate the two sums used by the gradient-descent updates.

    For every index ``i`` the residual is ``h(x[i]) - y[i]``.

    Returns a pair: (sum of residuals, sum of residuals multiplied by x[i]).
    """
    residual_total = weighted_total = 0
    for idx in range(len(x)):
        residual = h(x[idx]) - y[idx]
        residual_total += residual
        weighted_total += residual * x[idx]
    return residual_total, weighted_total

def cal_cost(h, x, y):
    """Return the cost J: the sum of squared residuals divided by ``2 * len(x)``.

    The residual at index ``i`` is ``h(x[i]) - y[i]``.
    """
    n_samples = len(x)
    squared_errors = ((h(x[k]) - y[k]) ** 2 for k in range(n_samples))
    return sum(squared_errors) / (2 * n_samples)

# One batch gradient-descent step per epoch; both sums are computed from the
# current parameters before either parameter is updated.
for epoch in range(epochs):
    grad_sum_0, grad_sum_1 = cal_sum(h, x[:, 0], y[:, 0])
    theta_0 = theta_0 - lr / len(x) * grad_sum_0
    theta_1 = theta_1 - lr / len(x) * grad_sum_1
    paras.append((theta_0, theta_1))             # parameter history, used for plotting
    costs.append(cal_cost(h, x[:, 0], y[:, 0]))  # cost after this step


In [None]:
# Cost recorded after each epoch of the loop-based gradient descent.
plt.plot(costs, 'go-')
plt.xlabel('epoch')
plt.ylabel('cost J')
plt.title('Iterative gradient descent: cost per epoch');

In [None]:
plt.plot(x, y, 'bo')


def plot_line(t0, t1, x):
    """Draw the line ``t0 + t1 * x`` on the current axes in a random colour.

    The line is sampled at integer abscissas running from one below the
    truncated minimum of ``x`` to one above its truncated maximum.

    Parameters
    ----------
    t0, t1 : intercept and slope of the line.
    x : array-like of data abscissas; only its minimum and maximum are used.
    """
    # np.min / np.max reduce over every element and return a scalar, so int()
    # never receives a 1-element array (that conversion is deprecated in NumPy).
    lower = int(np.min(x)) - 1
    upper = int(np.max(x)) + 1
    x_values = list(range(lower, upper + 1))
    y_values = [t0 + t1 * x_value for x_value in x_values]
    color = list(np.random.random(size=3))  # random RGB triple per line
    plt.plot(x_values, y_values, c=color)


# overlay the fitted line after every gradient-descent step
for t0, t1 in paras:
    plot_line(t0, t1, x)

In [None]:
# Design matrix: a leading column of ones (the constant term) followed by x.
X = np.concatenate((np.ones((x.shape[0], 1)), x), axis=1)

In [None]:
# %%timeit

# implementation with linear algebra
# for a quick review of the linear algebra relevant to this part: https://www.holehouse.org/mlclass/03_Linear_algebra_review.html; https://www.youtube.com/watch?v=Dft1cqjwlXE&list=PLLssT5z_DsK-h9vYZkQkYNWcItqhlRJLN&index=13&t=0s 
def gradientDescent(X, y, theta, alpha, num_iters):
    """
    Run batch gradient descent for linear regression and return the learned theta.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (one row per training example).
    y : array-like with m values
        Targets; may be 1-D or a column, it is reshaped to match ``X @ theta``.
    theta : ndarray, shape (n,) or (n, 1)
        Initial parameters.
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient steps to take.

    Returns
    -------
    ndarray
        Updated parameters, same shape as ``theta``.
    """
    y = np.asarray(y)
    m = y.size  # number of training examples
    for _step in range(num_iters):
        y_hat = np.dot(X, theta)
        # Align y with the prediction's shape: subtracting a flat (m,) vector
        # from an (m, 1) column would broadcast to an (m, m) matrix.
        residual = y_hat - y.reshape(y_hat.shape)
        theta = theta - alpha * (1.0 / m) * np.dot(X.T, residual)
    return theta

gd_costs, gd_paras = [], []  # per-step cost and parameter history
theta = np.zeros((2, 1))
# 20 single-step gradient-descent calls, so the cost and parameters after every
# step can be compared with the non-linear-algebra approach
epochs = 20

for epoch in range(epochs):
    theta = gradientDescent(X, y, theta, 0.05, 1)
    gd_paras.append(theta)
    step_cost = cal_cost(lambda v: theta[0] + theta[1] * v, x[:, 0], y[:, 0])
    gd_costs.append(step_cost)


Linear algebra has no speed advantage in this scenario (small number of features and small number of training examples)

In [None]:
# Overlay both cost histories; matching curves mean the two implementations agree.
plt.plot(gd_costs, 'go-', label='linear algebra')
plt.plot(costs, 'r+', label='iterative')
plt.xlabel('epoch')
plt.ylabel('cost J')
plt.legend();

In [None]:
plt.plot(x, y, 'bo')  # raw observations

# one line per recorded gradient-descent step of the linear-algebra version
for intercept, slope in gd_paras:
    plot_line(intercept, slope, x)

In [None]:
fig = plt.figure(figsize=(18, 9))
plt.subplots_adjust(hspace=.5)

# Left panel: loop-based history; right panel: linear-algebra history.
panels = [(paras, 'Iterative approach'), (gd_paras, 'Linear Algebra approach')]
for col, (history, panel_title) in enumerate(panels):
    plt.subplot2grid((1, 2), (0, col))
    plt.plot(x, y, 'bo')
    for t0, t1 in history:
        plot_line(t0, t1, x)
    plt.title(panel_title)

plt.show();

#### Normal Equation approach
- hypothesis $h_\theta$: $h_\theta = \theta_0 + \theta_1 \cdot x$
- parameters $\theta = (X^T X)^{-1} X^T y$

In [None]:
# theta = (X^T X)^{-1} X^T y, evaluated left to right
X_T = X.T
inverse = np.linalg.inv(X_T @ X)
theta = inverse @ X_T @ y
print(theta)
# cost of the closed-form solution
print(cal_cost(lambda v: theta[0] + theta[1] * v, x[:, 0], y[:, 0]))

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Load the suicide-rates CSV and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/suicide-rates-overview-1985-to-2016/master.csv'
)
df.head()

In [None]:
# Frame dimensions, then the distinct countries and how many there are.
print(df.shape)
country_col = df['country']
print(country_col.unique(), country_col.nunique())

In [None]:
# Rename the columns whose names contain spaces, slashes or symbols so they
# can be used as attributes (e.g. df.gdp_per_capita). Reassigning the result
# instead of using inplace=True avoids mutating the frame object in place.
df = df.rename(columns={'suicides/100k pop': 'suicides_per_100k_pop',
                        ' gdp_for_year ($) ': 'gdp_for_year',
                        'gdp_per_capita ($)': 'gdp_per_capita'})
df.head()

In [None]:
# Mean suicide rate per country, with 'country' kept as a regular column.
country_rate = df.groupby('country')['suicides_per_100k_pop'].mean().reset_index()

In [None]:
# Mean GDP per capita per country, joined to the mean suicide rate on 'country'.
country_gdp_cap = df.groupby('country')['gdp_per_capita'].mean().reset_index()
new_df = pd.merge(left=country_rate, right=country_gdp_cap, on='country')
new_df

In [None]:
# Per-country mean suicide rate against mean GDP per capita.
sns.scatterplot(x='gdp_per_capita', y='suicides_per_100k_pop', data=new_df);

In [None]:
# Same variables, drawn with seaborn's lmplot for reference.
sns.lmplot(x='gdp_per_capita', y='suicides_per_100k_pop', data=new_df);

Gradient Descent approach for linear regression
***************

In [None]:
# Scale GDP per capita down by 10000: the raw values turned out to be too big
# and sent the computed y values out of range.
x = new_df['gdp_per_capita'] / 10000
y = new_df['suicides_per_100k_pop']

In [None]:
# iterative implementation

theta_0 = theta_1 = 0  # both parameters start at zero


def h(x):
    """Hypothesis: the line defined by the current global theta_0 and theta_1."""
    return theta_0 + theta_1 * x


lr = 0.05     # learning rate
epochs = 200  # number of gradient-descent steps
costs = []    # cost recorded after each step
paras = []    # (theta_0, theta_1) recorded after each step

# NOTE: same definition as cal_sum in the first iterative-implementation cell.
def cal_sum(h, x, y):
    """Accumulate the two sums used by the gradient-descent updates.

    For every index ``i`` the residual is ``h(x[i]) - y[i]``.

    Returns a pair: (sum of residuals, sum of residuals multiplied by x[i]).
    """
    residual_total = weighted_total = 0
    for idx in range(len(x)):
        residual = h(x[idx]) - y[idx]
        residual_total += residual
        weighted_total += residual * x[idx]
    return residual_total, weighted_total

# NOTE: same definition as cal_cost in the first iterative-implementation cell.
def cal_cost(h, x, y):
    """Return the cost J: the sum of squared residuals divided by ``2 * len(x)``.

    The residual at index ``i`` is ``h(x[i]) - y[i]``.
    """
    n_samples = len(x)
    squared_errors = ((h(x[k]) - y[k]) ** 2 for k in range(n_samples))
    return sum(squared_errors) / (2 * n_samples)

# One batch gradient-descent step per epoch; both sums are computed from the
# current parameters before either parameter is updated.
for epoch in range(epochs):
    grad_sum_0, grad_sum_1 = cal_sum(h, x, y)
    theta_0 = theta_0 - lr / len(x) * grad_sum_0
    theta_1 = theta_1 - lr / len(x) * grad_sum_1
    paras.append((theta_0, theta_1))  # parameter history, used for plotting
    costs.append(cal_cost(h, x, y))   # cost after this step

In [None]:
# Cost recorded after each epoch of the loop-based gradient descent.
plt.plot(costs, 'go-')
plt.xlabel('epoch')
plt.ylabel('cost J')
plt.title('Iterative gradient descent: cost per epoch');

In [None]:
plt.plot(x, y, 'bo')  # raw observations

# one line per recorded gradient-descent step of the loop-based version
for intercept, slope in paras:
    plot_line(intercept, slope, x)

In [None]:
# Design matrix: a leading column of ones (the constant term) followed by x,
# which is wrapped in a DataFrame so it is concatenated as a single column.
X = np.concatenate((np.ones((x.shape[0], 1)), pd.DataFrame(x)), axis=1)

In [None]:
# implementation with linear algebra
# for a quick review of the linear algebra relevant to this part: https://www.holehouse.org/mlclass/03_Linear_algebra_review.html; https://www.youtube.com/watch?v=Dft1cqjwlXE&list=PLLssT5z_DsK-h9vYZkQkYNWcItqhlRJLN&index=13&t=0s 
def gradientDescent(X, y, theta, alpha, num_iters):
    """
    Run batch gradient descent for linear regression and return the learned theta.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (one row per training example).
    y : array-like with m values
        Targets (ndarray, list or pandas Series; 1-D or a column). They are
        reshaped to match ``X @ theta``, so no DataFrame wrapping is needed.
    theta : ndarray, shape (n,) or (n, 1)
        Initial parameters.
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient steps to take.

    Returns
    -------
    ndarray
        Updated parameters, same shape as ``theta``.
    """
    y = np.asarray(y)
    m = y.size  # number of training examples
    for _step in range(num_iters):
        y_hat = np.dot(X, theta)
        # Align y with the prediction's shape: subtracting a flat (m,) vector
        # from an (m, 1) column would broadcast to an (m, m) matrix.
        residual = y_hat - y.reshape(y_hat.shape)
        theta = theta - alpha * (1.0 / m) * np.dot(X.T, residual)
    return theta

gd_costs, gd_paras = [], []  # per-step cost and parameter history
theta = np.zeros((2, 1))
# 200 single-step gradient-descent calls, so the cost and parameters after
# every step can be compared with the non-linear-algebra approach
epochs = 200

for epoch in range(epochs):
    theta = gradientDescent(X, y, theta, 0.05, 1)
    gd_paras.append(theta)
    step_cost = cal_cost(lambda v: theta[0] + theta[1] * v, x, y)
    gd_costs.append(step_cost)

In [None]:
# Overlay both cost histories; matching curves mean the two implementations agree.
plt.plot(gd_costs, 'go-', label='linear algebra')
plt.plot(costs, 'r+', label='iterative')
plt.xlabel('epoch')
plt.ylabel('cost J')
plt.legend();

In [None]:
plt.plot(x, y, 'bo')  # raw observations

# one line per recorded gradient-descent step of the linear-algebra version
for intercept, slope in gd_paras:
    plot_line(intercept, slope, x)

In [None]:
fig = plt.figure(figsize=(12, 6))
plt.subplots_adjust(hspace=.5)

# Left panel: loop-based history; right panel: linear-algebra history.
panels = [(paras, 'Iterative approach'), (gd_paras, 'Linear Algebra approach')]
for col, (history, panel_title) in enumerate(panels):
    plt.subplot2grid((1, 2), (0, col))
    plt.plot(x, y, 'bo')
    for t0, t1 in history:
        plot_line(t0, t1, x)
    plt.title(panel_title)

plt.show();

Normal Equation approach
source: https://medium.com/@dikshitkathuria1803/normal-equation-using-python-5993454fbb41
****************
$\theta = (X^T X)^{-1} (X^T y)$

In [None]:
# Normal equation, one factor at a time: para = (X^T X)^{-1} (X^T y)
x_transpose = X.T                               # X^T
x_transpose_dot_x = np.dot(x_transpose, X)      # X^T X
temp_1 = np.linalg.inv(x_transpose_dot_x)       # (X^T X)^{-1}

temp_2 = np.dot(x_transpose, y)                 # X^T y

para = np.dot(temp_1, temp_2)
para

In [None]:
plt.plot(x, y, 'bo')  # raw observations
# para holds the normal-equation solution: constant term first, then the x coefficient
intercept, slope = para
plot_line(intercept, slope, x)