# In this assignment, you will implement linear regression with one variable.
### 100 points total

In [399]:
# clear all variables
%reset_selective -f a
#import 
import numpy as np
import matplotlib.pyplot as plt
from numpy import loadtxt
%matplotlib qt

## Load data from data.csv file (5 Points)


In [400]:
# Complete the function below to load the data from data.csv. 
# Return [X,y] where X is the input and y is the target
def load_data(file_name):
    # write your code here
    X, y = np.loadtxt(file_name, dtype='float,float', delimiter=',', usecols=(0, 1), unpack=True)
    return X,y

In [401]:
[X,y] = load_data("data.csv")

### Visualise the data (5 Points)

In [402]:
def vis_data(X,y):
    # write your code here
    fig = plt.figure(figsize = (10,10))
    plt.scatter(X,y)
    plt.show()

In [403]:
vis_data(X,y)

## Implement a loss function (10 Points)

In [404]:
# y_true --> the target values.
# y_pred --> the predicted values
def loss(y_true, y_pred):    
    #Calculating loss.
    diff = y_pred - y_true
    loss = (1 / (2*len(y_pred))) * np.sum((diff)**2)
    return loss

## Test loss function

In [405]:
loss(np.array([5,2]),np.array([10,3]))

6.5

## Implement a function to calculate gradients (20 Points)

In [406]:
#Input:
# X --> Input.
# y_true --> target values.
# y_pred --> predictions.
#return:
# dw --> the gradient with respect to the weights
# db --> the gradient with respect to the bias.
def gradients(X, y_true, y_pred):
    # write your code here
    # calcualte the gradient
    dw = ( 1 / len(X) ) * np.dot(X.T, y_pred - y_true)
    db = ( 1 / len(X) ) * np.sum(y_pred - y_true)
    return dw, db

## Test gradients

In [407]:
dw,db = gradients(np.array([5]),np.array([1.5]),np.array([1.1]))
print(f'dw = {dw} , db = {db}')

dw = -1.9999999999999996 , db = -0.3999999999999999


## Write a function that uses your loss and gradients to train a LR model (25 Points)

In [408]:
# X --> Input.
# y --> true/target value.
# add more arguments as you need
def train(X, y, learning_rate=0.01, num_iterations=1000):
    # write your code here
    
    # set the default parameter
    w, b = 0, 0
    dif_loss = np.Infinity # using for stop iterations
    loss_func_cur = None
    cost_history = []
    
    print(f"Setting iterations: {num_iterations} and learning rate: {learning_rate}")
    
    for i in range(num_iterations):
        
        # get y_prediction
        y_pred = np.dot(X, w) + b
        
        # get loss
        loss_func = loss(y, y_pred)
        if loss_func_cur != None: 
            dif_loss = loss_func - loss_func_cur
        loss_func_cur = loss_func
        cost_history.append(loss_func_cur)
        
        # get Gradients
        dw,db = gradients(X,y,y_pred)
        
        # updated w, b
        w -= learning_rate * dw
        b -= learning_rate * db
        
        # visual the output
        if i%100 == 0:
            print(f"round {i}: {w,b} , loss = {loss_func}, diff loss = {dif_loss} ")
        
        # stop the iteration if results give the optimal likelihood
        if abs(dif_loss) < 1e-6:
            print(f"The maximum iterative round {i}: {w,b} , loss = {loss_func}, diff loss = {dif_loss} ")
            break
    # returning weights, bias and losses(List)
    return w, b, loss_func, cost_history, i, learning_rate

In [416]:
w, b, loss_func, cost_history, number_of_iterations, learning_rate  = train(X, y, 0.5,1000)

Setting iterations: 1000 and learning rate: 0.5
round 0: (1.846086771454969, 4.200808417490454) , loss = 36.47935578592967, diff loss = inf 
round 100: (-4.747664068332094, 10.719994637527323) , loss = 0.12072162089099499, diff loss = -0.00022021120151288764 
The maximum iterative round 179: (-4.988698254141816, 10.846755407273495) , loss = 0.11765032913298663, diff loss = -9.482975257546489e-07 


## Write a function to use your model to predict (15 Points)

In [417]:
def predict(X, w, b):
    # write your code here
    y_pred = np.dot(X, w) + b
    # Returning predictions.
    return y_pred

### Visualise your predictions

In [418]:
fig = plt.figure(figsize=(8,6))
plt.plot(X, y, 'y.')
plt.plot(X, predict(X, w, b), 'r.')
plt.legend(["Data", "Predictions"])
plt.xlabel('X - Input')
plt.ylabel('y - target / true')
plt.title('Regression')

Text(0.5, 1.0, 'Regression')

## Calculate the fit score

In [419]:
from sklearn.metrics import r2_score

In [420]:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true)**2))

y_true = y
y_pred = predict(X, w, b)
model_accuracy_score = r2_score(y_true, y_pred)
model_RMSE_score = rmse(y_true, y_pred)

### Use scikit-learn to fit a linear regression model using the data from data,csv (20 points)

In [421]:
# write your code here
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
df_x = pd.DataFrame(X)
df_y = pd.DataFrame(y)

reg = LinearRegression().fit(df_x, df_y)

# Summary section to compare self-implemented and scikitlearn lib
print("Modeling with Scikit-learn - linear regression")
print(f" The training accuracy equal to {reg.score(df_x, df_y)}")
print(f" The training RMSE equal to {rmse(df_y,reg.predict(df_x))}")
print(f" The paramter: w = {reg.coef_} and b = {reg.intercept_}")

print("--------------------------------------------")
print("Modeling with pratical implemented linear regression ")
print(f"Setting iterations: {number_of_iterations} and learning rate: {learning_rate}")
print(f" The training accuracy equal to {model_accuracy_score}")
print(f" The training RMSE equal to {model_RMSE_score}")
print(f" The paramter: w = {w} and b = {b}")


Modeling with Scikit-learn - linear regression
 The training accuracy equal to 0.9007929514755091
 The training RMSE equal to 0    0.485051
dtype: float64
 The paramter: w = [[-5.00562638]] and b = [10.85565797]
--------------------------------------------
Modeling with pratical implemented linear regression 
Setting iterations: 179 and learning rate: 0.5
 The training accuracy equal to 0.9007824958284781
 The training RMSE equal to 0.4850761672651886
 The paramter: w = -4.988698254141816 and b = 10.846755407273495


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
