In [1]:
import numpy as np

In [2]:
# Read the comma-separated file into a NumPy array.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where
# that exact file exists — consider a path relative to the notebook instead.
data = np.loadtxt(r"C:\Users\Saideep\Downloads\data.csv",delimiter = ",")

In [3]:
# Column 0 is used as the input and column 1 as the target.
# X is kept as a 1-D array (no reshape): this notebook implements single-feature
# linear regression with plain element-wise arithmetic, so a 1-D array is enough.
X = data[:,0]
Y = data[:,1]
# Display the shape of X as the cell output.
X.shape

(100,)

# Splitting the data into train and test

In [4]:
from sklearn import model_selection

In [5]:
# Hold out part of the data for testing. random_state pins the shuffle so the split --
# and therefore the fitted parameters, scores and costs computed from it -- is the same
# on every Restart & Run All.
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,random_state=0)

# Completing fit, predict, score using the formulas

In [6]:
## fit
def fit(x_train,y_train):
    """
    Fit the line y = m*x + c to single-feature data by ordinary least squares.

    Setting the derivatives of the squared-error cost with respect to m and c
    to zero gives the closed-form solution

        m = mean((x - mean(x)) * (y - mean(y))) / mean((x - mean(x))**2)
        c = mean(y) - m * mean(x)

    Parameters
    ----------
    x_train : array-like
        Input values (NumPy array, list, ...); converted to a float array.
    y_train : array-like
        Target values, one per input value.

    Returns
    -------
    (m, c)
        Slope and intercept of the fitted line.

    Notes
    -----
    If all values of x_train are identical the denominator is (numerically) zero
    and the returned slope is not meaningful; this is not checked here.
    """
    x_train = np.asarray(x_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)

    x_mean = x_train.mean()
    y_mean = y_train.mean()

    # Centre the data before multiplying. This is algebraically the same as
    # mean(x*y) - mean(x)*mean(y) and mean(x**2) - mean(x)**2, but avoids
    # subtracting two huge, nearly equal numbers when x has a large offset.
    dx = x_train - x_mean
    dy = y_train - y_mean

    m = (dx*dy).mean() / (dx**2).mean()
    c = y_mean - m*x_mean
    return m,c

In [7]:
## predict and score
def predict(x, m, c):
    """
    Evaluate the line y = m*x + c at x.

    m is the slope and c the intercept (typically the values returned by fit,
    i.e. the ones minimising the squared error), so the result is the
    best-fit line's prediction for each value in x.
    """
    scaled = m * x
    return scaled + c

def score(y_truth, y_pred):
    """
    Coefficient of determination (R^2) of y_pred against y_truth.

    Returns 1 - u/v, where u is the sum of squared differences between the
    true and predicted values and v is the sum of squared differences between
    the true values and their mean. In other words, it compares the model's
    error with the error of always predicting the mean: 1.0 is a perfect fit,
    0.0 is no better than the mean, and negative values are worse than the mean.
    """
    residuals = y_truth - y_pred
    deviations = y_truth - y_truth.mean()
    residual_ss = (residuals**2).sum()
    total_ss = (deviations**2).sum()
    return 1 - residual_ss/total_ss

In [8]:
## Fit the line on the training split only.
m,c = fit(X_train,Y_train)
# Display the learned slope (m) and intercept (c) as the cell output.
m,c

(1.2325947652722486, 12.23861788121021)

In [9]:
# Score on the data the line was fitted to.
y_train_pred = predict(X_train,m,c)
print("Score on training data: ",score(Y_train,y_train_pred))
# Score on the held-out split, using the same m and c.
y_test_pred = predict(X_test,m,c)
print("Score on testing data: ",score(Y_test,y_test_pred))

Score on training data:  0.5618076971129533
Score on testing data:  0.6817755852745025


# Cost function

In [15]:
def cost(x, y, m, c):
    """
    Mean squared error of the line y = m*x + c on the points (x, y).

    The cost is often written as the sum ((y - (m*x + c))**2).sum(); this
    function returns the mean instead, which differs from the sum only by a
    factor of the number of points.
    """
    predicted = m*x + c
    residuals = y - predicted
    return (residuals**2).mean()

In [16]:
# Mean squared error of the fitted line (m, c) on each split.
print("Cost on training data: ",cost(X_train,Y_train,m,c))
print("Cost on testing data: ",cost(X_test,Y_test,m,c))

Cost on training data:  117.00511174050546
Cost on testing data:  93.14084852352524
