In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2> Calculates cost with regularization </h2>
<h2> Note that $ \theta_{0} $ (the intercept) is excluded from the regularization term: the second summation starts at $j = 1$ </h2>
<h1> $$ \frac{1}{2m}( \sum_{i=1}^m (h_{\theta}(x^{(i)})-y^{(i)})^2) + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2 $$ </h1>
<h3> where m = number of training examples, n = number of features, $(x^{(i)}, y^{(i)})$ is the i-th training example, and $\lambda$ is the regularization constant </h3>
<h3> $$h_{\theta}(x^{(i)}) = \theta_{0}+\theta_{1}x^{(i)}_{1}+...+\theta_{n}x^{(i)}_{n}$$ </h3>

In [None]:
def calculate_cost(theta, X, y, lbd):
    """Return the L2-regularized mean-squared-error cost of a linear model.

    Computes ``1/(2m) * sum((X @ theta - y)**2) + lbd/(2m) * sum(theta[1:]**2)``
    where ``m`` is the number of rows of ``X``. The first weight ``theta[0]``
    is left out of the penalty term.

    Parameters
    ----------
    theta : array-like holding n weights (1-D, or a row/column vector).
    X : array-like of shape (m, n).
    y : array-like holding m targets (1-D, or a column vector).
    lbd : regularization constant.

    Returns
    -------
    Scalar cost.
    """
    X = np.asarray(X)
    # Flatten theta and y so that a column-vector y (m x 1) cannot broadcast
    # against the 1-D predictions into an m x m residual matrix.
    theta = np.asarray(theta).reshape(-1)
    y = np.asarray(y).reshape(-1)

    m = X.shape[0]
    residual = np.matmul(X, theta) - y
    data_term = np.square(residual).sum() / (2 * m)
    # theta[0] is not regularized, so the penalty starts at index 1.
    penalty = (lbd / (2 * m)) * np.square(theta[1:]).sum()
    return data_term + penalty

<h2> Vectorized implementation gradient calculation together with regularization </h2>
<h2> Note that for $\theta_{0}$, regularization is not needed </h2>
<h1> $$ \frac{\partial J}{\partial \theta_{j}} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x^{(i)})-y^{(i)})x_{j}^{(i)} + \frac{\lambda}{m} \theta_{j}$$ </h1>
<h3> for j = 1,2,...,n, where n is the number of features and m is the number of training examples; for j = 0 the $\frac{\lambda}{m} \theta_{j}$ term is omitted </h3>

In [None]:
def calculate_grad(theta, X, y, lbd):
    """Return the gradient of the L2-regularized squared-error cost.

    Computes ``(1/m) * (X.T @ (X @ theta - y) + lbd * theta)`` where ``m`` is
    the number of rows of ``X``, with the penalty contribution for
    ``theta[0]`` set to zero (the first weight is not regularized).

    Parameters
    ----------
    theta : array-like holding n weights (1-D, or a row/column vector).
    X : array-like of shape (m, n).
    y : array-like holding m targets (1-D, or a column vector).
    lbd : regularization constant.

    Returns
    -------
    1-D numpy array of length n. ``theta`` itself is not modified.
    """
    X = np.asarray(X)
    # Flatten theta and y so that a column-vector y (m x 1) cannot broadcast
    # the residual into an m x m matrix and produce an (m, n) "gradient".
    theta = np.asarray(theta).reshape(-1)
    y = np.asarray(y).reshape(-1)

    m = X.shape[0]
    residual = np.matmul(X, theta) - y
    penalty = lbd * theta  # new array; safe to edit without touching theta
    penalty[0] = 0         # theta[0] is not regularized
    return (np.matmul(residual, X) + penalty) / m

In [None]:
def linear_reg(X_train, y_train, alpha, lbd, iterations, X_test=None, y_test=None, seed=None):
    """Fit linear-regression weights by batch gradient descent and plot the cost curves.

    Weights are initialised uniformly at random in [0, 1) and updated
    ``iterations`` times with ``theta -= alpha * calculate_grad(...)``.
    The training cost (and the test cost, when both ``X_test`` and ``y_test``
    are given) is recorded at every iteration and drawn on the current
    matplotlib figure.

    Parameters
    ----------
    X_train : array of shape (m, n); ``X_train.shape[1]`` sets the number of weights.
    y_train : training targets, passed through to ``calculate_cost`` / ``calculate_grad``.
    alpha : learning rate.
    lbd : regularization constant.
    iterations : number of gradient-descent steps.
    X_test, y_test : optional held-out data; only used to record and plot a second cost curve.
    seed : optional int. When given, the initial weights come from a dedicated
        ``np.random.RandomState(seed)`` so runs are reproducible. When ``None``
        (the default) the global ``np.random`` state is used, as before.

    Returns
    -------
    1-D numpy array with the fitted weights.
    """
    has_test = X_test is not None and y_test is not None

    rng = np.random if seed is None else np.random.RandomState(seed)
    theta = rng.rand(X_train.shape[1])  # random initial weights

    costs_train = []
    costs_test = []
    for _ in range(iterations):
        # Record both costs for the SAME weights (before this step's update);
        # otherwise the test curve would be shifted one step ahead of the train curve.
        costs_train.append(calculate_cost(theta, X_train, y_train, lbd))
        if has_test:
            costs_test.append(calculate_cost(theta, X_test, y_test, lbd))
        theta -= alpha * calculate_grad(theta, X_train, y_train, lbd)

    x_graph = np.arange(iterations)
    plt.plot(x_graph, costs_train, label='train')
    if has_test:
        plt.plot(x_graph, costs_test, label='test')
    plt.xlabel('iteration')
    plt.ylabel('regularized cost')
    plt.legend()
    return theta

In [None]:
def predict(X, theta):
    """Return the linear-model output: the matrix product of ``X`` with the transposed weights."""
    weights = theta.T
    return np.matmul(X, weights)

In [None]:
def plot_cost(X, y, theta):
    """Scatter the targets against column 1 of ``X`` and overlay the model's predictions.

    Despite its name, this draws the fitted predictions, not the cost.

    Parameters
    ----------
    X : 2-D numpy array; column 1 is used as the x-axis.
    y : targets, one per row of ``X``.
    theta : weights passed to ``predict``.
    """
    feature = X[:, 1]
    pred = predict(X, theta)
    # Draw the line through points ordered by x; plotting in row order makes
    # the line zig-zag back and forth across the figure when X is unsorted.
    order = np.argsort(feature)
    plt.scatter(feature, y)
    plt.plot(feature[order], pred[order])

In [None]:
def normalize(X, mean, std):
    """Standardize ``X``: subtract ``mean``, then divide by ``std``."""
    centered = X - mean
    return centered / std

<h2> Applying linear regression model on data from kaggle </h2>

In [None]:
# Load the real-estate dataset from the Kaggle input directory.
df = pd.read_csv(
    "/kaggle/input/real-estate-price-prediction/Real estate.csv",
    index_col=None,
)
display(df)
# Per-column count of missing values (check for NaNs).
display(df.isna().sum())

In [None]:
# Hold out 10% of the rows for testing; random_state is the seed that makes the split repeatable.
train = df.sample(frac=0.9, random_state=0)
test = df.drop(index=train.index)

# Separate the target column from the feature columns.
target_col = "Y house price of unit area"
train_x, train_y = train.drop(columns=target_col), train[target_col]
test_x, test_y = test.drop(columns=target_col), test[target_col]

# Standardize both splits with statistics computed on the training split only.
train_mean = train_x.mean(axis=0)
train_std = train_x.std(axis=0)
train_x = normalize(train_x, train_mean, train_std)
test_x = normalize(test_x, train_mean, train_std)

# Prepend a constant column of ones so the first weight acts as the feature-independent term.
train_x.insert(loc=0, column='One', value=1)
test_x.insert(loc=0, column='One', value=1)

In [None]:
# Train with gradient descent; keyword arguments make each hyper-parameter explicit.
theta = linear_reg(
    train_x.values,
    train_y.values,
    alpha=0.05,       # learning rate
    lbd=0.001,        # regularization constant
    iterations=200,   # gradient-descent steps
    X_test=test_x.values,
    y_test=test_y.values,
)
theta

In [None]:
# Evaluate the gradient-descent model on the held-out rows.
pred = predict(test_x.values, theta)
# sklearn metrics take (y_true, y_pred); keep the same order as the r2_score call.
print('MAE : ', mean_absolute_error(test_y, pred))
# Spread of the test targets, printed to put the MAE into context.
print('Target range : ', test_y.max() - test_y.min())
print('r2 Score : ', r2_score(test_y, pred))

<h3> Using sklearn linear regression library </h3>

In [None]:
from sklearn.linear_model import LinearRegression

# Reference fit with scikit-learn on the same standardized features.
# NOTE: train_x already carries the constant 'One' column inserted during
# preprocessing; LinearRegression fits its own intercept by default, so that
# column is redundant for this model.
reg = LinearRegression()
reg.fit(train_x.values, train_y.values)
# Predict on a plain array, matching what the model was fitted on. Passing the
# DataFrame here makes sklearn warn about feature names missing at fit time.
y_pred = reg.predict(test_x.values)
print('r2 Score : ', r2_score(test_y, y_pred))