In [None]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

### Data

In [None]:
# Build the dataset path with os.path.join rather than concatenating a
# hard-coded '/' separator, so the location is assembled portably.
path = os.path.join(os.getcwd(), 'data', 'ex1data2.txt')
# The file has no header row, so column names are supplied explicitly.
data2 = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()

### Normalization and Data Engineering

In [None]:
# Standardise every column: subtract the column mean, then divide by the
# column standard deviation. This is applied to 'Price' as well, so the
# target is expressed in standardised units from here on.
data2 = data2.sub(data2.mean()).div(data2.std())
data2.head()

In [None]:
# Add the intercept ("Ones") column. The guard makes the cell safe to re-run:
# DataFrame.insert raises ValueError if the column already exists.
if 'Ones' not in data2.columns:
    data2.insert(0, 'Ones', 1)

# X (training data) is every column except the last; y (target variable) is
# the last column, kept two-dimensional by slicing rather than indexing.
cols = data2.shape[1]
X = data2.iloc[:,0:cols-1]
y = data2.iloc[:,cols-1:cols]

# Convert to np.matrix so that `*` below means matrix multiplication.
X = np.matrix(X.values)
y = np.matrix(y.values)
# One zero-initialised weight per column of X (including the intercept),
# sized from X instead of being hard-coded to three entries.
theta = np.matrix(np.zeros((1, X.shape[1])))

In [None]:
# Preview the frame again; when cells are run in order it now includes the
# 'Ones' intercept column as its first column.
data2.head()

## Linear Model 

### Cost

In [None]:
def computeCost(X, y, theta):
    """Return the halved mean squared error of a linear model.

    X, y and theta are combined as ``X * theta.T - y``, so they are expected
    to be ``np.matrix`` objects for which ``*`` is matrix multiplication
    (X: one row per sample, theta: a single row of weights, y: one column).

    The result is ``sum(residual ** 2) / (2 * number_of_rows_in_X)``.
    """
    residuals = (X * theta.T) - y
    squared_residuals = np.power(residuals, 2)
    sample_count = len(X)
    return np.sum(squared_residuals) / (2 * sample_count)

In [None]:
# Cost for the current theta (all zeros when the cells are run in order),
# i.e. the baseline before any training.
computeCost(X,y,theta)

### Gradient Descent

In [None]:
def gradientDescent(X,y,theta,alpha,iters):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    X : np.matrix
        Design matrix, one row per sample and one column per weight.
    y : np.matrix
        Column of targets with the same number of rows as ``X``.
    theta : np.matrix
        Single row of starting weights, one per column of ``X``.
        It is not modified; a new matrix is produced on every step.
    alpha : float
        Learning rate.
    iters : int
        Number of update steps to perform.

    Returns
    -------
    theta : np.matrix
        Weights after ``iters`` updates (the input object itself when
        ``iters`` is 0).
    cost : np.ndarray
        ``cost[i]`` is ``computeCost(X, y, theta)`` after update ``i + 1``.

    Notes
    -----
    The inputs must be ``np.matrix`` objects because ``*`` is used as
    matrix multiplication, as in ``computeCost``.
    """
    m = len(X)
    cost = np.zeros(iters)
    step = alpha / m
    for i in range(iters):
        # Residuals are computed once per iteration from the current theta,
        # so every weight is updated simultaneously.
        error = X * theta.T - y
        # error.T * X is the row vector whose j-th entry is
        # sum(error * X[:, j]): the whole gradient in one product. Rebinding
        # theta to a fresh matrix avoids updating a shared buffer in place.
        theta = theta - step * (error.T * X)
        cost[i] = computeCost(X, y, theta)
    return theta, cost

### Model Fitting and Evaluation

In [None]:
# Learning rate and number of gradient-descent iterations.
alpha = 0.01
iters = 1500
# `theta` is both the input and the output of this cell. Starting explicitly
# from zeros of the same shape keeps the cell reproducible: re-running it
# refits from scratch instead of continuing from the previous fit.
theta,cost = gradientDescent(X,y,np.zeros_like(theta),alpha,iters)

In [None]:
# Plot the cost recorded after each gradient-descent iteration.
fig, ax = plt.subplots(figsize=(15,8))
# The x-axis is derived from `cost` itself, so the plot stays consistent even
# if `iters` was edited after the model was trained.
ax.plot(np.arange(len(cost)), cost, 'r')

ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
# The trailing ';' suppresses the Text(...) repr in the cell output.
ax.set_title('Error vs. Training Epoch');

# Scikit-Learn Implementation

In [None]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Re-inspect the frame before building the scikit-learn inputs; when cells
# are run in order it is standardised and contains the 'Ones' column.
data2.head()

In [None]:
# Feature matrix for scikit-learn: only the two real features. The 'Ones'
# column is left out.
X_1 = data2.loc[:, ['Size', 'Bedrooms']].values
# Target as a two-dimensional column (one row per sample, one column).
y_1 = data2.loc[:, ['Price']].values

In [None]:
# Show the first five feature rows and the first five targets as a tuple.
X_1[0:5],y_1[0:5]

In [None]:
# Splitting into test and training sets: 70% train / 30% test, with a fixed
# random_state so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    y_1,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Print the shapes of the four split arrays on one line, in the order
# X_train, X_test, y_train, y_test.
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

In [None]:
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. It is dropped here:
# the features were already standardised in the normalisation step earlier in
# this notebook.
regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)

In [None]:
# Predict targets for the held-out test features with the fitted model.
y_pred = regr.predict(X_test)

In [None]:
# The fitted coefficients, one per feature column of X_train
# ('Size', 'Bedrooms' when cells are run in order).
print('Coefficients: \n', regr.coef_)

In [None]:
# Mean squared error of the predictions on the test split, shown to two
# decimal places.
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))


In [None]:
# R^2 (coefficient of determination) on the test split; 1 is perfect
# prediction. Shown to two decimal places.
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))