In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### SGD Implementation Reference:
https://towardsdatascience.com/implementing-sgd-from-scratch-d425db18a72c

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import sys
import matplotlib

### Data

CRIM: per capita crime rate by town

ZN: proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS: proportion of non-retail business acres per town

CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)

NOX: nitric oxides concentration (parts per 10 million)

RM: average number of rooms per dwelling

AGE: proportion of owner-occupied units built prior to 1940

DIS: weighted distances to five Boston employment centers

RAD: index of accessibility to radial highways

TAX: full-value property-tax rate per $10,000

PTRATIO: pupil-teacher ratio by town

B: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of Black residents by town

LSTAT: % lower status of the population

MEDV: Median value of owner-occupied homes in $1000s (loaded below under the column name `PRICE`)

We can see that the input attributes have a mixture of units.

In [None]:
path = '/kaggle/input/boston-house-prices/housing.csv'
# Column labels are supplied explicitly via names=, so pandas does not read a
# header row from the file.
header_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE']
# Fields are separated by runs of whitespace. sep=r'\s+' is the supported
# spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(path, names=header_names, sep=r'\s+')
df.shape

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# Per-column summary statistics, useful for spotting the differing scales of the features.
df.describe()

In [None]:
# Feature matrix: label-based slice from 'ZN' through 'LSTAT' (both ends inclusive).
# NOTE(review): the slice starts at 'ZN', so 'CRIM' is not included in the
# features — confirm this is intentional; later cells assume this column set.
x = df.loc[:, 'ZN':'LSTAT']
# Target vector: the 'PRICE' column.
y = df.loc[:, 'PRICE']
x.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [None]:
from sklearn import preprocessing
# Standardize the features. The scaler is fitted on the training split only and
# then reused for the test split, so no test-set statistics leak into training.
scalar = preprocessing.StandardScaler()
# Take the column labels from the frames being scaled rather than from a slice of
# header_names, so the labels always match the data whatever features were selected.
X_train = pd.DataFrame(scalar.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scalar.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Display the training features to inspect the result of the previous step.
X_train

In [None]:
# Append the target as a new last column named 'price' on both splits, so each
# frame carries features and target together. my_sgd and predict read the last
# column as the target. Assignment aligns y_* to the frame by row index.
X_train['price'] = y_train
X_test['price'] = y_test
X_train.shape, X_test.shape

In [None]:
# Display the training frame again, now that the 'price' column has been added.
X_train

### Stochastic Gradient Descent

In [None]:
from sklearn.metrics import mean_squared_error

def my_sgd(X_train, learning_rate=0.2, n_epochs=1000, sample_size=40,
           random_state=None, verbose=True):
    """Fit a linear model ``x . w + b`` with mini-batch stochastic gradient descent.

    Each epoch draws ``sample_size`` rows at random (without replacement) and
    updates ``w`` and ``b`` once per drawn row, using that row's squared-error
    gradient scaled by ``1 / sample_size``. The learning rate is divided by 1.02
    after every epoch.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training frame whose LAST column is the target; all other columns are
        used as features.
    learning_rate : float, default 0.2
        Initial step size.
    n_epochs : int, default 1000
        Number of mini-batches to process.
    sample_size : int, default 40
        Rows per mini-batch. Must be at least 1 and no larger than ``len(X_train)``.
    random_state : int or None, default None
        Seed for the initial weights and the mini-batch sampling. With ``None``
        NumPy's global random state is used.
    verbose : bool, default True
        If true, print the mini-batch loss after every epoch.

    Returns
    -------
    w : numpy.ndarray of shape (n_features,)
        Learned weights.
    b : numpy.ndarray of shape (1,)
        Learned bias.

    Raises
    ------
    ValueError
        If ``sample_size`` is smaller than 1, or (raised by pandas) larger than
        the number of rows in ``X_train``.
    """
    if sample_size < 1:
        raise ValueError('sample_size must be at least 1')

    columns = X_train.columns.values
    n_features = X_train.shape[1] - 1  # every column except the trailing target

    # Without a seed, draw from NumPy's global RNG. With a seed, use one private
    # RandomState for both the initial weights and the row sampling, so a run is
    # fully repeatable.
    if random_state is None:
        rng = np.random
        sampler_state = None
    else:
        rng = np.random.RandomState(random_state)
        sampler_state = rng

    w = rng.randn(n_features)
    b = rng.randn(1)

    for epoch in range(n_epochs):
        # Draw this epoch's mini-batch.
        sample = X_train.sample(sample_size, random_state=sampler_state)
        x = sample.loc[:, columns[:-1]].values
        y = sample.loc[:, columns[-1]].values

        y_pred = []
        for i in range(sample_size):
            # Residual of the current model on this row.
            error = y[i] - (np.dot(x[i], w) + b)

            # Gradient of the squared error w.r.t. w and b, averaged over the batch size.
            lw = (-2 * x[i] * error) / sample_size
            lb = (-2 * error) / sample_size

            w = w - learning_rate * lw
            b = b - learning_rate * lb

            # Prediction with the freshly updated parameters. The bias must be
            # included: the model is x . w + b, so leaving it out would make the
            # reported loss disagree with the model being trained.
            y_pred.append((np.dot(x[i], w) + b).item())

        # Mean squared error of the mini-batch (true values vs. predictions).
        loss = np.mean((y - np.asarray(y_pred)) ** 2)

        if verbose:
            print(f'epoch: {epoch}, loss: {loss:.03f}')

        # Learning-rate decay.
        learning_rate /= 1.02

    return w, b

In [None]:
# Seed NumPy's global RNG so that the random initial weights and the mini-batch
# sampling inside my_sgd are repeatable across "Restart & Run All".
np.random.seed(0)
w, b = my_sgd(X_train)

In [None]:
def predict(X, w, b):
    """Return linear-model predictions ``x . w + b`` for every row of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose LAST column is the target; it is ignored here and every
        other column is used as a feature.
    w : numpy.ndarray of shape (n_features,)
        Model weights.
    b : float or numpy.ndarray of shape (1,)
        Model bias.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        One prediction per row of ``X``, in row order.
    """
    # Extract the feature matrix once and predict all rows in a single matrix
    # product. This also avoids np.asscalar, which no longer exists in current
    # NumPy releases.
    features = X.iloc[:, :-1].to_numpy(dtype=float)
    return np.asarray(np.dot(features, w) + b, dtype=float).reshape(-1)

# Predict prices for the held-out rows using the learned weights and bias.
y_pred_test = predict(X_test, w, b)
y_pred_test

In [None]:
# Ground-truth values: the target is stored as the last column of X_test.
y_test_actual = X_test.iloc[:, -1].to_numpy()
y_test_actual

In [None]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

# Overlay actual and predicted prices, one point per test row, so the fit can be
# judged visually. Title and axis labels let the figure stand on its own.
fig, ax = plt.subplots(figsize=(25, 6))
ax.plot(y_test_actual, label='Actual')
ax.plot(y_pred_test, label='Predicted')
ax.set_title('Actual vs. predicted house prices (test set)')
ax.set_xlabel('Test sample (row position)')
ax.set_ylabel('Price (in 1000s of USD)')
ax.legend(prop={'size': 16})
plt.show()
print('Mean Squared Error :', mean_squared_error(y_test_actual, y_pred_test))