In [None]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

# Pretty display for notebooks
%matplotlib inline

# Read the Boston housing CSV, then separate the target column ('MEDV')
# from the remaining columns, which are used as model inputs.
data = pd.read_csv('../input/bostonhoustingmlnd/housing.csv')
features = data.drop(columns = 'MEDV')
prices = data['MEDV']

# Report the dimensions of the loaded frame (rows, columns)
print("Boston housing dataset has {} data points with {} variables each.".format(data.shape[0], data.shape[1]))

In [None]:
###########################################
# Silence UserWarning messages raised from within the matplotlib package
# (the filter matches on warning category and originating module).
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Programmatic equivalent of the `%matplotlib inline` line magic:
# ask the running IPython shell to render figures inline.
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as pl
import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ShuffleSplit, train_test_split

def ModelLearning(X, y):
    """ Plots learning curves for decision trees of several maximum depths.

        For each depth in [1, 3, 6, 10] a DecisionTreeRegressor is evaluated
        with `learning_curve` (R^2 scoring) over ten shuffled 80/20 splits, and
        the mean training/testing score (+/- one standard deviation) is drawn
        in its own panel of a 2x2 figure.

        Parameters:
            X: feature matrix; only `X.shape[0]` is read directly here, the
               rest is passed through to scikit-learn.
            y: target values, passed through to scikit-learn.

        Returns:
            None. The figure is displayed as a side effect. """

    # Ten shuffled train/test partitions (20% held out), seeded for repeatability
    cv = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

    # Nine training-set sizes, evenly spaced from 1 up to one less than
    # 80% of the available samples, rounded to whole numbers
    train_sizes = np.rint(np.linspace(1, X.shape[0]*0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10,7))

    # One panel per tree depth (four models in total)
    for k, depth in enumerate([1,3,6,10]):

        # Create a Decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth = depth)

        # Score the model on each training-set size across all CV splits
        sizes, train_scores, test_scores = learning_curve(regressor, X, y, \
            cv = cv, train_sizes = train_sizes, scoring = 'r2')

        # Average over the CV splits; the spread is used for the shaded band
        train_std = np.std(train_scores, axis = 1)
        train_mean = np.mean(train_scores, axis = 1)
        test_std = np.std(test_scores, axis = 1)
        test_mean = np.mean(test_scores, axis = 1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k+1)
        ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score')
        ax.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score')
        ax.fill_between(sizes, train_mean - train_std, \
            train_mean + train_std, alpha = 0.15, color = 'r')
        ax.fill_between(sizes, test_mean - test_std, \
            test_mean + test_std, alpha = 0.15, color = 'g')

        # Labels
        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0]*0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics: a single legend anchored relative to the last panel
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad = 0.)
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03)
    fig.tight_layout()
    # pyplot.show() works on inline/non-GUI backends, whereas Figure.show()
    # only emits a UserWarning there; this also matches ModelComplexity.
    pl.show()


def ModelComplexity(X, y):
    """ Plots a validation curve for a decision tree as max_depth grows.

        A DecisionTreeRegressor is scored (R^2) with `validation_curve` for
        max_depth = 1..10 over ten shuffled 80/20 splits; the mean training
        and validation scores are drawn with a +/- one standard deviation band.
        Nothing is returned; the figure is shown as a side effect. """

    # Depths to sweep: 1 through 10 inclusive
    max_depth = np.arange(1,11)

    # Ten shuffled train/test partitions (20% held out), seeded
    splitter = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0)

    # Rows of each score matrix correspond to depths, columns to CV splits
    train_scores, test_scores = validation_curve(
        DecisionTreeRegressor(), X, y,
        param_name = "max_depth", param_range = max_depth,
        cv = splitter, scoring = 'r2')

    # Per-depth mean and spread across the CV splits
    curves = []
    for scores, colour, label in ((train_scores, 'r', 'Training Score'),
                                  (test_scores, 'g', 'Validation Score')):
        curves.append((np.mean(scores, axis=1), np.std(scores, axis=1), colour, label))

    # Draw both mean lines first, then both shaded bands
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    for mean, _, colour, label in curves:
        pl.plot(max_depth, mean, 'o-', color = colour, label = label)
    for mean, std, colour, _ in curves:
        pl.fill_between(max_depth, mean - std, mean + std, alpha = 0.15, color = colour)

    # Axis cosmetics
    pl.legend(loc = 'lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05,1.05])
    pl.show()


def PredictTrials(X, y, fitter, data):
    """ Refits a model on ten different train/test splits and prints the
        prediction each fit makes for the first record in `data`.

        `fitter` is called as fitter(X_train, y_train) and must return an
        object with a `predict` method. After the ten trials the spread
        (max - min) of the predictions is printed. Returns None. """

    # Predictions collected across trials
    predicted = []

    for trial in range(1, 11):
        # 80/20 split; the seed is the zero-based trial number
        X_train, _, y_train, _ = train_test_split(
            X, y, test_size = 0.2, random_state = trial - 1)

        # Fit on this split's training portion
        model = fitter(X_train, y_train)

        # Predict for the first record only
        estimate = model.predict([data[0]])[0]
        predicted.append(estimate)

        print("Trial {}: ${:,.2f}".format(trial, estimate))

    # Spread between the highest and lowest prediction
    spread = max(predicted) - min(predicted)
    print("\nRange in prices: ${:,.2f}".format(spread))

In [None]:
# Descriptive statistics of the target column `prices`, computed with NumPy
# reductions. np.std is called with its default ddof (0), i.e. the
# population standard deviation.
minimum_price = np.min(prices)
maximum_price = np.max(prices)
mean_price = np.mean(prices)
median_price = np.median(prices)
std_price = np.std(prices)

# Report the statistics; every label uses the same "<name>: $<value>" layout
# (the median line previously lacked its colon).
print("Statistics for Boston housing dataset:\n")
print("Minimum price: ${:,.2f}".format(minimum_price))
print("Maximum price: ${:,.2f}".format(maximum_price))
print("Mean price: ${:,.2f}".format(mean_price))
print("Median price: ${:,.2f}".format(median_price))
print("Standard deviation of prices: ${:,.2f}".format(std_price))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot colours, indexed by position in the feature loops that follow
# (those loops iterate over 'RM', 'LSTAT', 'PTRATIO' in that order).
clr = ['blue', 'green', 'red']

In [None]:
# Distribution of each feature, one panel per feature, with the mean drawn
# as a solid vertical line and the median as a dashed one.
fig, axs = plt.subplots(ncols=3,figsize=(15,3))

# Draw on the axes created above instead of re-selecting "figure 1" and
# re-creating subplots through the pyplot state machine: that approach only
# works when no other figure happens to be open.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because the rest of the notebook targets the older API.
for i, var in enumerate(['RM', 'LSTAT', 'PTRATIO']):
    ax = axs[i]
    sns.distplot(data[var], color = clr[i], ax = ax)
    ax.axvline(data[var].mean(), color=clr[i], linestyle='solid', linewidth=2)
    ax.axvline(data[var].median(), color=clr[i], linestyle='dashed', linewidth=2)

In [None]:
# Same distribution plots as before, but the second and third features
# ('LSTAT', 'PTRATIO') are log-transformed first; 'RM' is plotted as-is.
fig, axs = plt.subplots(ncols=3,figsize=(15,3))

# Draw on the axes created above rather than via plt.figure(1)/plt.subplot,
# which depend on global figure numbering.
for i, var in enumerate(['RM', 'LSTAT', 'PTRATIO']):
    ax = axs[i]
    # Transform once and reuse for the histogram and both marker lines
    values = data[var] if i == 0 else np.log(data[var])
    sns.distplot(values, color = clr[i], ax = ax)
    ax.axvline(values.mean(), color=clr[i], linestyle='solid', linewidth=2)
    ax.axvline(values.median(), color=clr[i], linestyle='dashed', linewidth=2)

In [None]:
# Scatter plot with a fitted regression line of price against each feature.
fig, axs = plt.subplots(ncols=3,figsize=(15,3))

for i, var in enumerate(['RM', 'LSTAT', 'PTRATIO']):
    # x/y are passed by keyword: positional x/y is deprecated in seaborn and
    # rejected by newer releases, while keywords work across versions.
    lm = sns.regplot(x = data[var], y = prices, ax = axs[i], color=clr[i])
    # Anchor the y axis at zero and let the upper limit auto-scale
    lm.set(ylim=(0, None))

In [None]:
# Scatter plot with a fitted regression line of price against the natural
# log of each feature.
fig, axs = plt.subplots(ncols=3,figsize=(15,3))

for i, var in enumerate(['RM', 'LSTAT', 'PTRATIO']):
    # x/y are passed by keyword: positional x/y is deprecated in seaborn and
    # rejected by newer releases, while keywords work across versions.
    lm = sns.regplot(x = np.log(data[var]), y = prices, ax = axs[i], color=clr[i])
    # Anchor the y axis at zero and let the upper limit auto-scale
    lm.set(ylim=(0, None))

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn with square cells
sns.heatmap(data = data.corr(), annot = True, square = True)

In [None]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Scores predictions against true values with the coefficient of
        determination (R^2), as computed by sklearn's `r2_score`.

        Returns the score produced by `r2_score(y_true, y_predict)`. """
    return r2_score(y_true, y_predict)

In [None]:
# Try the metric on a small hand-written example of five true/predicted pairs
score = performance_metric(
    y_true = [3, -0.5, 2, 7, 4.2],
    y_predict = [2.5, 0.0, 2.1, 7.8, 5.3],
)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

In [None]:
# Plot the toy true/predicted pairs used in the R^2 demo.
# Build the frame from named columns: the previous
# DataFrame(values, index).reset_index() form placed the predictions in the
# first column and the true values in the second, so the labels were swapped.
sample_df = pd.DataFrame({
    'True Value': [3, -0.5, 2, 7, 4.2],
    'Prediction': [2.5, 0.0, 2.1, 7.8, 5.3],
})
# Keyword arguments: positional x/y/data is rejected by newer seaborn releases
sns.regplot(x = 'True Value', y = 'Prediction', data = sample_df)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and split features/prices into training and testing subsets.
# test_size is an integer here, which scikit-learn interprets as an absolute
# number of test samples (30 rows), not a fraction.
# NOTE(review): the helper functions earlier in the notebook hold out 20%;
# confirm that a fixed 30-row test set is intended.
(X_train, X_test,
 y_train, y_test) = train_test_split(features, prices, random_state=0, test_size=30)

# Success
print("Training and testing split was successful.")

In [None]:
# Disabled experiment: the definitions below sit inside a bare string literal,
# so nothing here is executed and forward/backward/sigmoid are NOT defined.
'''def forward(x, w):
    out = x.T * w
    return out

def backward(x, y, dout):
    dx = dout * y
    dy = dout * x.T
    return dx, dy
def sigmoid(x):
    return 1/(1+np.exp(-x))'''

In [None]:
# Disabled experiment: a forward pass kept as a bare string literal; none of
# the statements inside it run.
'''#X_train, X_test, y_train, y_test
Ln = 1.1
Zero2One = forward(X_test, y_test)
#Zero2One = sigmoid(Zero2One)
print(Zero2One)
One2Two = forward(Zero2One, Ln)
#One2Two = sigmoid(One2Two)
print(One2Two)'''

In [None]:
# Disabled experiment: a backward pass kept as a bare string literal; none of
# the statements inside it run.
'''# back propagation
dOne2Two = 1
dZero2One, dLn = backward(X_test, y_test, dOne2Two)
dX_test, dy_test = backward(X_test, y_test, dZero2One)

print(dX_test)
print()
print(dy_test)
print()
print(dLn)'''

In [None]:
from tqdm.notebook import tqdm

# Batch gradient descent for the linear model
#     prediction = x1*W1 + x2*W2 + x3*W3 + b
# minimising mean squared error.

# Work on plain NumPy arrays so columns can be addressed by position
X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array(y_train)
y_test=np.array(y_test)

W1 = W2 = W3 = b = 0.0

# NOTE(review): the fit uses the X_test/y_test split, as the original cell
# did; confirm whether the training split was intended instead.
n_data = len(X_test)
epochs = 150
learning_rate = 0.01

# The first three feature columns, one per weight
x1, x2, x3 = X_test[:, 0], X_test[:, 1], X_test[:, 2]

# NOTE(review): the features are not rescaled before descent; with inputs of
# large magnitude this step size may be too big to converge - verify the
# printed weights are finite and sensible.
for i in tqdm(range(epochs)):
    # Residual of the full model against every target value. Previously each
    # weight's gradient used a one-feature prediction and a single scalar
    # target (y_test.T[j] on a 1-D array is just element j).
    residual = x1 * W1 + x2 * W2 + x3 * W3 + b - y_test

    # d(MSE)/d(param) = mean(2 * residual * d(prediction)/d(param))
    gradient_w1 = np.sum(residual * 2 * x1) / n_data
    gradient_w2 = np.sum(residual * 2 * x2) / n_data
    gradient_w3 = np.sum(residual * 2 * x3) / n_data
    gradient_b = np.sum(residual * 2) / n_data

    W1 -= learning_rate * gradient_w1
    W2 -= learning_rate * gradient_w2
    W3 -= learning_rate * gradient_w3
    b -= learning_rate * gradient_b

print(W1)
print(W2)
print(W3)
print(b)

In [None]:
# Predictions of the fitted linear model for X_test. The intercept b is part
# of the model being trained (see the bias gradient), so it is included here.
print(X_test.dot(np.array([W1, W2, W3])) + b)

In [None]:
# Make sure all four splits are NumPy arrays (np.array copies its input)
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

# Step size used by the per-sample update loop
lr = 0.01

def Hyp(w, b, x):
    """Linear hypothesis: the dot product of x with w, plus the bias b.

    x must provide a `.dot` method (e.g. a NumPy array); w must provide `.T`.
    """
    weighted_sum = x.dot(w.T)
    return weighted_sum + b
def MSE(w, b, x, y):
    """Mean squared error of the linear hypothesis Hyp(w, b, x) against y.

    The sum of squared residuals is scaled by 1/len(x).
    """
    residual = Hyp(w, b, x) - y
    return 1/len(x) * (residual**2).sum()
def dMSE(w, b, x, y, n_samples=None):
    """Gradient contribution of a single sample to the mean squared error.

    Parameters:
        w, b: current weights and bias of the linear hypothesis.
        x: one sample's feature vector (any length matching w).
        y: that sample's target value.
        n_samples: normaliser for the 2/N factor. Defaults to len(X_train),
            which is what this function always used before the parameter
            existed, so existing four-argument calls behave the same.

    Returns:
        A list with one partial derivative per feature weight, in feature
        order, followed by the partial derivative with respect to b.
    """
    if n_samples is None:
        # Backward-compatible default: normalise by the global training set
        n_samples = len(X_train)

    # Shared factor 2/N * (prediction - target)
    scaled_error = 2/n_samples * (Hyp(w, b, x) - y)

    # d(prediction)/d(w_j) is x_j, and d(prediction)/d(b) is 1
    derivative_terms = list(x) + [1]
    return [scaled_error * term for term in derivative_terms]

# Per-sample gradient descent for the linear model, starting from zeros
w = np.array([0.0, 0.0, 0.0])
b = 0.0

for epoch in range(2000):

    # Visit every training sample once per epoch. The bound is taken from
    # X_train directly: the previous `range(n)` referred to a name that is
    # never defined at notebook level (n is only a local inside MSE).
    for i in range(len(X_train)):
        grad = dMSE(w, b, X_train[i], y_train[i])
        # dMSE returns the weight derivatives followed by the bias derivative
        w -= lr * np.array(grad[:-1])
        b -= lr * grad[-1]

    # Training and held-out cost after this epoch
    print("%d Epoch, Cost) %6f, Cost_test) %6f"%(epoch+1, MSE(w, b, X_train, y_train), MSE(w, b, X_test, y_test)))

print(w)
print(b)

In [None]:
# Closed-form least-squares weights for a model without an intercept.
# np.linalg.lstsq solves the same problem as inv(X^T X) X^T y but avoids
# forming an explicit inverse and also copes with a rank-deficient X.
W = np.linalg.lstsq(X_train, y_train, rcond=None)[0]
print(W)
# Training cost of the closed-form solution (bias fixed at 0)
print(MSE(W, 0, X_train, y_train))
# NOTE(review): hard-coded difference of two numbers, presumably copied from
# earlier cell outputs - confirm their origin or compute them from variables.
print(12423055095.874508 - 8166308337.510345)

In [None]:
# Gradient at the current w, b for sample `i`; `i` is whatever value the
# most recently executed loop left behind.
print(dMSE(w, b, x = X_train[i], y = y_train[i]))