# Implementation of Basic Machine Learning Algorithms on Ames Housing Dataset (Regression Task)

This notebook fits a Linear Regression model to the Ames Housing dataset and compares how the training cost evolves for different learning rates. The dataset is available from Kaggle: <https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from LinearRegression import LinearRegression

In [3]:
def plot_curve(Xlist, Ylist, title, xlabel, ylabel, plotlabels, verbose=True):
    """Draw one labelled line per (X, Y) series on a new matplotlib figure.

    Parameters
    ----------
    Xlist : iterable of sequences
        x-values, one sequence per curve.
    Ylist : indexable of sequences
        y-values; ``Ylist[i]`` is plotted against the i-th entry of ``Xlist``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    plotlabels : indexable of str
        Legend text; ``plotlabels[i]`` labels the i-th curve.
    verbose : bool, optional
        When True (the default, which keeps the previous output) the x-values,
        y-values and label of every curve are printed before it is drawn.
        Pass False to plot silently.

    Raises
    ------
    IndexError
        If ``Ylist`` or ``plotlabels`` is a sequence with fewer entries than
        ``Xlist``.

    Notes
    -----
    ``plt.show()`` is not called here; the figure is left open for the
    notebook backend (or the caller) to render.
    """
    plt.figure()
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid()

    curves_drawn = 0
    for idx, X in enumerate(Xlist):
        Y = Ylist[idx]
        label = plotlabels[idx]
        if verbose:
            print(X)
            print(Y)
            print(label)
        plt.plot(X, Y, 'o-', label=label)
        curves_drawn += 1

    # The curves carry labels, but they only become visible once a legend is
    # drawn. Skip it for an empty figure so matplotlib does not warn about
    # having no labelled artists.
    if curves_drawn:
        plt.legend()

def load_dataset(train_path='datasets/train.csv', test_path='datasets/test.csv'):
    """Read the training and test CSV files into DataFrames.

    Parameters
    ----------
    train_path : str, optional
        Location of the training CSV. Defaults to ``'datasets/train.csv'``.
    test_path : str, optional
        Location of the test CSV. Defaults to ``'datasets/test.csv'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_initial, test_initial)`` exactly as parsed by
        ``pandas.read_csv`` with its default options.
    """
    train_initial = pd.read_csv(train_path)
    test_initial = pd.read_csv(test_path)
    return train_initial, test_initial

# ---------------------------------------------------------------------------
# Load the raw data
# ---------------------------------------------------------------------------
train_initial, test_initial = load_dataset()
print("initial train data size", train_initial.shape)

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
REFERENCE_YEAR = 2018   # year columns are re-expressed as REFERENCE_YEAR - year
PRICE_SCALE = 100000    # SalePrice is divided by this before fitting

train_2 = train_initial.copy()

# ignore Id; ignore MasVnrArea and GarageYrBlt too for now as no easy way of handling missing values
train_2 = train_2.drop(['Id', 'MasVnrArea', 'GarageYrBlt'], axis=1)

# transform year variables to be, 'years since REFERENCE_YEAR'
year_columns = ['YearBuilt', 'YearRemodAdd', 'YrSold']
train_2[year_columns] = REFERENCE_YEAR - train_2[year_columns]

# Columns kept as numeric. 'MasVnrArea' and 'GarageYrBlt' are still listed even
# though they were dropped above; that is harmless because the list is only
# used to work out which of the *remaining* columns are non-numeric.
numeric_features = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold', 'SalePrice']

# every remaining column that is not in numeric_features is one-hot encoded
non_numeric_columns = np.setdiff1d(train_2.columns.values, numeric_features)

# create dummy variables for categorical variables
# TODO: ideally would want to create these based on a list of possible feature values,
# otherwise will have to combine with test data and then create dummy variables
# as some feature values might be present in one set but not the other sample
train_2 = pd.get_dummies(train_2, columns=non_numeric_columns, drop_first=True, dummy_na=True)
print("updated training data size", train_2.shape)

# interaction variable for MiscVal based on type of MiscFeature:
# each MiscFeature dummy column is multiplied row-wise by MiscVal
miscfeatures = [col for col in train_2.columns.values if col.startswith('MiscFeature')]
train_2[miscfeatures] = train_2[miscfeatures].multiply(train_2['MiscVal'], axis="index")

# Fill the remaining missing values with the column mean. This does not depend
# on the learning rate, so it is done once here rather than on every run.
train_2 = train_2.fillna(train_2.mean())

# Design matrix and scaled target, shared by all runs below.
# NOTE(review): assumes LinearRegression.fit does not modify its inputs in
# place; if it does, rebuild these arrays inside the loop - confirm.
train_X = train_2.drop(['SalePrice'], axis=1).values
train_y = train_2[['SalePrice']].values / PRICE_SCALE

# ---------------------------------------------------------------------------
# Fit once per learning rate and record the cost history
# ---------------------------------------------------------------------------
learning_rate = [0.000000001, 0.000000005]
max_iter = 1000
iteration_threshold = 100

# One row per learning rate. The width matches the history length the original
# code assumed: one entry every `iteration_threshold` iterations plus the
# starting point.
n_checkpoints = max_iter // iteration_threshold + 1
cost_by_lr = np.empty((len(learning_rate), n_checkpoints))
iterations = np.empty((len(learning_rate), n_checkpoints))
plotlabels = []

for run_idx, lr in enumerate(learning_rate):
    estimator_linReg = LinearRegression(learning_rate=lr, reg_strength=0, regularization="Ridge",
                                        max_iter=max_iter, gd_threshold=None,
                                        iteration_threshold=iteration_threshold)

    # echo the design-matrix and target shapes for each run
    print(train_X.shape)
    print(train_y.shape)

    estimator_linReg.fit(train_X, train_y)

    cost_by_lr[run_idx] = estimator_linReg.cost_by_iteration
    iterations[run_idx] = estimator_linReg.iterations
    plotlabels.append("Learning rate = "+str(lr))

plot_curve(Ylist=cost_by_lr, Xlist=iterations, title="Curve for cost v/s iteration by learning rate",
           xlabel="Number of iterations", ylabel="Cost", plotlabels=plotlabels)

print("Done")

initial train data size (1460, 81)
updated training data size (1460, 329)
(1460, 328)
(1460, 1)
(1460, 328)
(1460, 1)
[    0.   100.   200.   300.   400.   500.   600.   700.   800.   900.
  1000.]
[ 1.39712683  0.74986931  0.61880285  0.5258194   0.46182226  0.41914873
  0.39149702  0.37396735  0.36299977  0.35616317  0.35187533]
Learning rate = 1e-09
[    0.   100.   200.   300.   400.   500.   600.   700.   800.   900.
  1000.]
[ 1.39712683  0.41826198  0.35172074  0.34455182  0.3424598   0.34098679
  0.33975176  0.3386846   0.33774768  0.33691335  0.33616053]
Learning rate = 5e-09
Done
