Question 3:Please see the house prices dataset.Use the appropriate algorithm (from question 1 or 2) to learn a model from the training set and predict the prices for the test set. 				
1.Report your average error in the prediction.

In [1]:
import pandas as pd
import numpy as np
from math import exp
from math import sqrt
from csv import reader
from sklearn.model_selection import train_test_split
import seaborn as sns
from random import seed
from random import randrange



In [2]:
# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

In [3]:
#Data with numerical feature 

def data_numerical_feature(dataset):
    pct_null = dataset.isnull().sum() / len(dataset)
    missing_features = pct_null[pct_null > 0.00].index
    dataset.drop(missing_features, axis=1, inplace=True) #droping columns with NA
    
    numerical_features=[feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
    
    ds = dataset[numerical_features]    
    ds.drop(['Id'], axis=1, inplace=True)   
    return ds

In [4]:
# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

In [5]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) \
             / (minmax[i][1] - minmax[i][0])

In [6]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [7]:
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)

In [8]:
#Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        rmse = rmse_metric(actual, predicted)
        scores.append(rmse)
    return scores

In [9]:
# Make a prediction with coefficients
def predict(row, w):
    yhat = w[-1]                          # bias is at dimnesion 0
    for i in range(len(row)-1):
        yhat += w[i] * row[i]
    return yhat

In [10]:
# Estimate linear regression coefficients using stochastic gradient descent
def weights_sgd(train, l_rate, n_epoch):
    w = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, w)
            error = row[-1]- yhat
            sum_error += error**2
            w[-1] = w[-1] + l_rate * error
            for i in range(len(row)-1):
                w[i] = w[i] + l_rate * error * row[i]
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return w

In [11]:
def linear_regression(train, test, l_rate, n_epoch):
    predictions = list()
    w = weights_sgd(train, l_rate, n_epoch)
    #print('W',w)
    for row in test:
        yhat = predict(row, w)
        predictions.append(yhat)
    return(predictions)

In [12]:
# load and prepare data
seed(1)

filename = 'dataset/house_train.csv'
df = pd.read_csv(filename)

#column with numerical values
ds = data_numerical_feature(df)

ds=ds.values
final_dataset = ds.tolist()

for i in range(len(final_dataset[0])):
    str_column_to_float(final_dataset, i)

#normalization
minmax = dataset_minmax(final_dataset)
normalize_dataset(final_dataset, minmax)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds.drop(['Id'], axis=1, inplace=True)


In [13]:
#Test Splitting training data into training set and test set
n_folds =5
l_rate = 0.01
n_epoch = 50

train_ds, test_ds = train_test_split(final_dataset, test_size = 0.3, random_state = 10)

w = weights_sgd(train_ds,l_rate,n_epoch)
#print(w)

#Predicted house sale price
print('\n')
for row in test_ds:
    yhat = predict(row, w)
    print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))

>epoch=0, lrate=0.010, error=6.461
>epoch=1, lrate=0.010, error=3.974
>epoch=2, lrate=0.010, error=3.529
>epoch=3, lrate=0.010, error=3.332
>epoch=4, lrate=0.010, error=3.211
>epoch=5, lrate=0.010, error=3.124
>epoch=6, lrate=0.010, error=3.057
>epoch=7, lrate=0.010, error=3.004
>epoch=8, lrate=0.010, error=2.960
>epoch=9, lrate=0.010, error=2.925
>epoch=10, lrate=0.010, error=2.895
>epoch=11, lrate=0.010, error=2.870
>epoch=12, lrate=0.010, error=2.849
>epoch=13, lrate=0.010, error=2.831
>epoch=14, lrate=0.010, error=2.816
>epoch=15, lrate=0.010, error=2.804
>epoch=16, lrate=0.010, error=2.793
>epoch=17, lrate=0.010, error=2.783
>epoch=18, lrate=0.010, error=2.775
>epoch=19, lrate=0.010, error=2.768
>epoch=20, lrate=0.010, error=2.762
>epoch=21, lrate=0.010, error=2.756
>epoch=22, lrate=0.010, error=2.751
>epoch=23, lrate=0.010, error=2.747
>epoch=24, lrate=0.010, error=2.743
>epoch=25, lrate=0.010, error=2.740
>epoch=26, lrate=0.010, error=2.737
>epoch=27, lrate=0.010, error=2.734
>e

In [14]:
#Calculating Average error
scores = evaluate_algorithm(test_ds, linear_regression, n_folds, l_rate, n_epoch)
print('\n')
print('Scores: %s' % scores)
print('Mean RMSE: %.3f'% (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.010, error=2.641
>epoch=1, lrate=0.010, error=1.612
>epoch=2, lrate=0.010, error=1.305
>epoch=3, lrate=0.010, error=1.137
>epoch=4, lrate=0.010, error=1.038
>epoch=5, lrate=0.010, error=0.974
>epoch=6, lrate=0.010, error=0.929
>epoch=7, lrate=0.010, error=0.896
>epoch=8, lrate=0.010, error=0.870
>epoch=9, lrate=0.010, error=0.849
>epoch=10, lrate=0.010, error=0.831
>epoch=11, lrate=0.010, error=0.816
>epoch=12, lrate=0.010, error=0.802
>epoch=13, lrate=0.010, error=0.790
>epoch=14, lrate=0.010, error=0.779
>epoch=15, lrate=0.010, error=0.769
>epoch=16, lrate=0.010, error=0.760
>epoch=17, lrate=0.010, error=0.751
>epoch=18, lrate=0.010, error=0.743
>epoch=19, lrate=0.010, error=0.736
>epoch=20, lrate=0.010, error=0.729
>epoch=21, lrate=0.010, error=0.723
>epoch=22, lrate=0.010, error=0.717
>epoch=23, lrate=0.010, error=0.711
>epoch=24, lrate=0.010, error=0.706
>epoch=25, lrate=0.010, error=0.700
>epoch=26, lrate=0.010, error=0.696
>epoch=27, lrate=0.010, error=0.691
>e

>epoch=30, lrate=0.010, error=0.594
>epoch=31, lrate=0.010, error=0.591
>epoch=32, lrate=0.010, error=0.588
>epoch=33, lrate=0.010, error=0.585
>epoch=34, lrate=0.010, error=0.582
>epoch=35, lrate=0.010, error=0.580
>epoch=36, lrate=0.010, error=0.577
>epoch=37, lrate=0.010, error=0.575
>epoch=38, lrate=0.010, error=0.573
>epoch=39, lrate=0.010, error=0.571
>epoch=40, lrate=0.010, error=0.569
>epoch=41, lrate=0.010, error=0.567
>epoch=42, lrate=0.010, error=0.565
>epoch=43, lrate=0.010, error=0.563
>epoch=44, lrate=0.010, error=0.561
>epoch=45, lrate=0.010, error=0.560
>epoch=46, lrate=0.010, error=0.558
>epoch=47, lrate=0.010, error=0.557
>epoch=48, lrate=0.010, error=0.555
>epoch=49, lrate=0.010, error=0.554


Scores: [0.04195193303773983, 0.03840494051252803, 0.052716019893337754, 0.03584394722233829, 0.053651106538486204]
Mean RMSE: 0.045


#Report your average error in the prediction 

Scores: [0.04195193303773983, 0.03840494051252803, 0.052716019893337754, 0.03584394722233829, 0.053651106538486204]
Mean RMSE: 0.045

In [16]:
# Test with test data where actual price is missing

file_test = 'dataset/house_test.csv'
df_test = pd.read_csv(file_test)

#separating the column with numerical values
ds_test = data_numerical_feature(df_test)

ds_test = ds_test.values
final_ds_test = ds_test.tolist()

for i in range(len(final_ds_test[0])):
    str_column_to_float(final_ds_test, i)

#normalization
minmax = dataset_minmax(final_ds_test)
normalize_dataset(final_ds_test, minmax)

l_rate = 0.01
n_epoch = 50

w = weights_sgd(final_dataset,l_rate,n_epoch)

print('\n')
for row in final_ds_test:
    yhat = predict(row, w)
    print("Predicted=%.3f [%d]" % (yhat, round(yhat)))

>epoch=0, lrate=0.010, error=7.706
>epoch=1, lrate=0.010, error=4.869
>epoch=2, lrate=0.010, error=4.422
>epoch=3, lrate=0.010, error=4.208
>epoch=4, lrate=0.010, error=4.067
>epoch=5, lrate=0.010, error=3.965
>epoch=6, lrate=0.010, error=3.888
>epoch=7, lrate=0.010, error=3.829
>epoch=8, lrate=0.010, error=3.783
>epoch=9, lrate=0.010, error=3.746
>epoch=10, lrate=0.010, error=3.718
>epoch=11, lrate=0.010, error=3.695
>epoch=12, lrate=0.010, error=3.676
>epoch=13, lrate=0.010, error=3.661
>epoch=14, lrate=0.010, error=3.649
>epoch=15, lrate=0.010, error=3.638
>epoch=16, lrate=0.010, error=3.630
>epoch=17, lrate=0.010, error=3.623
>epoch=18, lrate=0.010, error=3.616
>epoch=19, lrate=0.010, error=3.611
>epoch=20, lrate=0.010, error=3.607
>epoch=21, lrate=0.010, error=3.603
>epoch=22, lrate=0.010, error=3.599
>epoch=23, lrate=0.010, error=3.596
>epoch=24, lrate=0.010, error=3.593
>epoch=25, lrate=0.010, error=3.591
>epoch=26, lrate=0.010, error=3.589
>epoch=27, lrate=0.010, error=3.587
>e