Question 4:	Please modify the above dataset (in question 3) to answer whether the house will sell for 180000 or not and use the appropriate algorithm (from question 1 or 2) to learn a model from the training set and answer whether the prices > 180000 or not for the test set

In [1]:
import pandas as pd
import numpy as np
from math import exp
from math import sqrt
from csv import reader
from sklearn.model_selection import train_test_split
import seaborn as sns
from random import seed
from random import randrange

In [26]:
#Data with numerical feature and droping NA value columns

def data_numerical_feature(dataset):
    pct_null = dataset.isnull().sum() / len(dataset)
    missing_features = pct_null[pct_null > 0.00].index
    #print('Number of missing_features: ', missing_features)
    dataset.drop(missing_features, axis=1, inplace=True)
    
    numerical_features=[feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
    #print('Number of numerical variables: ', len(numerical_features))
    
    ds = dataset[numerical_features]    
    ds.drop(['Id'], axis=1, inplace=True)   
    return ds

In [3]:
# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

In [4]:
# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

In [5]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

In [6]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [7]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(0,len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [8]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:            
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [9]:
# Make a prediction with coefficients
def predict(row, w):
    yhat = w[-1]                           # bias is at dimnesion 0
    for i in range(len(row)-1):
        yhat += w[i] * row[i]
    return 1.0 / (1.0 + exp(-yhat))

In [10]:
# Estimate logistic regression coefficients 
#using stochastic gradient descent
def weights_sgd(train, l_rate, n_epoch):
    w = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, w)
            error = row[-1] - yhat
            sum_error += error**2
            w[-1] = w[-1] + l_rate * \
            error * yhat * (1.0 - yhat)
            for i in range(len(row)-1):
                w[i] = w[i] + l_rate * error * \
                yhat * (1.0 - yhat) * row[i]
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return w


In [11]:
# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
    predictions = list()
    w = weights_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, w)
        yhat = round(yhat)
        predictions.append(yhat)
    return(predictions)

In [12]:
# Test logistic regression algorithm on the housing dataset
seed(1)

# load and prepare data
filename = 'dataset/house_train.csv'
df = pd.read_csv(filename)

# Adding column Salelabel for Saleprice greater or less than 180000
df['SaleLabel'] = df['SalePrice'].map(lambda m:1 if m >180000 else 0)

#Numerical column
ds = data_numerical_feature(df)

final_dataset=ds.values
final_ds = final_dataset.tolist()

for i in range(0,len(final_ds[0])):
    str_column_to_float(final_ds, i)

# normalize
minmax = dataset_minmax(final_ds)
normalize_dataset(final_ds, minmax)

n_folds = 5
l_rate = 0.03
n_epoch = 50
scores = evaluate_algorithm(final_ds, logistic_regression, n_folds, l_rate, n_epoch)
print("/n")
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


>epoch=0, lrate=0.030, error=256.543
>epoch=1, lrate=0.030, error=208.759
>epoch=2, lrate=0.030, error=181.144
>epoch=3, lrate=0.030, error=163.786
>epoch=4, lrate=0.030, error=151.898
>epoch=5, lrate=0.030, error=143.195
>epoch=6, lrate=0.030, error=136.501
>epoch=7, lrate=0.030, error=131.156
>epoch=8, lrate=0.030, error=126.763
>epoch=9, lrate=0.030, error=123.070
>epoch=10, lrate=0.030, error=119.907
>epoch=11, lrate=0.030, error=117.158
>epoch=12, lrate=0.030, error=114.739
>epoch=13, lrate=0.030, error=112.586
>epoch=14, lrate=0.030, error=110.655
>epoch=15, lrate=0.030, error=108.907
>epoch=16, lrate=0.030, error=107.314
>epoch=17, lrate=0.030, error=105.855
>epoch=18, lrate=0.030, error=104.510
>epoch=19, lrate=0.030, error=103.265
>epoch=20, lrate=0.030, error=102.106
>epoch=21, lrate=0.030, error=101.025
>epoch=22, lrate=0.030, error=100.011
>epoch=23, lrate=0.030, error=99.058
>epoch=24, lrate=0.030, error=98.159
>epoch=25, lrate=0.030, error=97.310
>epoch=26, lrate=0.030, e

>epoch=35, lrate=0.030, error=94.722
>epoch=36, lrate=0.030, error=94.196
>epoch=37, lrate=0.030, error=93.689
>epoch=38, lrate=0.030, error=93.198
>epoch=39, lrate=0.030, error=92.723
>epoch=40, lrate=0.030, error=92.263
>epoch=41, lrate=0.030, error=91.817
>epoch=42, lrate=0.030, error=91.383
>epoch=43, lrate=0.030, error=90.962
>epoch=44, lrate=0.030, error=90.553
>epoch=45, lrate=0.030, error=90.154
>epoch=46, lrate=0.030, error=89.766
>epoch=47, lrate=0.030, error=89.388
>epoch=48, lrate=0.030, error=89.019
>epoch=49, lrate=0.030, error=88.660
Scores: [91.78082191780823, 88.35616438356165, 90.41095890410958, 91.0958904109589, 93.83561643835617]
Mean Accuracy: 91.096%


#Mean accuracy on the training data
Scores: [91.78082191780823, 88.35616438356165, 90.41095890410958, 91.0958904109589, 93.83561643835617]
Mean Accuracy: 91.096%

In [39]:
# Test Splitting training data into training set and test set

l_rate = 0.01
n_epoch = 50
train_ds, test_ds = train_test_split(final_ds, test_size = 0.2, random_state = 10)
w = weights_sgd(train_ds,l_rate,n_epoch)
#print(w)
    
print('\n')
sold_status = []
for row in test_ds:
    yhat = predict(row, w)
    print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))
    if round(yhat) == 1:
        status = "Sold"
    else:
        status = "Not Sold"
    sold_status.append([round(row[-1]),round(yhat),status])

>epoch=0, lrate=0.010, error=277.212
>epoch=1, lrate=0.010, error=254.343
>epoch=2, lrate=0.010, error=235.718
>epoch=3, lrate=0.010, error=220.334
>epoch=4, lrate=0.010, error=207.594
>epoch=5, lrate=0.010, error=196.958
>epoch=6, lrate=0.010, error=187.989
>epoch=7, lrate=0.010, error=180.346
>epoch=8, lrate=0.010, error=173.764
>epoch=9, lrate=0.010, error=168.039
>epoch=10, lrate=0.010, error=163.014
>epoch=11, lrate=0.010, error=158.567
>epoch=12, lrate=0.010, error=154.600
>epoch=13, lrate=0.010, error=151.037
>epoch=14, lrate=0.010, error=147.817
>epoch=15, lrate=0.010, error=144.890
>epoch=16, lrate=0.010, error=142.215
>epoch=17, lrate=0.010, error=139.759
>epoch=18, lrate=0.010, error=137.494
>epoch=19, lrate=0.010, error=135.397
>epoch=20, lrate=0.010, error=133.449
>epoch=21, lrate=0.010, error=131.632
>epoch=22, lrate=0.010, error=129.933
>epoch=23, lrate=0.010, error=128.340
>epoch=24, lrate=0.010, error=126.842
>epoch=25, lrate=0.010, error=125.429
>epoch=26, lrate=0.010

#Sold Status on the Test Data

In [41]:
pd.DataFrame(sold_status , columns = ["Actual Status", "Predicted Status" , "Status"])

Unnamed: 0,Actual Status,Predicted Status,Status
0,0,1,Sold
1,1,1,Sold
2,0,0,Not Sold
3,0,0,Not Sold
4,1,1,Sold
...,...,...,...
287,0,0,Not Sold
288,1,0,Not Sold
289,1,1,Sold
290,0,1,Sold


In [22]:
#Calculate accuracy

n_folds = 5
l_rate = 0.03
n_epoch = 50
scores = evaluate_algorithm(test_ds, logistic_regression, n_folds, l_rate, n_epoch)

print('\n')
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.030, error=55.017
>epoch=1, lrate=0.030, error=52.303
>epoch=2, lrate=0.030, error=49.978
>epoch=3, lrate=0.030, error=47.788
>epoch=4, lrate=0.030, error=45.804
>epoch=5, lrate=0.030, error=44.025
>epoch=6, lrate=0.030, error=42.431
>epoch=7, lrate=0.030, error=41.001
>epoch=8, lrate=0.030, error=39.717
>epoch=9, lrate=0.030, error=38.559
>epoch=10, lrate=0.030, error=37.512
>epoch=11, lrate=0.030, error=36.562
>epoch=12, lrate=0.030, error=35.696
>epoch=13, lrate=0.030, error=34.905
>epoch=14, lrate=0.030, error=34.178
>epoch=15, lrate=0.030, error=33.510
>epoch=16, lrate=0.030, error=32.892
>epoch=17, lrate=0.030, error=32.319
>epoch=18, lrate=0.030, error=31.787
>epoch=19, lrate=0.030, error=31.291
>epoch=20, lrate=0.030, error=30.827
>epoch=21, lrate=0.030, error=30.392
>epoch=22, lrate=0.030, error=29.983
>epoch=23, lrate=0.030, error=29.598
>epoch=24, lrate=0.030, error=29.235
>epoch=25, lrate=0.030, error=28.891
>epoch=26, lrate=0.030, error=28.566
>epoch=27, 

>epoch=31, lrate=0.030, error=28.132
>epoch=32, lrate=0.030, error=27.896
>epoch=33, lrate=0.030, error=27.669
>epoch=34, lrate=0.030, error=27.451
>epoch=35, lrate=0.030, error=27.242
>epoch=36, lrate=0.030, error=27.041
>epoch=37, lrate=0.030, error=26.847
>epoch=38, lrate=0.030, error=26.660
>epoch=39, lrate=0.030, error=26.480
>epoch=40, lrate=0.030, error=26.306
>epoch=41, lrate=0.030, error=26.138
>epoch=42, lrate=0.030, error=25.975
>epoch=43, lrate=0.030, error=25.817
>epoch=44, lrate=0.030, error=25.665
>epoch=45, lrate=0.030, error=25.517
>epoch=46, lrate=0.030, error=25.373
>epoch=47, lrate=0.030, error=25.233
>epoch=48, lrate=0.030, error=25.098
>epoch=49, lrate=0.030, error=24.966


Scores: [84.48275862068965, 77.58620689655173, 84.48275862068965, 89.65517241379311, 93.10344827586206]
Mean Accuracy: 85.862%


#Question 4, Part 1

Scores: [84.48275862068965, 77.58620689655173, 84.48275862068965, 89.65517241379311, 93.10344827586206]
Mean Accuracy: 85.862%