In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [7]:
dataset = pd.read_csv('../datasets/house_prices_dataset.csv')
dataset.head()

Unnamed: 0,Size,Number of Bedrooms,Number of Bathrooms,Location,Age of the House,Garage Size,Sale Price
0,3974,1,2,Countryside,81,2,594026
1,1660,3,2,Downtown,97,1,847135
2,2094,2,1,Suburb,74,3,163776
3,1930,2,2,Countryside,75,2,230494
4,1895,1,1,Downtown,48,1,242971


In [8]:
#Check how many missing values per column
dataset.isnull().sum()

Size                   0
Number of Bedrooms     0
Number of Bathrooms    0
Location               0
Age of the House       0
Garage Size            0
Sale Price             0
dtype: int64

In [9]:
#We determine the X and y

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:,-1].values


In [10]:
#We encode the categorical variable

ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [3])], remainder ='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 0.0, ..., 2, 81, 2],
       [0.0, 1.0, 0.0, ..., 2, 97, 1],
       [0.0, 0.0, 1.0, ..., 1, 74, 3],
       ...,
       [1.0, 0.0, 0.0, ..., 2, 61, 0],
       [0.0, 0.0, 1.0, ..., 1, 96, 2],
       [0.0, 0.0, 1.0, ..., 3, 18, 1]], dtype=object)

In [11]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
#  Multiple Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

In [13]:
#Y pred
y_pred = model.predict(X_test)

In [14]:
r2_score(y_test, y_pred)

-0.038702464202095

In [15]:
####### POLYNOMIAL ##############

In [16]:
# Training the Polynomial Regression model on the Training set
poly_reg = PolynomialFeatures(degree=6)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)


In [17]:
# Predicting the Test set results
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[ 414020.13  565249.  ]
 [ 759060.71  613634.  ]
 [ 357553.58  972056.  ]
 [ 405835.39  148064.  ]
 [ 751580.5   161462.  ]
 [ 782684.98  133384.  ]
 [ 413910.51  336073.  ]
 [ 425561.99  912605.  ]
 [ 565817.31  390414.  ]
 [ 398925.32  984550.  ]
 [ 943909.62  376284.  ]
 [ 542410.9   820974.  ]
 [ 778085.83  100519.  ]
 [ 614318.85  490015.  ]
 [ 457542.8   537387.  ]
 [ 393164.37  222502.  ]
 [ 354515.92  590277.  ]
 [ 891020.69  509158.  ]
 [ 511252.68  286371.  ]
 [ 533915.17  936788.  ]
 [ 579189.88  645376.  ]
 [ 796496.21  513685.  ]
 [ 618930.15  906874.  ]
 [ 315698.92  407776.  ]
 [ 444886.97  396030.  ]
 [ 619640.11  408105.  ]
 [ 535686.35  474724.  ]
 [ 235212.78  539415.  ]
 [ 617665.14  565795.  ]
 [ 481094.36  847135.  ]
 [ 491414.7   922437.  ]
 [ 547714.41  972564.  ]
 [ 695139.91  869497.  ]
 [ 507539.36  941490.  ]
 [ 523458.97  359509.  ]
 [ 398987.51  680532.  ]
 [ 478453.93  992585.  ]
 [ 475350.2   528697.  ]
 [ 547281.39  994556.  ]
 [ 575388.74  331373.  ]


In [18]:
r2_score(y_test, y_pred)

-0.9311369296986676