# Data Preprocessing

In [347]:
# Read the Ames housing CSV into a DataFrame
import pandas

dataset = pandas.read_csv("./AmesHousing.csv")

In [348]:
from sklearn.impute import SimpleImputer

# Work out the two column groups first: numeric columns and text (object) columns
numeric_columns = dataset.select_dtypes(include='number').columns
categorical_columns = dataset.select_dtypes(include='object').columns

# Missing numeric values are replaced by the median of their column
imputerNum = SimpleImputer(strategy="median")
numeric_filled = imputerNum.fit_transform(dataset[numeric_columns])
dataset[numeric_columns] = numeric_filled

# Missing categorical values are replaced by the literal string "NA"
imputerCat = SimpleImputer(strategy='constant', fill_value='NA')
categorical_filled = imputerCat.fit_transform(dataset[categorical_columns])
dataset[categorical_columns] = categorical_filled

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Grade labels and the integer rank that replaces each of them.
# "NA" (the fill value used for missing categorical entries) ranks below every real grade.
ordinalMapping = {
	"NA": -1,
	"Po": 0,
	"Fa": 1,
	"TA": 2,
	"Gd": 3,
	"Ex": 4
}

hotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Not referenced in this cell; kept so that any later cell reading it still finds it
categories = {}

# Split the text columns into two groups:
#   ordinal - every value is one of the grade labels above, so it can be ranked
#   nominal - everything else, which has no usable order and is one-hot encoded
object_columns = dataset.select_dtypes(include='object').columns
grade_labels = set(ordinalMapping)
ordinal_columns = [
	column for column in object_columns
	if dataset[column].isin(grade_labels).all()
]
nominal_columns = [column for column in object_columns if column not in ordinal_columns]

# Ordinal columns keep their position in the frame and become integer ranks
for column in ordinal_columns:
	print(column)
	dataset[column] = dataset[column].map(ordinalMapping)

# Nominal columns are encoded in a single pass and appended once, instead of
# re-fitting the encoder and re-copying the whole frame for every column.
# The guard avoids fitting the encoder on a frame with zero columns.
if nominal_columns:
	encoded = pandas.DataFrame(
		hotEncoder.fit_transform(dataset[nominal_columns]),
		columns=hotEncoder.get_feature_names_out(nominal_columns),
		# Reuse the frame's own index so concat lines rows up even if it is not 0..n-1
		index=dataset.index,
	)
	dataset = pandas.concat([dataset.drop(columns=nominal_columns), encoded], axis=1)

# Show a preview rather than the whole frame
dataset.head()

# Finding correlations between the variables
By building a correlation table and printing it, we can see which features are good candidates for regression.

In [None]:
# Pairwise correlation of all columns, then the SalePrice column ranked from
# most positively to most negatively correlated
correlation = dataset.corr()
price_correlation = correlation["SalePrice"].sort_values(ascending=False)
print(price_correlation.to_string())

In [None]:
from matplotlib import pyplot as plot

saleprice = dataset["SalePrice"]

# Features to compare against the sale price, one scatter plot per feature
plots = [
	"Overall Qual", "Gr Liv Area", "Garage Area", "Total Bsmt SF",
	"Year Built", "Year Remod/Add", "Garage Yr Blt", "Mas Vnr Area",
]

for feature in plots:
	plot.scatter(dataset[feature], saleprice)
	plot.title(feature)
	plot.show()
	# Release the figure before drawing the next one
	plot.close()

# Linear Regression
This is where we perform linear regression on the data. We use the "Gr Liv Area" feature (above-ground living area) to predict the sale price. Although "Overall Qual" has a higher correlation with the sale price, we chose "Gr Liv Area" because "Overall Qual", despite being stored as a number, is an ordinal categorical rating from 1 to 10. This does not necessarily make it a bad predictor, but it makes it less precise for regression, because the model can only fit a handful of discrete levels rather than a continuous range of values.

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Feature and target as single-column 2-D arrays (one row per house)
quality_plot = np.array(dataset[["Gr Liv Area"]])
sale_price_plot = np.array(dataset[["SalePrice"]])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
	quality_plot,
	sale_price_plot,
	test_size=0.2,
	random_state=42,
)

print(f"{len(x_train)} train instances + {len(x_test)} test instances")

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plot

# Fit a straight line to the training split
linear_regression = LinearRegression().fit(x_train, y_train)

# Predicted sale prices for the held-out test inputs
y_predicted = linear_regression.predict(x_test)

# Actual test points, with the model's predictions drawn over them as a red line
plot.scatter(x_test, y_test)
plot.plot(x_test, y_predicted, "-r")

In [None]:
from sklearn.metrics import mean_squared_error

# Score the linear model on the test split: predict, take the mean squared
# error against the true prices, then its square root
y_predicted = linear_regression.predict(x_test)
lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)

print(f"Root Mean Square Error: {lin_rmse}")

# Polynomial Regression
For polynomial regression, we look at the "Year Built" feature, whose effect on the sale price appears strangely exponential rather than linear. We have therefore chosen to investigate that relationship with a polynomial fit.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

x_axis = "Year Built"
y_axis = dataset["SalePrice"]

# Expand the single feature into polynomial terms up to degree 3
poly = PolynomialFeatures(degree=3)
x_poly = poly.fit_transform(dataset[[x_axis]])

# Same 80/20 split and seed as the linear model
x_train, x_test, y_train, y_test = train_test_split(x_poly, y_axis, test_size=0.2, random_state=42)

# An ordinary linear fit on the expanded terms gives the polynomial model
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)
y_pred = linear_regression.predict(x_test)

# Column 1 of the expanded matrix holds the untransformed feature
# (with PolynomialFeatures' default settings, column 0 is the constant term)
feature_test = x_test[:,1]
plot.scatter(feature_test, y_test)
plot.scatter(feature_test, y_pred)

In [None]:
# Score the polynomial model on its test split: root of the mean squared error
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)

print(f"Root Mean Square Error: {lin_rmse}")

# Random Forest Regression