# The linear regression project looks to predict the house price of unit area. Numpy and Scikit Learn were used to create the model and make predictions.

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [31]:
data = pd.read_csv('Real estate.csv')

In [32]:
data.head(11)

Unnamed: 0,No,transaction date,house age,distance to the nearest MRT station,number of convenience stores,latitude,longitude,house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1
5,6,2012.667,7.1,2175.03,3,24.96305,121.51254,32.1
6,7,2012.667,34.5,623.4731,7,24.97933,121.53642,40.3
7,8,2013.417,20.3,287.6025,6,24.98042,121.54228,46.7
8,9,2013.5,31.7,5512.038,1,24.95095,121.48458,18.8
9,10,2013.417,17.9,1783.18,3,24.96731,121.51486,22.1


In [33]:
data['house_age_sqaured'] = data['house age'] ** 2

In [40]:
selected_features = ['house_age_squared', 'distance to the nearest MRT station','number of convenience stores']

In [42]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [43]:
# Create a pipeline with feature scaling, polynomial features, and linear regression
model = make_pipeline(StandardScaler(), PolynomialFeatures(degree=2), LinearRegression())

In [44]:
# Train the model using cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mse_cv = -np.mean(cv_scores)

In [45]:
model.fit(X_train, y_train)

In [46]:
# Evaluate the model using test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Cross-validated MSE: {mse_cv}")

Mean Squared Error (MSE): 59.89672019723189
R-squared (R2) Score: 0.6226300076752505
Cross-validated MSE: 93.97487103110079


In [47]:
house_age = 32.15
distance_to_the_nearest_mrt_station = 300.56
number_of_convenience_stores = 5


new_house_price_of_unit_area = np.array([[house_age, distance_to_the_nearest_mrt_station, number_of_convenience_stores]])
predicted_price = model.predict(new_house_price_of_unit_area)

print(f"Predicted new house per unit price: {predicted_price[0]}")

Predicted new house per unit price: 41.39934469177408
