## Importing necessary libraries

In [24]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pandas as pd

sb.set()
warnings.filterwarnings('ignore')

#### Importing dataset

In [25]:
data = pd.read_csv("cleaned-Housing.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,0,3/1/2012,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45,Improved,1986,250000.0
1,1,3/1/2012,ANG MO KIO,2 ROOM,510,ANG MO KIO AVE 8,01 TO 05,44,Improved,1980,265000.0
2,2,3/1/2012,ANG MO KIO,3 ROOM,610,ANG MO KIO AVE 4,06 TO 10,68,New Generation,1980,315000.0
3,3,3/1/2012,ANG MO KIO,3 ROOM,474,ANG MO KIO AVE 10,01 TO 05,67,New Generation,1984,320000.0
4,4,3/1/2012,ANG MO KIO,3 ROOM,604,ANG MO KIO AVE 5,06 TO 10,67,New Generation,1980,321000.0


### Linear Regression Model Function

In [26]:
def linear_regression(data, variable):
    # Fill NA values in 'resale_price' and drop rows where 'floor_area_sqm' is NA
    data['resale_price'].fillna(0, inplace=True)
    data_clean = data.dropna(subset=[variable])
    
    # Prepare the data
    X = data_clean[[variable]]
    y = data_clean['resale_price']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predict and evaluate the model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print the performance
    print(f'Linear Regression using {variable}:')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R-squared Score: {r2}\n')

### Linear Regression for 'floor_area_sqm' against 'resale_price'

In [27]:
linear_regression(data, 'floor_area_sqm')

Linear Regression using floor_area_sqm:
Mean Squared Error (MSE): 49101189726.02469
R-squared Score: 0.009338818928007364



### Linear Regression for 'lease_commence_date' against 'resale_price'

In [28]:
linear_regression(data, 'lease_commence_date')

Linear Regression using lease_commence_date:
Mean Squared Error (MSE): 49311022363.00488
R-squared Score: 0.0051052545451859555



##### The MSE for 'floor_area_sqm' and 'lease_commence_date' are both high, indicating that the models may not be fitting the data well.

##### The R^2 value for 'floor_area_sqm' is slightly higher than that for 'lease_commence_date', suggesting that 'floor_area_sqm' is a slightly better predictor of 'resale_price' than 'lease_commence_date', but overall, both models explain a very small portion of the variance in 'resale_price'.

### Ridge Regression Model Function

In [29]:
def ridge_regression(data, variable):
    # Select the feature and target
    X = data[[variable]]
    y = data['resale_price']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Ridge Regression model
    ridge_reg_model = Ridge(alpha=1.0)

    # Fit the model on the training data
    ridge_reg_model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred_ridge = ridge_reg_model.predict(X_test)

    # Evaluate the model performance
    mse_ridge = mean_squared_error(y_test, y_pred_ridge)
    r2_ridge = r2_score(y_test, y_pred_ridge)

    # Print out the model performance metrics for Ridge Regression
    print(f'Ridge Regression using {variable}:')
    print(f'Mean Squared Error (MSE): {mse_ridge}')
    print(f'R-squared Score: {r2_ridge}\n')

### Ridge Regression Model for 'resale_price' with 'floor_area_sqm'

In [30]:
ridge_regression(data, 'floor_area_sqm')

Ridge Regression using floor_area_sqm:
Mean Squared Error (MSE): 49101189724.921715
R-squared Score: 0.009338818950260785



### Ridge Regression Model for 'resale_price' with 'lease_commence_date'

In [31]:
ridge_regression(data, 'lease_commence_date')

Ridge Regression using lease_commence_date:
Mean Squared Error (MSE): 49311022364.418015
R-squared Score: 0.005105254516674651



##### The R^2 value for both variables are very close to 0. This indicates that only about 0.9% of the variance in 'resale_price' is being explained by 'floor_area_sqm' and  only about 0.5% of the variance in 'resale_price' is being explained by 'lease_commence_date' which is even less than what 'floor_area_sqm' could explain in Ridge Regression model. This suggests that 'floor_area_sqm' could be a better predictor.

##### Hence we move on to try Bayesian Ridge model

### Bayesian Ridge Regression Model Function

In [36]:
def bayesian_ridge_regression(data, variable):
    # Fill NA values in 'resale_price' and drop rows where the feature is NA
    data['resale_price'].fillna(0, inplace=True)
    data_clean = data.dropna(subset=[variable])
    
    # Prepare the data
    X = data_clean[[variable]]
    y = data_clean['resale_price']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = BayesianRidge()
    model.fit(X_train, y_train)
    
    # Predict and evaluate the model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print the performance
    print(f'Bayesian Ridge Regression using {variable}:')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R-squared Score: {r2}\n')

### Bayesian Ridge Regression Model for 'resale_price' with 'floor_area_sqm'

In [33]:
bayesian_ridge_regression(data, 'floor_area_sqm')

Bayesian Ridge Regression using floor_area_sqm:
Mean Squared Error (MSE): 49101131295.81488
R-squared Score: 0.009339997810657197



### Bayesian Ridge Regression Model for 'resale_price' with 'lease_commence_date'

In [34]:
bayesian_ridge_regression(data, 'lease_commence_date')

Bayesian Ridge Regression using lease_commence_date:
Mean Squared Error (MSE): 49311064858.39827
R-squared Score: 0.005104397161956875



##### The Bayesian Ridge model is incorporating prior distributions over the weights and automatically tuning its regularization parameters. Despite this, the predictive power does not significantly improve compared to the standard Ridge Regression, as indicated by the similar MSE and R^2 values.


In [47]:
average_resale_price = data['resale_price'].mean()

print(f"The average resale price is: {average_resale_price}")

The average resale price is: 143201.11295236502


##### It is important to note that eventhough the MSE seems high it is acceptable because the reatail prices usually vary widely. Housing prices, such as those for HDB flats, can range significantly. An MSE of tens of millions might seem large, but if the resale prices range from tens of thousands to multiple millions, the MSE might be relatively small when considered as a percentage of the price range. The square root of the MSE gives you the Root Mean Squared Error (RMSE), which is on the same scale as the prices themselves. For example, if the RMSE is 70,000, this means the average prediction error is 70,000. If the average housing price is around 500,000 or more, a 70,000 error might be a reasonable margin for certain applications, like a quick market estimate.

##### Thus, we moved on to try other complex models like Gradient Boosting and Random Forest

### Gradient Boosting Regressor Model

In [48]:
df = data

# Encode categorical variables
label_encoders = {}
categorical_cols = ['flat_type', 'storey_range', 'flat_model']
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Split the data into features and target variable
X = df[['flat_type', 'storey_range', 'flat_model', 'floor_area_sqm']]
y = df['resale_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Gradient Boosting Regressor model
gb_regressor = GradientBoostingRegressor()

# Train the model
gb_regressor.fit(X_train, y_train)

# Make predictions
y_pred = gb_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 44368038553.20795


##### Compared to linear models, this model is more robust and captures the interaction between different varibles like 'floor_area_sqm' and 'flat_type' effectively. It provided more flexibility in considering more varibles both categorical and numerical.
##### We can see thet the MSE also decreased in using the model and eventhough it is still high, the absolute error would be a less percentage of high 'resale_price' value.

### Gradient Boosting Regressor Model on Dataset after removing outliers

In [39]:
# Define the lower and upper quantile thresholds
lower_quantile = 0.25  # 5th percentile
upper_quantile = 0.75  # 95th percentile

# Filter out the outliers based on the resale price column
lower_threshold = df['resale_price'].quantile(lower_quantile)
upper_threshold = df['resale_price'].quantile(upper_quantile)
cleaned_df = df[(df['resale_price'] >= lower_threshold) & (df['resale_price'] <= upper_threshold)]

# Split the cleaned data into features and target variable
X = cleaned_df[['flat_type', 'storey_range', 'flat_model', 'floor_area_sqm']]
y = cleaned_df['resale_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Gradient Boosting Regressor model
gb_regressor = GradientBoostingRegressor()

# Train the model
gb_regressor.fit(X_train, y_train)

# Make predictions
y_pred = gb_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error after removing outliers:", mse)

Mean Squared Error after removing outliers: 5464626336.649794


##### We can see that the MSE has increased after removing the outliers which could be beacuse of the following reasons:

##### The removed outliers may have been influential points that were actually well-predicted by the model. If these points were not errors but valid extreme values, the model might have been leveraging these data points to better fit the overall trend. Without them, the model might not capture the full range of the data as well.

##### Tree-based models like Gradient Boosting, can be sensitive to the removal of data points. The decision boundaries or split points might change significantly when outliers are removed, leading to a poorer fit to the remaining data.

### Random Forest Regressor Model

In [46]:
from sklearn.ensemble import RandomForestRegressor

def buildRandForest(prediction,predictors):
    # Split prediction and predictors data
    y = pd.DataFrame(data[prediction])
    x = pd.DataFrame(data[predictors])
    x_trn,x_tst,y_trn,y_tst = train_test_split(x,y,test_size=0.25)
    
    print(prediction)
    print(x)
    print(y)
    
    # Train decision tree
    rf = RandomForestRegressor()
    rf.fit(x_trn, y_trn)
    
    # Predict Response corresponding to Predictors
    y_trn_pred = rf.predict(x_trn)
    y_tst_pred = rf.predict(x_tst)
    
    # Check the Goodness of Fit (on Train Data)
    print("Goodness of Fit of Model \tTrain Dataset")
    print("Explained variance\t:", rf.score(x_trn,y_trn))
    print("Mean squared error \t:", mean_squared_error(y_trn, y_trn_pred))
    print()

    # Check the Goodness of Fit (on Test Data)
    print("Goodness of Fit of Model \tTest Dataset")
    print("Explained variance \t:", rf.score(x_tst,y_tst))
    print("Mean squared error \t:", mean_squared_error(y_tst, y_tst_pred))
    print()
    
    # Plot (two) trees
#     for i in range(min(2,len(rf.estimators_))):
#         plt.figure(figsize=(30,15), dpi=300)
#         plot_tree(rf.estimators_[i])
#         plt.show()    
#==============================================================#

data.dropna(inplace=True)
buildRandForest('resale_price',['floor_area_sqm','lease_commence_date'])

resale_price
        floor_area_sqm lease_commence_date
0                   45          1986-01-01
1                   44          1980-01-01
2                   68          1980-01-01
3                   67          1984-01-01
4                   67          1980-01-01
...                ...                 ...
167393             131          1987-01-01
167394             122          1987-01-01
167395             122          1987-01-01
167396             146          1987-01-01
167397             146          1988-01-01

[167398 rows x 2 columns]
        resale_price
0           250000.0
1           265000.0
2           315000.0
3           320000.0
4           321000.0
...              ...
167393           0.0
167394           0.0
167395           0.0
167396           0.0
167397           0.0

[167398 rows x 1 columns]


DTypePromotionError: The DType <class 'numpy.dtypes.Int64DType'> could not be promoted by <class 'numpy.dtypes.DateTime64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.DateTime64DType'>)