<a href="https://colab.research.google.com/github/smallppgirl/csdn_course_ai/blob/master/rf_hdb_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import requests

In [5]:
# Load the HDB resale flat dataset
result = requests.get(
    "https://data.gov.sg/api/action/datastore_search?resource_id="
    "f1765b54-a209-4718-8d38-a39237f502b3&limit=16888&sort=month%20desc").json()

data_dict = result["result"]["records"]
df = pd.DataFrame.from_dict(data_dict)
df = df.head(16888)

In [6]:
df['remaining_lease'] \
    = df['remaining_lease'].str.replace("\syear[s]?[\s]?", '.', regex=True)\
    .replace("\smonth[s]?", '', regex=True)\
    .astype('float')

In [8]:
# Split the dataset into features and target variable
X = df.drop(['resale_price', "_id"], axis=1)
y = df['resale_price']

In [9]:
# Define the columns to be encoded and scaled
cat_cols = ['flat_model', 'town', 'storey_range', 'street_name']
num_cols = ['remaining_lease', 'floor_area_sqm']


In [10]:
# Define the column transformer to apply the encoders and scaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols)])

In [11]:
# Initialize the random forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Define the pipeline to apply the preprocessor and the random forest regressor
pipe = Pipeline([('preprocessor', preprocessor), ('rf', rf)])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train the model on the training data
pipe.fit(X_train, y_train)


In [13]:
# Make predictions on the test data
y_pred = pipe.predict(X_test)


In [14]:
# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean squared error:', mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean squared error: 1419753887.8307362
R-squared: 0.9501935599050586


In [30]:
# Create a new dataframe for the new data
new_data = pd.DataFrame({
    'flat_model': ['Type S1'],
    'town': ['CENTRAL AREA'],
    'storey_range': ['43 TO 45'],
    'street_name': ['CANTONMENT RD'],
    'remaining_lease': [87],
    'floor_area_sqm': [94]
})
prediction = pipe.predict(new_data)
print('The predicted resale price is:', prediction[0])

The predicted resale price is: 1264621.6666666665


In [34]:
# Create a new dataframe for the new data
new_data = pd.DataFrame({
    'flat_model': ['Improved'],
    'town': ['YISHUN'],
    'storey_range': ['10 TO 12'],
    'street_name': ['YISHUN ST 72'],
    'remaining_lease': [64.09],
    'floor_area_sqm': [122]
})
prediction = pipe.predict(new_data)
print('The predicted resale price is:', prediction[0])

The predicted resale price is: 593710.8333333333
