In [480]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [481]:
# Load your dataset
df = pd.read_excel('kl_property_data.xlsx')

In [482]:
print(df.head)

<bound method NDFrame.head of                               Location      Price Rooms  Bathrooms  Car Parks  \
0                   KLCC, Kuala Lumpur  1250000.0   2+1        3.0        2.0   
1      Damansara Heights, Kuala Lumpur  6800000.0     6        7.0        NaN   
2                Dutamas, Kuala Lumpur  1030000.0     3        4.0        2.0   
3                 Cheras, Kuala Lumpur        NaN   NaN        NaN        NaN   
4            Bukit Jalil, Kuala Lumpur   900000.0   4+1        3.0        2.0   
...                                ...        ...   ...        ...        ...   
53878            Bangsar, Kuala Lumpur  5100000.0   5+1        4.0        NaN   
53879            Bangsar, Kuala Lumpur  5000000.0     5        4.0        NaN   
53880            Bangsar, Kuala Lumpur  5500000.0   5+1        4.0        NaN   
53881        Wangsa Maju, Kuala Lumpur   480000.0     3        2.0        NaN   
53882            Setapak, Kuala Lumpur   540000.0     4        2.0        3.0  

In [483]:
# Preprocess the data
def preprocess_data(df):
    # Clean the 'Price' column
    df['Price'] = df['Price'].replace('[\$,]', '', regex=True).astype(float)

    # Extract numerical value from 'Size'
    df['Size'] = df['Size'].str.extract('(\d+)').astype(float)

    return df

df = preprocess_data(df)

In [484]:
# Separate features and target variable
X = df[['Size']]
y = df['Price']

In [485]:
# Handle missing values in target variable y
y = y.fillna(y.mean())

In [486]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [487]:
# Define transformers
numeric_features = ['Size']
numeric_transformer = SimpleImputer(strategy='mean')

In [488]:
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])


In [489]:
# Create and train the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

model.fit(X_train, y_train)


In [490]:
# Make predictions
predictions = model.predict(X_test)

In [491]:
# Actual and Predicted Output
output_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(output_df)

          Actual     Predicted
1812   1300000.0  1.011069e+06
28700  2100000.0  3.116014e+06
21611   720000.0  1.011069e+06
14128  1345000.0  1.011069e+06
15507  1550000.0  2.885654e+06
...          ...           ...
31696   400000.0  1.011069e+06
7225   5200000.0  2.692181e+06
16828  1495888.0  7.869218e+05
31959   315000.0  5.876675e+05
37456  1200000.0  1.011069e+06

[10777 rows x 2 columns]
