In [1]:
import os
import sys
import zipfile
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

%matplotlib inline

sys.path.append('../')
from src.data_preprocess import DataPreprocessor

pd.set_option('display.max_colwidth', 2000)

%load_ext autoreload
%autoreload 2

## Download dataset

In [38]:
if not os.listdir('../input'):
    !kaggle competitions download -c cs5228-2022-semester-1-final-project -p ../input
    Dataset = "cs5228-2022-semester-1-final-project"
    with zipfile.ZipFile(f"../input/{Dataset}.zip","r") as z:
        z.extractall("../input")

In [39]:
# Walk ../input recursively and print the path of every file whose name ends with 'csv'.
for root, _subdirs, names in os.walk('../input'):
    csv_names = [name for name in names if name.endswith('csv')]
    for name in csv_names:
        print(os.path.join(root, name))

../input\example-submission.csv
../input\test.csv
../input\train.csv
../input\auxiliary-data\sg-commerical-centres.csv
../input\auxiliary-data\sg-mrt-stations.csv
../input\auxiliary-data\sg-primary-schools.csv
../input\auxiliary-data\sg-secondary-schools.csv
../input\auxiliary-data\sg-shopping-malls.csv
../input\auxiliary-data\sg-subzones.csv


### train.csv

In [40]:
train_df = pd.read_csv('../input/train.csv')
train_df.head(2)

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,price
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb 4 rooms,,1988.0,3.0,2.0,1115,...,unspecified,,116.0,https://www.99.co/singapore/hdb/866-yishun-street-81-adxawp85buupmsq7gwdjverc,1.414399,103.837196,0,yishun south,yishun,514500.0
1,259374,hdb flat for sale in 506b serangoon north avenue 4,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,99-year leasehold,1992.0,4.0,2.0,1575,...,unspecified,"1, 2, 3, 4, 5, 6 br",,https://www.99.co/singapore/hdb/hdbserangoon-estate-demrpm6ryc3l9buf846erprb,1.372597,103.875625,0,serangoon north,serangoon,995400.0


In [41]:
print(train_df.shape)
display(train_df.isnull().sum())
display(train_df.describe())

(20254, 21)


listing_id                  0
title                       0
address                     0
property_name               0
property_type               0
tenure                   1723
built_year                922
num_beds                   80
num_baths                 434
size_sqft                   0
floor_level             16746
furnishing                  0
available_unit_types     1441
total_num_units          5652
property_details_url        0
lat                         0
lng                         0
elevation                   0
subzone                   113
planning_area             113
price                       0
dtype: int64

Unnamed: 0,listing_id,built_year,num_beds,num_baths,size_sqft,total_num_units,lat,lng,elevation,price
count,20254.0,19332.0,20174.0,19820.0,20254.0,14602.0,20254.0,20254.0,20254.0,20254.0
mean,550763.206428,2010.833695,3.122931,2.643542,1854.364,376.253938,1.434282,103.855356,0.0,5228263.0
std,258874.420108,15.822803,1.281658,1.473835,13543.43,346.882474,1.558472,3.593441,0.0,277974800.0
min,100043.0,1963.0,1.0,1.0,0.0,4.0,1.239621,-77.065364,0.0,0.0
25%,326279.0,2000.0,2.0,2.0,807.0,106.0,1.307329,103.806576,0.0,819000.0
50%,551397.0,2017.0,3.0,2.0,1119.0,296.0,1.329266,103.841552,0.0,1680000.0
75%,774044.5,2023.0,4.0,3.0,1528.0,561.0,1.372461,103.881514,0.0,3242400.0
max,999944.0,2028.0,10.0,10.0,1496000.0,2612.0,69.486768,121.023232,0.0,39242430000.0


In [42]:
fig = px.histogram(train_df, x='price')
fig.show()

## Excessive outliers

In [43]:
train_df_clean = DataPreprocessor.remove_price_outlier(train_df)
fig = px.histogram(train_df_clean, x='price')
fig.show()

## Duplicated records
* same attribute records same price
* same attribute records different price => take average (+/- 200,000)

In [45]:
train_df_clean.shape

(20252, 21)

In [46]:
train_df_clean = DataPreprocessor.remove_duplicates(train_df_clean)
train_df_clean.shape

(16131, 21)

### test.csv

In [47]:
test_df = pd.read_csv('../input/test.csv')
test_df.head(1)

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area
0,777912,1 bed condo for sale in the gazania,17 how sun drive,the gazania,condo,freehold,2022.0,1.0,1.0,463,,unfurnished,"studio, 1, 2, 3, 4, 5 br",250.0,https://www.99.co/singapore/condos-apartments/the-gazania,1.344334,103.87869,0,upper paya lebar,serangoon


In [48]:
print(test_df.shape)
display(test_df.isnull().sum())
display(test_df.describe())

(7000, 20)


listing_id                 0
title                      0
address                    2
property_name              0
property_type              0
tenure                   637
built_year               358
num_beds                  35
num_baths                152
size_sqft                  0
floor_level             5844
furnishing                 0
available_unit_types     520
total_num_units         1900
property_details_url       0
lat                        0
lng                        0
elevation                  0
subzone                   33
planning_area             33
dtype: int64

Unnamed: 0,listing_id,built_year,num_beds,num_baths,size_sqft,total_num_units,lat,lng,elevation
count,7000.0,6642.0,6965.0,6848.0,7000.0,5100.0,7000.0,7000.0,7000.0
mean,551687.994143,2010.823999,3.089591,2.624854,1709.027,373.181373,1.41664,103.853102,0.0
std,259038.092508,15.713629,1.276983,1.466597,1860.113894,333.976046,1.34329,3.408832,0.0
min,100108.0,1963.0,1.0,1.0,68.0,6.0,1.239621,-77.065364,0.0
25%,327927.0,2000.0,2.0,2.0,797.0,111.0,1.307189,103.806576,0.0
50%,549475.0,2017.0,3.0,2.0,1119.0,298.0,1.329266,103.842241,0.0
75%,775229.0,2023.0,4.0,3.0,1528.0,561.0,1.370798,103.879948,0.0
max,999981.0,2028.0,10.0,10.0,27500.0,2612.0,69.486768,121.023232,0.0


In [49]:
test_df_clean = test_df.copy()

#### Data fields
* listing_id - unique identifier of the property listing
* title - title of the property listing (e.g., "2 bed condo for sale in 35 gilstead")
* address - address of the property (e.g., "124 punggol walk", "11 sengkang east avenue")
* property_name - name of the property (e.g., "redhill rise", "klimt cairnhill")
* property_type - type of the property (e.g., "condo", "hdb 2 rooms", "landed")
* tenure - tenure of the property (e.g., "freehold", "99-year leasehold")
* built_year - year when the property was built (e.g., 2014, 2021)
* num_beds - number of bedrooms (e.g., 1, 2, 3)
* num_baths - number of bathrooms (e.g., 1, 2, 3)
* size_sqft - floor area in square feet (e.g., 807, 657, 1628)
* floor_level - information about the floor level of the property (e.g., "high", "low")
* furnishing - information whether the property is furnished (e.g., "fully", "partial")
* available_unit_types - list of all types of units available in the property complex (e.g., "studio, 3, 4, 5 br")
* total_num_units - total number of units in the property complex (e.g., 115, 200)
* property_details_url - URL linking to more information about the property complex
* lat - latitude of property (e.g., 1.328805)
* lng - longitude of property (e.g., 103.74502)
* elevation - elevation of the property in meters (e.g., 10)
* subzone - subzone of the block containing the flat (e.g., "blangah rise", "marymount")
* planning_area - planning area of block containing the flat (e.g., "woodlands", "bukit merah")
* price - sales price in SGD

In [50]:
train_df_clean.head(1)

Unnamed: 0,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,...,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,listing_id,price
0,1 bed condo for sale in 10 evelyn,10 evelyn road,10 evelyn,Condo,freehold,2022.0,1.0,1.0,495,,...,"studio, 1, 2, 3 br",56.0,https://www.99.co/singapore/condos-apartments/10-evelyn,1.31629,103.840576,0,moulmein,novena,667980.0,1424800.0


In [51]:
# List every column after the first one; the per-column histogram calls
# are kept commented out.
for col in train_df_clean.columns[1:]:
    print(col)
    # fig = px.histogram(train_df_clean, x=col)
    # fig.show()

address
property_name
property_type
tenure
built_year
num_beds
num_baths
size_sqft
floor_level
furnishing
available_unit_types
total_num_units
property_details_url
lat
lng
elevation
subzone
planning_area
listing_id
price


In [52]:
drop_attributes = {'listing_id'}

## auxiliary data

In [53]:
# Print the path and show the first row of every CSV found under the auxiliary-data folder.
for root, _subdirs, names in os.walk('../input/auxiliary-data/'):
    for name in names:
        if not name.endswith('csv'):
            continue
        csv_path = os.path.join(root, name)
        print(csv_path)
        aux_df = pd.read_csv(csv_path)
        display(aux_df.head(1))

../input/auxiliary-data/sg-commerical-centres.csv


Unnamed: 0,name,type,lat,lng,subzone,planning_area
0,Central Business District,CR,1.286768,103.854529,clifford pier,downtown core


../input/auxiliary-data/sg-mrt-stations.csv


Unnamed: 0,code,line,name,opening_year,lat,lng,subzone,planning_area
0,cc1,cc,dhoby ghaut,2010,1.298912,103.846293,dhoby ghaut,museum


../input/auxiliary-data/sg-primary-schools.csv


Unnamed: 0,name,lat,lng,subzone,planning_area
0,Admiralty Primary School,1.442941,103.800345,woodlands east,serangoon


../input/auxiliary-data/sg-secondary-schools.csv


Unnamed: 0,name,lat,lng,subzone,planning_area
0,Admiralty Secondary School,1.445912,103.802908,woodlands east,woodlands


../input/auxiliary-data/sg-shopping-malls.csv


Unnamed: 0,name,lat,lng,subzone,planning_area
0,10 AM,1.275568,103.863591,marina south,marina south


../input/auxiliary-data/sg-subzones.csv


Unnamed: 0,name,area_size,population,planning_area
0,ang mo kio town centre,0.3169,4810,ang mo kio


## 1. title
* no missing value
* containing following attributes (overlapped with other fields):
    1. property_type
    2. for sale
    3. location
* usage:
    1. not useful for modeling
    2. sanity check for other attributes (e.g., address)
    3. impute for other attributes
        * num_beds: 80 NaN
        * subzone/ planning_area: 113 NaN

In [54]:
train_df_clean = DataPreprocessor.preprocess_title(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_title(test_df_clean)

In [55]:
display(train_df_clean.loc[:, ['title_property_type','property_type']].head(3))
display(train_df_clean.loc[:, ['title_n_beds','num_beds']].head(3)) ## if hdb flat,can get from property_type
display(train_df_clean.loc[:, ['title_address','address']].head(3))

Unnamed: 0,title_property_type,property_type
0,condo,Condo
1,condo,Condo
2,condo,apartment


Unnamed: 0,title_n_beds,num_beds
0,1,1.0
1,1,1.0
2,1,1.0


Unnamed: 0,title_address,address
0,10 evelyn,10 evelyn road
1,10 evelyn,10 evelyn road
2,10 evelyn,10 evelyn road


In [56]:
train_df_clean.title_n_beds.unique()

array(['1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'hdb flat'],
      dtype=object)

In [57]:
drop_attributes.add('title')
drop_attributes.add('title_property_type')
drop_attributes.add('title_n_beds')
drop_attributes.add('title_address')

## 2. address
* no missing value
* not useful itself as one attribute for model
* may be useful for imputing subzone/planning_area (113 NaN each)

In [58]:
train_df_clean['address'].unique()

array(['10 evelyn road', '10 shelford road', '18 woodsville close', ...,
       '32 middle road', '112 punggol walk', 'serangoon terrace'],
      dtype=object)

In [69]:
drop_attributes.add('address')

## lat & lng

### Notes: wrong records -> some listings are geocoded far outside Singapore (Philippines, Norway & Washington, DC)!

In [59]:
train_df_clean['lat'] = train_df_clean['lat'].astype(float)
train_df_clean['lng'] = train_df_clean['lng'].astype(float)
# fig = px.histogram(train_df_clean[(zscore(train_df_clean['lng']) > -1) & (zscore(train_df_clean['lng']) < 1)], x='lng')
fig = px.histogram(train_df_clean, x='lat')
fig.show()
fig = px.histogram(train_df_clean, x='lng')
fig.show()

In [68]:
def fu(sub_df):
    """Print addresses and coordinates of a group with out-of-range lat and lng.

    Nothing is printed unless the group contains at least one row whose
    ``lat`` is not strictly between 1 and 2 and at least one row whose
    ``lng`` is not strictly between 103 and 105. The two rows do not have
    to be the same row. Always returns None.
    """
    lat_in_range = (sub_df['lat'] > 1) & (sub_df['lat'] < 2)
    lng_in_range = (sub_df['lng'] > 103) & (sub_df['lng'] < 105)
    # A missing value fails both comparisons and therefore counts as out of range.
    if lat_in_range.all() or lng_in_range.all():
        return
    print(sub_df['address'].unique())
    print(sub_df['title_address'].unique())
    print(f"lats: {sub_df['lat'].unique()}")
    print(f"lngs: {sub_df['lng'].unique()}")
train_df_clean.groupby('address').apply(fu)

['1 tessensohn road']
['1953']
lats: [14.4848138]
lngs: [121.0232316]
['15 farrer drive']
['pollen & bleu']
lats: [69.4867678]
lngs: [20.1844341]
['17 farrer drive']
['pollen & bleu']
lats: [69.4867678]
lngs: [20.1844341]
['38 lorong 32 geylang']
['ness']
lats: [38.9427759]
lngs: [-77.06536425]
['5 jalan mutiara']
['m5']
lats: [14.4848138]
lngs: [121.0232316]


In [71]:
train_df_clean = DataPreprocessor.preprocess_lat_lng(train_df_clean)
fig = px.histogram(train_df_clean, x='lat')
fig.show()
fig = px.histogram(train_df_clean, x='lng')
fig.show()

## 3. property_name
* no missing value
* not useful itself as one attribute for model
* distinct property name can help to impute data (e.g, built year) => but too noisy!

In [None]:
# train_df_clean['property_name_clean'] = train_df_clean['property_name'].str.split('@').str[0].str.strip()
# print(str(train_df_clean['property_name'].unique().tolist()[:100]))
# print(str(train_df_clean['property_name_clean'].unique().tolist()[:100]))
# print(len(train_df_clean['property_name'].unique()))
# print(len(train_df_clean['property_name_clean'].unique()))

In [None]:
drop_attributes.add('property_name')

## 4. property_type
* useful as ordinal cat (possible ranking)

### dirty records
1. inconsistent letter case (e.g. "Condo" vs "condo")
2. hdb vs hdb {n} rooms vs Hdb Executive
3. good class bungalow vs bungalow?
4. different types of house and condo
5. walk-up? land only?

In [None]:
fig = px.box(train_df_clean, x="property_type", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_property_type(train_df_clean)#[['title_property_type','property_type','property_type_info']]
test_df_clean = DataPreprocessor.preprocess_property_type(test_df_clean, test=True)

In [None]:
# Median price per cleaned property type, ascending; used to order the box-plot categories.
# 'price' is selected before aggregating so the median is never computed on the
# non-numeric columns, which pandas >= 2.0 rejects with a TypeError instead of
# silently dropping them.
median_price_by_type = train_df_clean.groupby('property_type_clean')['price'].median().sort_values()
fig = px.box(train_df_clean.sort_values('price'), x="property_type_clean", y="price")
fig.update_xaxes(categoryorder='array', categoryarray=median_price_by_type.index.to_list())
fig.show()

# Median price per property_type_cat value, ascending.
median_price_by_cat = train_df_clean.groupby('property_type_cat')['price'].median().sort_values().reset_index()
fig = px.scatter(median_price_by_cat, x="property_type_cat", y="price")
fig.show()

In [None]:
drop_attributes.add('property_type')
drop_attributes.add('property_type_clean')

## tenure
* missing 1595

In [None]:
train_df_clean['tenure'].isnull().sum()

In [None]:
fig = px.box(train_df_clean, x="tenure", y="price")
fig.show()

In [None]:
# train_df_clean.groupby('tenure').count()
fig = px.histogram(train_df_clean, x='tenure')
fig.show()

![alt text](../ref/lease.png "tenure type")

In [None]:
train_df_clean = DataPreprocessor.preprocess_tenure(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_tenure(test_df_clean, test=True)


# Median price per tenure value, ascending; used to order the box-plot categories.
# 'price' is selected before aggregating so the median is never computed on the
# non-numeric columns, which pandas >= 2.0 rejects with a TypeError.
median_price_by_tenure = train_df_clean.groupby('tenure')['price'].median().sort_values()
fig = px.box(train_df_clean.sort_values('price'), x="tenure", y="price")
fig.update_xaxes(categoryorder='array', categoryarray=median_price_by_tenure.index.to_list())
fig.show()

# Median price per tenure_cat value, ascending.
median_price_by_tenure_cat = train_df_clean.groupby('tenure_cat')['price'].median().sort_values().reset_index()
fig = px.scatter(median_price_by_tenure_cat, x="tenure_cat", y="price")
fig.show()

In [None]:
drop_attributes.add('tenure')

## built_year
* missing 789

In [None]:
train_df_clean['built_year'].isnull().sum()

### Imputation


#### same property has different built year (too noisy)

In [None]:
# Run the built-year preprocessing with uncertain=True, then snapshot the result
# in df_ for the inspection cell that follows. (The earlier pre-imputation copy
# was overwritten before it was ever read, so it is not taken any more.)
train_df_clean = DataPreprocessor.preprocess_built_year(train_df_clean, uncertain=True)
df_ = train_df_clean.copy()
# Number of rows still missing built_year after this pass.
train_df_clean['built_year'].isnull().sum()

In [None]:
def fu(sub_df):
    """Print and display a group whose unique ``built_year`` values sum to NaN.

    Groups whose unique built years sum to a number are skipped silently.
    Always returns None.
    """
    years = sub_df['built_year'].unique()
    if not np.isnan(np.sum(years)):
        return
    print(years)
    display(sub_df[['block_number', 'property_type_clean','lat', 'lng','built_year']])
    print("="*40)

# df_ = DataPreprocessor.preprocess_built_year(train_df_clean, uncertain=True)
df_.groupby(['lat', 'lng', 'property_type_clean'], dropna=False).apply(fu)

In [None]:
train_df_clean = DataPreprocessor.preprocess_built_year(train_df_clean, False)
test_df_clean = DataPreprocessor.preprocess_built_year(test_df_clean, False)
train_df_clean['built_year'].isnull().sum()

In [None]:
fig = px.box(train_df_clean, x="built_year", y="price")
fig.show()

In [None]:
drop_attributes.add('block_number')
drop_attributes.add('lat_lowres')
drop_attributes.add('lng_lowres')

## num_beds
train_missing = 70

In [None]:
print(train_df_clean['num_beds'].isnull().sum())
train_df_clean = DataPreprocessor.preprocess_num_beds(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_num_beds(test_df_clean)
print(train_df_clean['num_beds'].isnull().sum())

In [None]:
fig = px.box(train_df_clean, x="num_beds", y="price")
fig.show()

## num_baths
train_missing = 306

In [None]:
print(train_df_clean['num_baths'].isnull().sum())

In [None]:
df_ = DataPreprocessor.preprocess_num_baths(train_df_clean, False)

In [None]:
print(df_['num_baths'].isnull().sum())

In [None]:
fig = px.box(train_df_clean, x="num_baths", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_num_baths(train_df_clean, False)
test_df_clean = DataPreprocessor.preprocess_num_baths(test_df_clean, False)

## size_sqft
train_missing = 0

In [None]:
print(train_df_clean['size_sqft'].isnull().sum())

In [None]:
train_df_clean = DataPreprocessor.preprocess_size_sqft(train_df_clean, True)
test_df_clean = DataPreprocessor.preprocess_size_sqft(test_df_clean, True)

In [None]:
fig = px.histogram(train_df_clean, x='size_sqft')
fig.show()


In [None]:
fig = px.scatter(train_df_clean, x="size_sqft", y="price", hover_data = list(train_df_clean.columns))
fig.show()

## floor_level
train_missing = 12777

In [None]:
print(train_df_clean['floor_level'].isnull().sum())

In [None]:
print(train_df_clean["floor_level"].unique())
train_df_clean["property_type_clean"].unique()

In [None]:
df_ = DataPreprocessor.preprocess_floor_level(train_df_clean)

In [None]:
# Combine the two level categories into one label of the form "<total_level_cat>_<floor_level_cat>".
df_["floor & total"] = df_["total_level_cat"].astype(str) + "_" + df_["floor_level_cat"].astype(str)
fig = px.box(df_, x="floor & total", y="price")
fig.show()

# Mean price per combined label, ascending. 'price' is selected before aggregating
# so the mean is never computed on the non-numeric columns, which pandas >= 2.0
# rejects with a TypeError instead of silently dropping them.
mean_price_by_floor = df_.groupby('floor & total')['price'].mean().sort_values().reset_index()
fig = px.scatter(mean_price_by_floor, x="floor & total", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_floor_level(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_floor_level(test_df_clean, test=True)
drop_attributes.add('floor_level')

In [None]:
# # train_df_clean['floor_level'].unique()
# def display_m(sub):
#     FLOOR_LEVEL_TYPE = ["condo", "apartment", "executive condo", "hdb", "hdb executive"]
#     if sub['property_type_clean'].unique()[0] in FLOOR_LEVEL_TYPE:
#         print(sub['property_type_clean'].unique())
#         print(sub["floor_level"].unique())
#         display(sub[["floor_level", "elevation"]])
#         print('='*50)
# #     if ('nan' in sub['floor_level'].astype(str).unique()) & (sub['property_type_clean'].unique()[0]=='apartment'):
# #         display(sub[['property_type_clean','floor_level']])

# df_.groupby(['property_type_clean'], dropna=False).apply(display_m)

## furnishing
train_missing = 0

In [None]:
print(train_df_clean['furnishing'].isnull().sum())
print(train_df_clean["furnishing"].unique())

In [None]:
# Exploratory run of the furnishing preprocessing; the result is only used for the plots below.
df_ = DataPreprocessor.preprocess_furnishing(train_df_clean)
fig = px.box(df_, x="furnishing_cat", y="price")
fig.show()

# Mean price per furnishing_cat value, ascending. 'price' is selected before
# aggregating so the mean is never computed on the non-numeric columns, which
# pandas >= 2.0 rejects with a TypeError.
mean_price_by_furnishing = df_.groupby('furnishing_cat')['price'].mean().sort_values().reset_index()
fig = px.scatter(mean_price_by_furnishing, x="furnishing_cat", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_furnishing(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_furnishing(test_df_clean)
drop_attributes.add('furnishing')

## available_unit_types

In [None]:
# Exploratory run on a copy: the original code took a copy and then discarded it,
# passing train_df_clean itself to the preprocessor.
# NOTE(review): whether preprocess_available_unit_types mutates its argument is not
# visible from this notebook; working on the copy is safe either way.
df_ = DataPreprocessor.preprocess_available_unit_types(train_df_clean.copy())
df_[["number_of_types_available", "has_studio", "min_br_available", "max_br_available"]]

In [None]:
# df_ = DataPreprocessor.preprocess_available_unit_types(train_df_clean)
fig = px.box(df_, x="max_br_available", y="price")
fig.show()

# Mean price per max_br_available value, ascending. 'price' is selected before
# aggregating so the mean is never computed on the non-numeric columns, which
# pandas >= 2.0 rejects with a TypeError.
mean_price_by_max_br = df_.groupby('max_br_available')['price'].mean().sort_values().reset_index()
fig = px.scatter(mean_price_by_max_br, x="max_br_available", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_available_unit_types(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_available_unit_types(test_df_clean)
drop_attributes.add('available_unit_types')

## total_num_units

In [None]:
# df_ = DataPreprocessor.preprocess_available_unit_types(train_df_clean)
# NOTE(review): df_ here is whatever an earlier cell last assigned to it.
fig = px.box(df_, x="total_num_units", y="price")
fig.show()

# Mean price per total_num_units value, ascending. 'price' is selected before
# aggregating so the mean is never computed on the non-numeric columns, which
# pandas >= 2.0 rejects with a TypeError.
mean_price_by_units = df_.groupby('total_num_units')['price'].mean().sort_values().reset_index()
fig = px.scatter(mean_price_by_units, x="total_num_units", y="price")
fig.show()

## property_details_url, elevation
### drop

In [None]:
drop_attributes.add('property_details_url')
drop_attributes.add('elevation')

## subzone
### missing

In [None]:
print(train_df_clean['subzone'].isnull().sum())

In [None]:
# subzone_impute_dict = {
#     "1953": "farrer park",
#     "m5": "tanglin", 
#     "ness" "gui":
#     "pollen & bleu"
# }

In [None]:
drop_attributes.add('subzone')

## planning_area
### missing

In [None]:
print(train_df_clean['planning_area'].isnull().sum())

In [None]:
df_ = DataPreprocessor.preprocess_planning_area(train_df_clean)
print(df_['planning_area'].isnull().sum())
df_['planning_area'].unique()

In [None]:
fig = px.box(df_, x="planning_area_cat", y="price")
fig.show()

# Mean price per planning_area_cat value, ascending. 'price' is selected before
# aggregating so the mean is never computed on the non-numeric columns, which
# pandas >= 2.0 rejects with a TypeError.
mean_price_by_area = df_.groupby('planning_area_cat')['price'].mean().sort_values().reset_index()
fig = px.scatter(mean_price_by_area, x="planning_area_cat", y="price")
fig.show()

In [None]:
train_df_clean = DataPreprocessor.preprocess_planning_area(train_df_clean)
test_df_clean = DataPreprocessor.preprocess_planning_area(test_df_clean)
drop_attributes.add('planning_area')

In [None]:

# def fu(sub_df):
#     if 'nan' in sub_df['subzone'].astype(str).unique():
#         print(sub_df['subzone'].unique())
#         print(sub_df['title_address'].unique())
#         display(sub_df[['block_number', 'lat', 'lng','subzone', 'planning_area', 'title_address', 'property_type_clean']])
#         print("="*40)

# # df_ = DataPreprocessor.preprocess_built_year(train_df_clean, uncertain=True)
# df_.groupby(['lat','lng'], dropna=False).apply(fu)

In [None]:
# geo_resolution = 3

# df_['lat_res'] = df_['lat'].round(geo_resolution)
# df_['lng_res'] = df_['lng'].round(geo_resolution)
# temp_df1 = df_.groupby(['lat_res', 'lng_res'], dropna=False).apply(lambda x: x['built_year'].unique()).reset_index().rename(columns={0: 'built_year'})
# display(temp_df1[(temp_df1['built_year'].apply(lambda x: 'nan' in str(x))) & (temp_df1['built_year'].apply(lambda x: len(x) == 2))].shape)
# display(temp_df1[(temp_df1['built_year'].apply(lambda x: 'nan' in str(x))) & (temp_df1['built_year'].apply(lambda x: len(x) == 2))].head())

# df_ = df_.groupby(['lat_res', 'lng_res'], dropna=False).apply(DataPreprocessor.impute_built_year_unify)
# temp_df1 = df_.groupby(['lat_res', 'lng_res'], dropna=False).apply(lambda x: x['built_year'].unique()).reset_index().rename(columns={0: 'built_year'})
# display(temp_df1[(temp_df1['built_year'].apply(lambda x: 'nan' in str(x))) & (temp_df1['built_year'].apply(lambda x: len(x) == 2))].shape)
# # print(df_['built_year'].isnull().sum())
# print("="*50)

# display(temp_df1[(temp_df1['built_year'].apply(lambda x: 'nan' in str(x))) & (temp_df1['built_year'].apply(lambda x: len(x) > 2))].head())

* listing_id - unique identifier of the property listing
* title - title of the property listing (e.g., "2 bed condo for sale in 35 gilstead")
* address - address of the property (e.g., "124 punggol walk", "11 sengkang east avenue")
* property_name - name of the property (e.g., "redhill rise", "klimt cairnhill")
* property_type - type of the property (e.g., "condo", "hdb 2 rooms", "landed")
* tenure - tenure of the property (e.g., "freehold", "99-year leasehold")
* built_year - year when the property was built (e.g., 2014, 2021)
* num_beds - number of bedrooms (e.g., 1, 2, 3)
* num_baths - number of bathrooms (e.g., 1, 2, 3)
* size_sqft - floor area in square feet (e.g., 807, 657, 1628)
* floor_level - information about the floor level of the property (e.g., "high", "low")
* furnishing - information whether the property is furnished (e.g., "fully", "partial")
* available_unit_types - list of all types of units available in the property complex (e.g., "studio, 3, 4, 5 br")
* total_num_units - total number of units in the property complex (e.g., 115, 200)
* property_details_url - URL linking to more information about the property complex
* lat - latitude of property (e.g., 1.328805)
* lng - longitude of property (e.g., 103.74502)
* elevation - elevation of the property in meters (e.g., 10)
* subzone - subzone of the block containing the flat (e.g., "blangah rise", "marymount")
* planning_area - planning area of block containing the flat (e.g., "woodlands", "bukit merah")
* price - sales price in SGD

# Consolidation

In [None]:
train_df_model = train_df_clean.drop(drop_attributes, axis=1,inplace=False).reset_index(drop=True)
test_df_clean = test_df_clean.drop(drop_attributes, axis=1,inplace=False).reset_index(drop=True)

In [None]:
train_df_model.head()

In [None]:
train_df_model.head()

In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

train_df_model_nonNa = train_df_model.dropna()

X_nonNa = train_df_model_nonNa.drop('price', axis=1,inplace=False).astype(float)
y_nonNa = train_df_model_nonNa['price'].astype(float)
X = train_df_model.drop('price', axis=1,inplace=False).astype(float)
y = train_df_model['price'].astype(float)

In [None]:
X

In [None]:
import warnings

# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

# ---- XGBoost: search space, fit options and metrics ----
xgb_reg_params = {
    'max_depth': hp.quniform("max_depth", 5, 50, 5),
    # 'gamma': hp.uniform ('gamma', 1,9),
    # 'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    # 'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0,1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 1000,
}
xgb_fit_params = {'eval_metric': 'rmse', 'early_stopping_rounds': 10, 'verbose': False}
# Bundle handed to hyperopt as the search space: RMSE is the loss, R^2 the score.
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
    'score_func': lambda y, pred: r2_score(y, pred),
}

# ---- Random forest: search space, fit options and metrics ----
random_forest_reg_params = {
    'n_estimators': hp.uniform('n_estimators',100,500),
    'max_depth': hp.uniform('max_depth',5,50),
    'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
    'min_samples_split': hp.uniform('min_samples_split',2,6),
}
random_forest_fit_params = {'eval_metric': 'rmse', 'early_stopping_rounds': 10, 'verbose': False}
# Same bundle layout and metrics as the XGBoost one.
random_forest_para = {
    'reg_params': random_forest_reg_params,
    'fit_params': random_forest_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
    'score_func': lambda y, pred: r2_score(y, pred),
}



class HPOpt(object):
    """Hyperopt driver that evaluates regressors on one fixed train/test split.

    Each ``*_reg`` method is an objective for ``fmin``: it builds a regressor
    from the sampled ``para['reg_params']``, fits it on the training split and
    scores its predictions on the test split.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the split that every trial is trained and evaluated on."""
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` with the objective method called ``fn_name``.

        Args:
            fn_name: name of the objective method on this object (e.g. ``'xgb_reg'``).
            space: search space passed to ``fmin``.
            trials: trials object passed to ``fmin`` and returned on success.
            algo: search algorithm passed to ``fmin``.
            max_evals: maximum number of evaluations passed to ``fmin``.

        Returns:
            ``(result, trials)`` where ``result`` is the value returned by
            ``fmin``. If ``fmin`` raises, a dict with ``'status'`` set to
            ``STATUS_FAIL`` and ``'exception'`` set to the error text is
            returned instead, so callers must check the type before indexing.
        """
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            # Make the failure visible; previously it was only readable by
            # inspecting the returned dict.
            print(f"{fn_name}: hyperparameter search failed: {e}")
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials

    def xgb_reg(self, para):
        """Objective for XGBoost: build an ``XGBRegressor`` from ``para['reg_params']``."""
        space = para['reg_params']
        reg = xgb.XGBRegressor(
            n_estimators=space['n_estimators'],
            max_depth=int(space['max_depth']),
#             gamma = space['gamma'],
#             reg_alpha = int(space['reg_alpha']),
            min_child_weight=int(space['min_child_weight']),
            # colsample_bytree is a fraction; int() truncated every sampled
            # value below 1 to 0, so it must be passed as a float.
            colsample_bytree=float(space['colsample_bytree']),
            n_jobs=-1
        )
        return self.train_reg(reg, para)

    def random_forest_reg(self, para):
        """Objective for random forest: build a ``RandomForestRegressor`` from ``para['reg_params']``."""
        space = para['reg_params']
        # The sampled values are cast to int before being passed on.
        reg = RandomForestRegressor(
            n_estimators=int(space['n_estimators']),
            max_depth=int(space['max_depth']),
            min_samples_leaf=int(space['min_samples_leaf']),
            min_samples_split=int(space['min_samples_split']),
            n_jobs=-1
        )
        return self.train_reg(reg, para)

#     def ctb_reg(self, para):
#         reg = ctb.CatBoostRegressor(**para['reg_params'])
#         return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        """Fit ``reg`` on the training split and evaluate it on the test split.

        Returns a dict with ``'loss'`` (``para['loss_func']`` on the test
        predictions), ``'score'`` (``para['score_func']``) and ``'status'``
        set to ``STATUS_OK``. ``para['fit_params']`` is not passed to ``fit``.
        """
        reg.fit(self.x_train, self.y_train)
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        score = para['score_func'](self.y_test, pred)
        return {'loss': loss, 'score': score,'status': STATUS_OK}

X_train, X_test, y_train, y_test = train_test_split(X, y)
obj = HPOpt(X_train, X_test, y_train, y_test)
xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=100)

X_train, X_test, y_train, y_test = train_test_split(X_nonNa, y_nonNa)
obj = HPOpt(X_train, X_test, y_train, y_test)
random_forest_opt = obj.process(fn_name='random_forest_reg', space=random_forest_para, trials=Trials(), algo=tpe.suggest, max_evals=100)

In [None]:
xgb_opt

In [None]:
# Rebuild an XGBoost regressor from the first element of xgb_opt (the value
# returned by fmin) and fit it on a fresh random split.
space = xgb_opt[0]
regressor = xgb.XGBRegressor(
    n_estimators=180,
    max_depth=int(space['max_depth']),
#     gamma = space['gamma'],
#     reg_alpha = int(space['reg_alpha']),
    min_child_weight=int(space['min_child_weight']),
    # colsample_bytree is a fraction; int() truncated every value below 1
    # to 0, so it must be passed as a float.
    colsample_bytree=float(space['colsample_bytree']))
X_train, X_test, y_train, y_test = train_test_split(X, y)
regressor.fit(X_train, y_train)
# regressor

In [None]:
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(regressor, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores

In [None]:
random_forest_opt

In [None]:
space = random_forest_opt[0]
regressor = RandomForestRegressor(
    n_estimators=int(space['n_estimators']),
    max_depth=int(space['max_depth']),
    min_samples_leaf=int(space['min_samples_leaf']),
    min_samples_split=int(space['min_samples_split']),
    n_jobs=-1
)
X_train, X_test, y_train, y_test = train_test_split(X_nonNa, y_nonNa)
regressor.fit(X_train, y_train)

In [None]:
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(regressor, X_nonNa, y_nonNa, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores

In [None]:
from explainerdashboard import RegressionExplainer

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_nonNa, y_nonNa)
regressor.fit(X_train, y_train)

explainer = RegressionExplainer(regressor, X_test, y_test)

In [None]:
from explainerdashboard import ExplainerDashboard
ExplainerDashboard(explainer).run()