In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder

In [16]:
df = pd.read_csv("../data/Rumah.comdataset_v4.csv")
df = df.drop(columns=['Property Link', 'ID'])
df['Listing Area'] = df['Listing Area'].str.replace(' m²', '')
df = df.astype({'Listing Area':'int64'})
#Print the shape of the dataset before removing outliers

In [17]:
#Before we start, perform outlier detection and removal (removing all rows with outlier values) for numerical columns
outliers = pd.DataFrame()


numerical_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
#Keep removing outliers until there are no more outliers (run the loop 5 times)
for i in range(10):
    #Check if there are any outliers
    for col in numerical_cols:
        # Perform outlier detection using Interquartile Range
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
#         # Before removing outliers, put the outliers in a separate dataframe
#         outliers = outliers.append(df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)])
        # Remove outliers
        df = df[(df[col] >= Q1 - 1.5*IQR) & (df[col] <= Q3 + 1.5*IQR)]



#Print the shape of the dataset after outlier removal
index1 = df[df.Price <= 100000000].index
df = df.drop(index1)
print(df.shape)

(12907, 7)


In [18]:
y = df.Price
X = df.drop(['Price'], axis=1)

In [19]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, 
                                                            train_size=0.8, test_size=0.2)

In [20]:
test_df = pd.concat([X_valid_full, y_valid], axis=1)

Unnamed: 0,Street Address,Bed,Bath,Listing Area,Certificate,Jakarta Division,Price
11411,Cengkareng,3,3,120,SHM - Sertifikat Hak Milik,WEST,2.380000e+09
15224,Cilincing,3,2,150,SHM - Sertifikat Hak Milik,NORTH,1.800000e+09
11163,Kembangan,4,3,210,SHM - Sertifikat Hak Milik,WEST,2.200000e+09
22661,Kelapa Gading,4,2,135,SHM - Sertifikat Hak Milik,NORTH,3.000000e+09
1737,Kemayoran,2,2,36,SHM - Sertifikat Hak Milik,CENTRAL,4.056400e+08
...,...,...,...,...,...,...,...
8551,Cilincing,3,2,105,SHM - Sertifikat Hak Milik,NORTH,1.500000e+09
9053,Kelapa Gading,4,2,132,SHM - Sertifikat Hak Milik,NORTH,3.650000e+09
8291,Pantai Indah Kapuk,3,3,240,PPJB - Perjanjian Pengikatan Jual Beli,NORTH,4.500000e+09
8058,Pluit,3,3,200,SHM - Sertifikat Hak Milik,NORTH,4.000000e+09


In [21]:
test_df.to_csv("../data/test_case.csv")

In [22]:
train_df = pd.concat([X_train_full, y_train], axis=1)
train_df.to_csv("../data/train_case.csv")

In [23]:
# cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
# X_train_full.drop(cols_with_missing, axis=1, inplace=True)
# X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

In [24]:
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]
high_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() >= 10 and 
                        X_train_full[cname].dtype == "object"]
low_cardinality_cols, high_cardinality_cols

(['Jakarta Division'], ['Street Address', 'Certificate'])

In [25]:
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

In [26]:
my_cols = low_cardinality_cols + high_cardinality_cols+ numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [27]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Jakarta Division', 'Street Address', 'Certificate']


In [28]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, preds)
#     r2 = r2_score(y_valid, preds)
    #Get the number of samples and features in the validation set
#     n_samples, n_features = X_train.shape
#     #Calculate the degree of freedoms for residuals
#     dof_residuals = n_samples - n_features - 1
    #Calculate the adjusted r2 score
#     adj_r2 = 1 - (1 - r2) * (dof_residuals / (n_samples - 1))
    #calculate the root mean squared error
#     rmse = np.sqrt(mean_squared_error(y_valid, preds))
    mape = mean_absolute_percentage_error(y_valid, preds)
    return mape, preds


In [30]:
# Using approach 1: Drop Categorical Variables
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE, RMSE, Adjusted R2, MPAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE, RMSE, Adjusted R2, MPAE from Approach 1 (Drop categorical variables):
(0.38450278634418084, array([2.66636517e+09, 2.63380579e+09, 4.09342260e+09, ...,
       4.63993269e+09, 4.02079801e+09, 1.55569924e+09]))


In [34]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(label_X_train, y_train)
preds = model.predict(label_X_valid)
mape = mean_absolute_percentage_error(y_valid, preds)
print(mape)

0.31537109730740914


In [35]:
import pickle
filename = "../data/random_forest.pickle"

# save model
pickle.dump(model, open(filename, "wb"))


In [36]:
# load model
loaded_model = pickle.load(open(filename, "rb"))

# you can use loaded model to compute predictions
y_predicted = loaded_model.predict(label_X_valid)