In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Load the raw dataset from the CSV file kept in the sibling data directory.
csv_path = "../data/Rumah.comdataset_v5_modified.csv"
df = pd.read_csv(csv_path)

In [3]:
# Separate the prediction target (Price) from the feature columns.
y = df["Price"]
X = df.drop(columns=["Price"])

In [4]:
# Hold out 20% of the rows for validation.
# The seed is fixed so that the split -- and therefore every metric computed
# below -- is identical after "Restart Kernel -> Run All". Without it each run
# evaluates on a different validation set and the reported scores drift.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

In [5]:
# Categorical columns, grouped by the encoding applied to them further down:
# the low-cardinality group is one-hot encoded, the high-cardinality group is
# ordinal encoded.
low_cardinality_cols = ["Jakarta Division"]
high_cardinality_cols = [
    "Street Address",
    "Certificate",
]

In [6]:
# Names of the feature columns stored as int64 or float64.
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [7]:
# Keep only the columns that will be fed to the model, in a fixed order:
# one-hot candidates, ordinal candidates, then the numeric features.
my_cols = [*low_cardinality_cols, *high_cardinality_cols, *numerical_cols]

# Work on copies so the full train/validation frames stay untouched.
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()

In [8]:
# Collect the names of the columns whose dtype is object (the categorical ones)
is_object = X_train.dtypes == 'object'
object_cols = list(is_object[is_object].index)

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Jakarta Division', 'Street Address', 'Certificate']


In [14]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, i):
    """Fit a random forest with fixed hyperparameters and score it on the validation set.

    Parameters
    ----------
    X_train, y_train
        Features and target the forest is fitted on.
    X_valid, y_valid
        Features and target the fitted forest is evaluated on.
    i
        Passed to the forest as ``random_state``; different values give
        differently seeded forests.

    Returns
    -------
    The mean absolute percentage error between ``y_valid`` and the
    predictions, as a fraction (e.g. 0.186 for 18.6%).
    """
    model = RandomForestRegressor(n_estimators=2360, min_samples_split=2,
                                  min_samples_leaf=2, max_features=1.0,
                                  max_depth=90, bootstrap=True, random_state=i)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    # Only MAPE is reported; the MAE that used to be computed here was never used.
    return mean_absolute_percentage_error(y_valid, preds)


In [10]:
# Randomized hyperparameter search for the random forest.
# Returns the validation MAPE together with the best parameter set found.
def optimize_rf(X_train, X_valid, y_train, y_valid):
    """Search random-forest hyperparameters with MAPE as the selection metric.

    Runs a 100-iteration, 3-fold ``RandomizedSearchCV`` on the training data,
    then scores the fitted search object's predictions on the validation data.

    Returns
    -------
    tuple
        ``(mape, best_params)`` where ``mape`` is the mean absolute percentage
        error on ``(X_valid, y_valid)`` and ``best_params`` is the search's
        ``best_params_`` dictionary.
    """
    search_space = {
        'n_estimators': list(map(int, np.linspace(start=50, stop=2360, num=10))),
        'max_features': [1.0, 'sqrt'],
        # depths 10, 20, ..., 110 plus None as an extra candidate
        'max_depth': list(map(int, np.linspace(10, 110, num=11))) + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }

    # The target metric is MAPE; it is an error, hence greater_is_better=False.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    search = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=0),
        param_distributions=search_space,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring=mape_scorer,
    )
    search.fit(X_train, y_train)

    valid_mape = mean_absolute_percentage_error(y_valid, search.predict(X_valid))
    return valid_mape, search.best_params_

In [11]:
# Make copy to avoid changing original data
mix_X_train = X_train.copy()
mix_X_valid = X_valid.copy()

# Ordinal-encode the high-cardinality columns only. The encoder is fitted on
# the training split; categories it has not seen are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
mix_X_train[high_cardinality_cols] = ordinal_encoder.fit_transform(X_train[high_cardinality_cols])
mix_X_valid[high_cardinality_cols] = ordinal_encoder.transform(X_valid[high_cardinality_cols])

# One-hot encode the low-cardinality columns only, producing dense arrays.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the old
# one on releases that do not know it yet.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(mix_X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(mix_X_valid[low_cardinality_cols]))

# The one-hot frames get integer column labels; cast them to str so the final
# feature table has string-only column names (avoids scikit-learn warnings).
OH_cols_train.columns = OH_cols_train.columns.astype('str')
OH_cols_valid.columns = OH_cols_valid.columns.astype('str')

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = mix_X_train.drop(low_cardinality_cols, axis=1)
num_X_valid = mix_X_valid.drop(low_cardinality_cols, axis=1)

# Add one-hot encoded columns to the ordinal-encoded and numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [None]:
# Validation MAPE of a single fit, using 0 as the forest's random_state.
single_run_mape = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid, 0)
print(single_run_mape)

In [32]:
# Fit and score the model 50 times, once per random_state in 0..49, to check
# that the result is reliable (i.e. meets the target of MAPE < 20%) rather than
# being an artefact of one particular seed.
MAPE_results = []
for i in range(50):
    MAPE_results.append(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid, i))
    

In [33]:
# Average MAPE over all the seeded runs collected above
mean_mape = sum(MAPE_results) / len(MAPE_results)
print(mean_mape)

0.18645043471759842


In [34]:
# Train the final model that gets persisted below.
# The seed is written out explicitly instead of reusing the loop variable `i`
# left over from the 50-run evaluation cell: that relied on hidden kernel state
# and raised NameError whenever the loop cell had not been run first. 49 is the
# value `i` held after that loop, so the fitted model is unchanged.
model = RandomForestRegressor(n_estimators=2360, min_samples_split=2,
                              min_samples_leaf=2, max_features=1.0,
                              max_depth=90, bootstrap=True, random_state=49)
model.fit(OH_X_train, y_train)

RandomForestRegressor(max_depth=90, max_features=1.0, min_samples_leaf=2,
                      n_estimators=2360, random_state=49)

In [35]:
# Persist the fitted model with pickle.
# NOTE(review): this is an absolute, user-specific path; it only exists on the
# original author's machine. Consider a path relative to the project instead.
filename = "/Users/an-uigeun/Desktop/무제 폴더/random_forest.pickle"
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() call previously leaked the handle.
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)