In [30]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath('../src'))

from load_data import load_raw
from preprocess import clean_data, select_column, rename, sales_and_residential, drop_na
from train_model import train_random_forest
from evaluate import evaluate_model, compute_shap_local, compute_shap_global


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Step 1: Load the data and remove the records that do not belong to our target cities

In [None]:
import pandas as pd

# Location of the raw transactions CSV.
# Set the ALL_TRANSACTIONS_CSV environment variable to point at your own copy of
# the file; when it is unset (or empty) the original author's local path is used,
# so existing setups keep working unchanged.
path = os.environ.get("ALL_TRANSACTIONS_CSV") or r"C:\Users\ktlee\Downloads\all_transactions.csv"

# Load the raw transactions through the project's load_raw helper.
df_raw = load_raw(path)



In [None]:
# Preprocessing pipeline built from the helpers imported from preprocess.
# Each stage takes the previous stage's frame and stores its result under a new
# name, so df_raw is never overwritten and the cell can be re-run safely.
df_renamed = rename(df_raw)
df_selected = select_column(df_renamed)
# NOTE(review): judging by its name, this presumably keeps only sale
# transactions on residential properties — confirm in preprocess.py.
df_filtered = sales_and_residential(df_selected)
# NOTE(review): presumably drops rows with missing values — confirm in preprocess.py.
df_dropped = drop_na(df_filtered)

In [29]:
# Sanity check: list the distinct values of 'Type of property' in the filtered frame.
print(df_filtered['Type of property'].unique())

['Appartement' 'Maison']


Step 2: For each city, pre-process the data, train a model, and evaluate it

In [None]:
# Training strategy applied to every city.  <-- Change this
#   1: call train_random_forest(X, y) and let it find the best model itself
#   2: call train_random_forest(X, y, param_grid) with the hand-picked grid below
case_model = 1

if case_model not in (1, 2):
    # Fail before any training starts instead of reaching evaluate_model with an
    # undefined (or stale) `model`.
    raise ValueError(f"case_model must be 1 or 2, got {case_model!r}")

# Work on df_dropped, the final frame of the preprocessing pipeline. (A bare `df`
# is not defined by any cell, so reading from it only works when a stale variable
# happens to be left in the kernel.)
cities = df_dropped['City'].unique()

# Maps each city to the value returned by evaluate_model for that city's model.
metrics_table = {}

for city in cities:
    # Copy the city's rows so clean_data cannot touch the shared frame.
    df_city = df_dropped[df_dropped['City'] == city].copy()
    X, y = clean_data(df_city)

    if case_model == 1:
        # Case 1: let the program find the best model.
        model, params, X_test, y_test = train_random_forest(X, y)
    else:
        # Case 2: self-defined model settings.
        param_grid = {
            'n_estimators': [2500],
            'max_depth': [32],
            'bootstrap': [False],
        }
        model, params, X_test, y_test = train_random_forest(X, y, param_grid)

    metrics = evaluate_model(model, X_test, y_test)
    metrics_table[city] = metrics
    print(f"City {city}'s model training is done.")

# NOTE: once the loop finishes, `model`, `params`, `X`, `y`, `X_test` and `y_test`
# hold the values of the last city only.

Step 3: SHAP analysis

In [None]:
import shap
# NOTE(review): `model` and `X` are not defined in this cell. They are the
# variables left behind by the final iteration of the per-city training loop, so
# this explains only the last city in `cities` — confirm that is intended.
explainer = shap.TreeExplainer(model)
# SHAP values are computed on `X` as returned by clean_data, not on the
# `X_test` returned by train_random_forest.
shap_values = explainer.shap_values(X)
# Summary plot of the SHAP values against the same feature matrix.
shap.summary_plot(shap_values, X)