In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import requests
sys.path.append(os.path.abspath('../src'))

from load_data import load_raw
from preprocess import clean_data, select_column, rename, sales_and_residential, drop_na, add_cities, make_zero, geocode, geo_address, one_hot
from train_model import train_random_forest
from evaluate import evaluate_model, compute_shap_local, compute_shap_global


Step 1: Load the data and drop the records that are outside our target cities

In [None]:
import pandas as pd

# Location of the raw transactions export. The default is the original author's
# local copy; set the ALL_TRANSACTIONS_CSV environment variable to point at the
# file on any other machine instead of editing this cell.
path = os.environ.get(
    "ALL_TRANSACTIONS_CSV",
    r"C:\Users\ktlee\Downloads\all_transactions.csv",
)

# load_raw is imported from the load_data module; it receives the CSV path and
# its result is the starting frame for the preprocessing cells below.
df_raw = load_raw(path)

In [None]:
# Preprocessing pipeline. Each helper is imported from the preprocess module,
# takes the previous stage's frame and its result is bound to a new,
# stage-named variable, so every intermediate remains available for inspection.
# NOTE(review): the per-step descriptions below are inferred from the helper
# names only -- confirm against the preprocess module.

# presumably normalises the raw column names
df_renamed = rename(df_raw)
# presumably keeps only the columns used downstream
df_selected = select_column(df_renamed)
# presumably keeps sale transactions of residential properties
df_filtered = sales_and_residential(df_selected)
# presumably removes rows with missing required values
df_dropped = drop_na(df_filtered)
# presumably adds the 'City' column that the training loop groups by
df_with_city = add_cities(df_dropped)
# presumably replaces remaining missing values with zero -- TODO confirm
df_with_zero = make_zero(df_with_city)



In [None]:
# Build one free-text address per row, shaped
# "<street number> <street name>, <postal code> <municipality>".
# It is the key used to merge the externally obtained coordinates back in.
address_columns = ['Street number', 'Street name', 'Postal code', 'Municipality']
addr_number, addr_street, addr_postal, addr_town = (
    df_with_zero[column].astype(str).str.strip() for column in address_columns
)
# Written onto df_with_zero itself: the merge cell reads 'Full addr' from it.
df_with_zero['Full addr'] = (
    addr_number + ' ' + addr_street + ', ' + addr_postal + ' ' + addr_town
)

# Separate frame in which the address is carried only by 'Full addr'
# (presumably the table that was exported for the external street lookup).
df = df_with_zero.drop(columns=address_columns)
# One-off export of that frame, left disabled:
#df.to_csv(r"C:\Users\ktlee\Downloads\clean_data_new.csv", index = False)

In [None]:
# Add back the longitude and latitude obtained for each address.
# The default path is the original author's local copy; set the
# CLEAN_ADDRESS_DICT_CSV environment variable to read the file from elsewhere.
address_csv = os.environ.get(
    "CLEAN_ADDRESS_DICT_CSV",
    r"C:\Users\ktlee\Downloads\clean_address_dict.csv",
)
df_address = pd.read_csv(address_csv)
# Columns are renamed positionally.
# NOTE(review): assumes the file's column order is address, longitude, latitude -- confirm.
df_address.columns = ['Full addr', 'Longitude', 'Latitude']

# Keep a single coordinate pair per address. A left merge repeats a left-hand
# row once per matching right-hand row, so a duplicated address in the lookup
# would silently duplicate transactions.
df_address = df_address.drop_duplicates(subset='Full addr', keep='first')

# validate= makes pandas raise if the lookup side is ever not unique on the key.
df_geocoded = df_with_zero.merge(
    df_address, on='Full addr', how='left', validate='many_to_one'
)

# Keep only rows with usable coordinates: both present and both non-zero
# (0 is presumably the placeholder for a failed lookup -- TODO confirm).
has_coordinates = df_geocoded['Longitude'].notna() & df_geocoded['Latitude'].notna()
nonzero_coordinates = (df_geocoded['Longitude'] != 0) & (df_geocoded['Latitude'] != 0)
# .copy() gives later cells an independent frame rather than a slice of df_geocoded.
df_clean_geo = df_geocoded[has_coordinates & nonzero_coordinates].copy()


In [21]:
# One-hot encoding (one_hot is imported from the preprocess module).
df_all_eng = one_hot(df_clean_geo)

# Persist the engineered dataset to the location the Step 2 cell reads it back
# from ('../data/df_all_eng.csv'), so a fresh run never trains on a stale copy
# left over from an earlier session.
# NOTE(review): the index is still written (pandas default) and comes back as an
# 'Unnamed: 0' column on read -- confirm clean_data does not use it as a feature
# before switching to index=False.
df_all_eng.to_csv('../data/df_all_eng.csv')

# Last expression of the cell, so the dtypes are actually displayed.
df_all_eng.dtypes



Step 2: For each city, pre-process and train

In [25]:
# Reload the engineered dataset from disk; everything in Step 2 works from this
# frame (presumably so Step 2 can be run without re-executing Step 1).
# NOTE(review): the recorded output of the training cell lists an 'Unnamed: 0'
# column, i.e. the CSV was saved with its index -- confirm clean_data does not
# treat that column as a feature.
df_all_eng = pd.read_csv('../data/df_all_eng.csv')

In [27]:
df = df_all_eng
cities = df['City'].unique()

# Per-city evaluation results, filled only when a model is trained (cases 1 and 2).
metrics_table = dict()

# Cities to process; every other city present in the data is skipped.
training_city = ['Lyon']

# Run mode, identical for every city:
#   1 -> let train_random_forest find the best model
#   2 -> train with the hand-picked hyper-parameters in param_grid
#   3 -> only print the per-column counts, without machine learning
case_model = 3 #<-- Change this
if case_model not in (1, 2, 3):
    # An unknown mode used to fall through every branch and silently do nothing.
    raise ValueError(f"case_model must be 1, 2 or 3, got {case_model!r}")

# Hyper-parameters used by case 2 (a single combination, so no real search).
param_grid = {
    'n_estimators': [2500],
    'max_depth': [32],
    'bootstrap': [False]
}

for city in cities:
    if city not in training_city:
        continue
    df_city = df[df['City'] == city].copy()
    # clean_data hands back the two objects passed on to train_random_forest.
    # It is called in every mode, exactly as before (the "q1/q3" line in the
    # recorded output is presumably printed by it -- TODO confirm).
    X, y = clean_data(df_city)

    # Case 3: only print output, without machine learning
    if case_model == 3:
        print(f"{city}: {df_city.count()}")
        continue

    if case_model == 1:
        # Case 1: let the program find the best model
        model, params, X_test, y_test = train_random_forest(X, y)
    else:
        # Case 2: self-defined model
        model, params, X_test, y_test = train_random_forest(X, y, param_grid)

    # Evaluate. `model` and `X` stay bound after the loop on purpose: the SHAP
    # cell further down reads them.
    metrics = evaluate_model(model, X_test, y_test)
    metrics_table[city] = metrics
    print(f"City {city}'s model training is done.")

q1: 182340.0, q3: 440000.0
Lyon: Unnamed: 0                                      16052
Property value                                  15733
Street number                                   16052
Street name                                     16052
Postal code                                     16052
Municipality                                    16052
Number of rooms                                 16052
Land area                                       16052
Type of land use                                 1536
Actual built surface                            16052
Number of lots                                  16052
City                                            16052
Full addr                                       16052
Longitude                                       16052
Latitude                                        16052
Year                                            16052
Quarter                                         16052
Type of property_Apartment                      1

Step 3: SHAP analysis

In [None]:
import shap

# Explains the `model` left behind by the Step 2 training loop, using that
# loop's last `X`. When Step 2 ran with case_model = 3 nothing was trained, so
# report that and skip instead of failing with a bare NameError.
if 'model' in globals() and 'X' in globals():
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X)
else:
    print(
        "Skipping SHAP analysis: no trained model found. "
        "Set case_model to 1 or 2 in Step 2 and re-run the training loop."
    )