In [None]:
# Notebook setup: enable live reloading of edited modules, make ../src importable,
# and pull in the project helpers used by the cells below.
%load_ext autoreload
%autoreload 2

import sys
import os
# NOTE(review): `requests` is not referenced by any cell visible in this notebook — confirm it is still needed.
import requests
# Add ../src (resolved against the current working directory) to the import path;
# presumably that is where load_data, preprocess, train_model and evaluate live.
sys.path.append(os.path.abspath('../src'))

from load_data import load_raw
from preprocess import clean_data, select_column, rename, sales_and_residential, drop_na, add_cities, make_zero, geocode, geo_address
from train_model import train_random_forest
from evaluate import evaluate_model, compute_shap_local, compute_shap_global


Step 1: Load the data and drop the records that are not in our target cities

In [None]:
import pandas as pd

# Location of the raw transactions CSV.
# Set the TRANSACTIONS_CSV environment variable to point at your own copy;
# when it is unset, the original author's local path is used, so existing runs are unchanged.
path = os.environ.get("TRANSACTIONS_CSV", r"C:\Users\ktlee\Downloads\all_transactions.csv")

# Read the raw file with the project loader (load_raw comes from ../src/load_data).
df_raw = load_raw(path)



In [None]:
# Preprocessing pipeline: each helper (imported from `preprocess`) receives the previous
# frame and its result is stored under a new stage-specific name, so every intermediate
# stays available for inspection.
# The per-step descriptions below are inferred from the helper names — TODO confirm
# against ../src/preprocess.
df_renamed = rename(df_raw)                        # presumably renames the raw columns
df_selected = select_column(df_renamed)            # presumably keeps only the needed columns
df_filtered = sales_and_residential(df_selected)   # presumably keeps sale / residential records
df_dropped = drop_na(df_filtered)                  # presumably removes rows with missing values
df_with_city = add_cities(df_dropped)              # input of the per-city training loop, which reads its 'City' column
df_with_zero = make_zero(df_with_city)             # input of the address-building and geocoding cells



In [9]:
# Build one 'Full addr' string per row, formatted as
#   "<Street number> <Street name>, <Postal code> <Municipality>",
# replace the four source columns with it, and export the result as CSV.
address_columns = ['Street number', 'Street name', 'Postal code', 'Municipality']

# Work on a copy so df_with_zero is left untouched for the geocoding cell.
df = df_with_zero.copy()

# Every address part is cast to str and stripped of surrounding whitespace before joining.
address_parts = {col: df[col].astype(str).str.strip() for col in address_columns}
df['Full addr'] = (
    address_parts['Street number'] + ' ' +
    address_parts['Street name'] + ', ' +
    address_parts['Postal code'] + ' ' +
    address_parts['Municipality']
)

# Re-bind instead of dropping in place, so re-running the cell always yields the same frame.
df = df.drop(columns=address_columns)

# Output location: set the CLEAN_DATA_CSV environment variable to redirect the export;
# when it is unset, the original author's local path is used, so existing runs are unchanged.
clean_csv_path = os.environ.get("CLEAN_DATA_CSV", r"C:\Users\ktlee\Downloads\clean_data.csv")
df.to_csv(clean_csv_path, index=False)

In [6]:
# Geocode df_with_zero in fixed-size batches and stitch the per-batch results together.
cycle = 20        # number of rows passed to geocode() per call
max_rows = 100    # temporary stop condition: only rows below this position start a batch

geo_results = []
total = len(df_with_zero)

# Batches start at 0, cycle, 2*cycle, ... while the start is below both `total` and `max_rows`.
for start in range(0, min(total, max_rows), cycle):
    df_i = df_with_zero.iloc[start:start + cycle].copy()
    geo_results.append(geocode(df_i))
    # The last batch may be shorter than `cycle`; clamp so the counter never exceeds `total`.
    done = min(start + cycle, total)
    print(f"Now finished {done}/{total}")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when there was nothing to geocode.
df_geo = pd.concat(geo_results, ignore_index=True) if geo_results else pd.DataFrame()


Now finished 20/304590
Now finished 40/304590
Now finished 60/304590
Now finished 80/304590
Now finished 100/304590


In [None]:
# Manual smoke test: run geo_address (from `preprocess`) on one hand-written Paris address
# and print whatever it returns.
print(geo_address("Rue de Rivoli 75001 Paris"))

In [None]:
# Inspect the geocoded frame assembled by the batching cell.
print(df_geo)
# Disabled export of the NA-filtered frame (note the file name has no ".csv" extension):
#df_dropped.to_csv("comp3314_semi_clean_csv")

Step 2: Preprocess the data and train a model for each city

In [None]:
# Train and evaluate one random-forest model per city; the evaluation result of each
# city is collected in `metrics_table`, keyed by city.
df = df_with_city
cities = df['City'].unique()

# Training mode, applied to every city:
#   1 -> train_random_forest is called without a grid (let the program find the best model)
#   2 -> train_random_forest is called with the self-defined `param_grid` below
case_model = 1 #<-- Change this

# Self-defined model, only used when case_model == 2.
param_grid = {
    'n_estimators': [2500],
    'max_depth': [32],
    'bootstrap': [False]
}

# Fail fast on an unsupported mode. Otherwise no branch below would assign `model`,
# and evaluation would either raise NameError or silently reuse the previous model.
if case_model not in (1, 2):
    raise ValueError(f"case_model must be 1 or 2, got {case_model!r}")

metrics_table = {}

for city in cities:
    # Restrict to the current city; copy so clean_data never works on a view of `df`.
    df_city = df[df['City'] == city].copy()
    X, y = clean_data(df_city)

    if case_model == 1:
        model, params, X_test, y_test = train_random_forest(X, y)
    else:
        model, params, X_test, y_test = train_random_forest(X, y, param_grid)

    metrics = evaluate_model(model, X_test, y_test)
    metrics_table[city] = metrics
    print(f"City {city}'s model training is done.")

# NOTE: after the loop, `model`, `X`, `X_test` and `y_test` hold the values of the LAST
# city only; the SHAP cell further down reads `model` and `X` from here.

Step 3: SHAP analysis

In [None]:
# SHAP summary plot for a trained tree model.
# NOTE(review): `model` and `X` are not defined in this cell. They are whatever the
# per-city training loop assigned last, so on a top-to-bottom run this explains only the
# final city in `cities`.
import shap
explainer = shap.TreeExplainer(model)
# NOTE(review): values are computed on `X` as returned by clean_data, not on the `X_test`
# returned by train_random_forest — confirm this is intended.
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X)