In [None]:
cd ../..

## Feature Ideas

- User features:
    - `locale`
    - `age`
    - `gender`
    - `days_on_app`
    - `location`
    - `timezone`
    - `num_friends`
- For each monthly window between 1 and 5 months ago, calculate the following features:
    - `num_invited` (as per the `event_attendees` table)
    - `num_yes`
    - `num_no`
    - `num_maybe`
    - `avg_event_start_hour`
    - `modal_event_dow`
    - `num_invites` (as per the `event_interest` table)
    - `num_interested`
    - `num_not_interested`
    - `num_invited_and_interested`
    - `num_invited_and_not_interested`

In [None]:
import duckdb
from torch_frame.utils import infer_df_stype

import utils

conn = duckdb.connect('event/event.db')
%load_ext sql
%sql conn --alias duckdb
%config SqlMagic.displaycon=False
%config SqlMagic.autopandas=True

In [None]:
# Read the Jinja SQL template once; it is rendered separately for each split.
with open('event/user-attendance/feats.sql', 'r') as template_file:
    template = template_file.read()

# Render and execute the template for the train, val and test splits.
# The whole loop takes roughly 1 - 5 minutes.
# NOTE(review): subsample=0 presumably disables subsampling — confirm in feats.sql.
for split in ('train', 'val', 'test'):
    print(f'Creating {split} table')
    render_params = dict(set=split, subsample=0)
    query = utils.render_jinja_sql(template, render_params)
    conn.sql(query)
    print(f'{split} table created')

In [None]:
# Run the project's validation helper on the 'user_attendance' feature tables
# through the open DuckDB connection (see utils.validate_feature_tables for the
# exact checks it performs).
utils.validate_feature_tables('user_attendance', conn)

In [None]:
%%sql train_df <<
SELECT * FROM user_attendance_train_feats;

In [None]:
# Infer a torch_frame semantic type (stype) for each column of the training
# features; the result is shown as the cell output.
infer_df_stype(train_df)

In [None]:
# Summarise the features against the 'target' column (classification=False) on a
# random subset of rows.
# The subset size is capped at the table size because DataFrame.sample raises
# ValueError when asked for more rows than exist (replace=False is the default).
# The fixed seed keeps the summary stable across re-runs.
summary_sample = train_df.sample(n=min(len(train_df), 20_000), random_state=0)
utils.feature_summary_df(summary_sample, 'target', classification=False)

## Feature Importances

In [None]:
import numpy as np
import shap
from torch_frame import TaskType, stype
from torch_frame.data import Dataset
from torch_frame.gbdt import LightGBM

from inferred_stypes import task_to_stypes
from train_gbdt import TASK_PARAMS

# Key identifying this task in the shared lookup tables (TASK_PARAMS,
# task_to_stypes) and in the saved model's file name.
TASK = 'rel-event-user-attendance'

# Task-specific settings; 'target_col' and 'task_type' are read from it below.
task_params = TASK_PARAMS[TASK]

In [None]:
%%sql val_df <<
SELECT * FROM user_attendance_val_feats;

In [None]:
# Work on a copy so the shared task_to_stypes entry is not modified.
col_to_stype = task_to_stypes[TASK].copy()

# Exclude these columns from the stype mapping. pop(..., None) is used instead of
# `del` so the cell does not raise KeyError when a column is absent from this
# task's mapping.
# NOTE(review): neither name appears in the feature list at the top of this
# notebook — confirm they are relevant to this task.
for excluded_col in ('title', 'last_review_summary'):
    col_to_stype.pop(excluded_col, None)

# Build a dataset from the validation features and materialize it into a
# TensorFrame for the GBDT model.
val_tf = Dataset(
    val_df,
    col_to_stype=col_to_stype,
    target_col=task_params['target_col'],
).materialize().tensor_frame

In [None]:
# Restore the trained LightGBM model for this task from disk.
model_path = f'models/{TASK}_lgbm.json'
gbdt = LightGBM(task_type=task_params['task_type'])
gbdt.load(model_path)

# Predict on the validation TensorFrame and convert the result to a NumPy array.
val_pred_tensor = gbdt.predict(tf_test=val_tf)
pred = val_pred_tensor.numpy()

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(gbdt.model)

# Explain a random subset of validation rows. Rows are drawn without replacement
# and the subset is capped at the table size, so no row is explained twice; the
# fixed seed makes the plot reproducible across re-runs.
rng = np.random.default_rng(0)
sample = rng.choice(len(val_tf), size=min(len(val_tf), 10_000), replace=False)

val_arr, _, _ = gbdt._to_lightgbm_input(val_tf[sample])
# The second positional argument of shap_values is `y` (true labels, only used
# when explaining the loss), so the model's predictions are not passed here.
shap_values = explainer.shap_values(val_arr)

# Column order is assumed to be categorical features followed by numerical ones
# — TODO verify against gbdt._to_lightgbm_input.
feat_names = (
    val_tf.col_names_dict.get(stype.categorical, [])
    + val_tf.col_names_dict.get(stype.numerical, [])
)
# Fail loudly rather than plot mislabelled features if the model input has a
# different number of columns than the names collected above.
assert len(feat_names) == val_arr.shape[1], (
    f'{len(feat_names)} feature names for {val_arr.shape[1]} model input columns'
)

shap.summary_plot(shap_values, val_arr, plot_type='violin', max_display=30, feature_names=feat_names)