In [1]:
import argparse
from typing import Dict

import pandas as pd
import torch
import torch_frame
from inferred_stypes import dataset2inferred_stypes
from text_embedder import GloveTextEmbedding
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.data import Dataset
from torch_frame.gbdt import LightGBM
from torch_frame.typing import Metric
from torch_frame.utils import infer_df_stype

from relbench.data import RelBenchDataset
from relbench.data.task_base import TaskType
from relbench.datasets import get_dataset

import lightgbm as lgb
import shap

In [12]:
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, default="rel-stackex")
parser.add_argument("--task", type=str, default="rel-stackex-engage")
args = parser.parse_args("")

In [13]:
print(args)

Namespace(dataset='rel-stackex', task='rel-stackex-engage')


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# TODO: remove process=True once correct data/task is uploaded.
dataset: RelBenchDataset = get_dataset(name="rel-stackex")
task = dataset.get_task("rel-stackex-engage")

Downloading file 'rel-stackex/db.zip' from 'https://relbench.stanford.edu/staging_data/rel-stackex/db.zip' to 'C:\Users\Shreya Reddy\AppData\Local\relbench\relbench\Cache'.
100%|########################################| 882M/882M [00:00<00:00, 386GB/s]


loading Database object from C:\Users\Shreya Reddy\AppData\Local\relbench\relbench\Cache\rel-stackex\db...
done in 29.56 seconds.


Downloading file 'rel-stackex/tasks/rel-stackex-engage.zip' from 'https://relbench.stanford.edu/staging_data/rel-stackex/tasks/rel-stackex-engage.zip' to 'C:\Users\Shreya Reddy\AppData\Local\relbench\relbench\Cache'.
100%|###############################################| 904k/904k [00:00<?, ?B/s]


In [7]:
train_table = task.train_table
val_table = task.val_table
test_table = task.test_table

In [27]:
dfs: Dict[str, pd.DataFrame] = {}
entity_table = dataset.db.table_dict[task.entity_table]
entity_df = entity_table.df

col_to_stype = dataset2inferred_stypes[args.dataset][task.entity_table]




In [29]:
if entity_table.pkey_col is not None:
    del col_to_stype[entity_table.pkey_col]
for fkey_col in entity_table.fkey_col_to_pkey_table.keys():
    del col_to_stype[fkey_col]

KeyError: 'Id'

In [30]:
if task.task_type == TaskType.BINARY_CLASSIFICATION:
    col_to_stype[task.target_col] = torch_frame.categorical
elif task.task_type == TaskType.REGRESSION:
    col_to_stype[task.target_col] = torch_frame.numerical

In [31]:
for split, table in [
    ("train", train_table),
    ("val", val_table),
    ("test", test_table),
]:
    dfs[split] = table.df.merge(
        entity_df,
        how="left",
        left_on=list(table.fkey_col_to_pkey_table.keys())[0],
        right_on=entity_table.pkey_col,
    )

train_dataset = Dataset(
    df=dfs["train"],
    col_to_stype=col_to_stype,
    target_col=task.target_col,
    col_to_text_embedder_cfg=TextEmbedderConfig(
        text_embedder=GloveTextEmbedding(device=device),
        batch_size=256,
    ),
).materialize()


Embedding raw data in mini-batch: 100%|██████████| 295/295 [00:07<00:00, 39.93it/s]


In [35]:
train_dataset

Dataset()

In [39]:
dfs['train'].sample(n=100)

Unnamed: 0,OwnerUserId,timestamp,contribution,Id,AccountId,DisplayName,Location,ProfileImageUrl,WebsiteUrl,AboutMe,CreationDate
20977,36668,2017-01-01,0,36668,4927354.0,Isa,,,,,2014-08-22 14:07:49.403
25434,46098,2017-01-01,0,46098,5508171.0,Christopher Krapu,,,,,2015-02-19 19:20:23.770
72897,10216,2013-01-02,1,10216,193616.0,JesseBuesking,"San Francisco, CA",,http://www.jessebuesking.com,,2012-08-11 23:18:48.953
4158,6863,2017-01-01,0,6863,275523.0,Michael Burge,"Portland,OR",,http://N/A,,2012-02-15 10:05:30.173
49003,7600,2015-01-02,1,7600,1313419.0,q9f,Berlin,,https://q9f.cc,<p>Libre software and disrupting technologies....,2012-03-25 09:44:44.020
...,...,...,...,...,...,...,...,...,...,...,...
8410,13674,2017-01-01,0,13674,2212185.0,robb,,,,,2013-01-07 00:06:36.937
50379,9849,2015-01-02,1,9849,1681691.0,Emma,,,,,2012-07-23 12:13:41.520
47733,5477,2015-01-02,0,5477,1049077.0,Gray,"Ames, IA",,http://gray.clhn.co,<p>I'm an assistant professor in the Econ depa...,2011-11-24 19:34:01.703
24484,43994,2017-01-01,0,43994,5635841.0,Jean-charles Wijnandts,,,,,2015-01-16 11:19:39.223


In [34]:
tf_train = train_dataset.tensor_frame
tf_val = train_dataset.convert_to_tensor_frame(dfs["val"])
tf_test = train_dataset.convert_to_tensor_frame(dfs["test"])

if task.task_type == TaskType.BINARY_CLASSIFICATION:
    tune_metric = Metric.ROCAUC
else:
    tune_metric = Metric.MAE

Embedding raw data in mini-batch: 100%|██████████| 263/263 [00:11<00:00, 22.88it/s]
Embedding raw data in mini-batch: 100%|██████████| 345/345 [00:09<00:00, 37.38it/s]


In [43]:
tf_train

TensorFrame(
  num_cols=3,
  num_rows=75493,
  numerical (1): ['AccountId'],
  timestamp (1): ['CreationDate'],
  embedding (1): ['AboutMe'],
  has_target=True,
  device='cpu',
)

In [45]:
tf_train.feat_dict

{<stype.numerical: 'numerical'>: tensor([[-1.0000e+00],
         [ 3.0000e+00],
         [ 5.4503e+04],
         ...,
         [ 1.5364e+06],
         [ 2.2146e+06],
         [ 2.7698e+06]]),
 <stype.timestamp: 'timestamp'>: tensor([[[2010,    6,   18,  ...,    6,   55,   26]],
 
         [[2010,    6,   18,  ...,   15,   34,   50]],
 
         [[2010,    6,   18,  ...,   19,    3,   57]],
 
         ...,
 
         [[2012,    6,    8,  ...,   20,    2,   45]],
 
         [[2013,    0,    6,  ...,   16,   20,   54]],
 
         [[2013,    4,   14,  ...,    6,   34,    5]]]),
 <stype.embedding: 'embedding'>: MultiEmbeddingTensor(num_rows=75493, num_cols=1, device='cpu')}

In [16]:
model = LightGBM(task_type=train_dataset.task_type, metric=tune_metric)

model.tune(tf_train=tf_train, tf_val=tf_val, num_trials=10)

[I 2024-03-13 12:45:10,884] A new study created in memory with name: no-name-15a59651-6a69-4260-9eaa-d385bdb8023a
[I 2024-03-13 12:45:41,972] Trial 0 finished with value: 0.5642725999129629 and parameters: {'max_depth': 5, 'learning_rate': 0.002200300912259004, 'num_leaves': 73, 'subsample': 0.9373589641920828, 'colsample_bytree': 0.28198902238055984, 'lambda_l1': 0.004453202991217154, 'lambda_l2': 3.1980979572940273, 'min_data_in_leaf': 35}. Best is trial 0 with value: 0.5642725999129629.
[I 2024-03-13 12:45:43,543] Trial 1 finished with value: 0.5299275849982142 and parameters: {'max_depth': 3, 'learning_rate': 0.001172335449843462, 'num_leaves': 613, 'subsample': 0.961802247972868, 'colsample_bytree': 0.07704601100051792, 'lambda_l1': 4.839182299417388e-05, 'lambda_l2': 0.00031157660096039493, 'min_data_in_leaf': 83}. Best is trial 0 with value: 0.5642725999129629.
[I 2024-03-13 12:46:40,310] Trial 2 finished with value: 0.589699992565635 and parameters: {'max_depth': 10, 'learning_

[2000]	valid_0's auc: 0.557784


[I 2024-03-13 12:56:00,560] Trial 8 finished with value: 0.5578893193725597 and parameters: {'max_depth': 3, 'learning_rate': 0.0034068472105257272, 'num_leaves': 230, 'subsample': 0.5513112072262933, 'colsample_bytree': 0.6971974868928147, 'lambda_l1': 5.3705040784213805e-06, 'lambda_l2': 0.024504869095132175, 'min_data_in_leaf': 10}. Best is trial 5 with value: 0.5919771367917079.
[I 2024-03-13 12:56:02,588] Trial 9 finished with value: 0.5345749492178705 and parameters: {'max_depth': 3, 'learning_rate': 0.002440225719258562, 'num_leaves': 431, 'subsample': 0.6724321506913075, 'colsample_bytree': 0.1485046335503587, 'lambda_l1': 0.00025867084233681777, 'lambda_l2': 6.1245994537609425, 'min_data_in_leaf': 52}. Best is trial 5 with value: 0.5919771367917079.


In [21]:
model_path = "C:\\Users\\Shreya Reddy\\Downloads\\relbenchmain\\examples\\saved_model_lightgbm.lgb"
model = lgb.Booster(model_file=model_path)

In [17]:
pred = model.predict(tf_test=tf_train).numpy()
print(f"Train: {task.evaluate(pred, train_table)}")

pred = model.predict(tf_test=tf_val).numpy()
print(f"Val: {task.evaluate(pred, val_table)}")

pred = model.predict(tf_test=tf_test).numpy()
print(f"Test: {task.evaluate(pred)}")

Train: {'average_precision': 0.4287766703529981, 'accuracy': 0.8362894573006768, 'f1': 0.10461493878142433, 'roc_auc': 0.7010621087834334}
Val: {'average_precision': 0.171850938701806, 'accuracy': 0.888272064295282, 'f1': 0.037687475964619924, 'roc_auc': 0.5919771367917079}
Test: {'average_precision': 0.11700689525475483, 'accuracy': 0.917386223094641, 'f1': 0.030879808332224146, 'roc_auc': 0.5808125564969495}


In [24]:
explainer = shap.TreeExplainer(model)


In [41]:
# Calculate SHAP values. This might take some time for large datasets.
shap_values = explainer.shap_values(dfs['train'].sample(n=100))


ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: timestamp: datetime64[ns], DisplayName: object, Location: object, WebsiteUrl: object, AboutMe: object, CreationDate: datetime64[ns]

In [ ]:
# SHAP Summary Plot
shap.summary_plot(shap_values, tf_train)