# Plaidbot Training

This notebook trains a Plaidbot model.

Before running, change the runtime to use a GPU: the model options below train on the `cuda:0` device.

## Setup

In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive that the
# following cells read are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the project's app_lib folder from Drive into the current working
# directory so that the `app_lib` imports below can be resolved.
!cp -r '/content/drive/MyDrive/Colab Notebooks/Plaidbot/v2/app_lib' .

In [None]:
# Imports
import os
from datetime import datetime
from getpass import getpass
from typing import List

from app_lib.options.model_options import ModelOptions
from app_lib.options.prepro_options import PreproOptions
from app_lib.src.pick_users.run_pick_users import run_pick_users
from app_lib.src.select_data.run_select_data import run_select_data
from app_lib.src.train.run_training import run_training
from app_lib.src.train.run_prediction import run_prediction

## Options

In [None]:
# Preprocessing configuration: where the Slack export lives and which
# messages are kept.
prepro_opts = PreproOptions()

# --- Input locations ---
# File from Slack where the user info is stored.
prepro_opts.user_filename: str = 'users.json'
# Folder containing the Slack message folders.
prepro_opts.message_folder: str = '/content/drive/MyDrive/Colab Notebooks/Plaidbot/messages'
# Message folders to read; append any other desired folders to this list.
prepro_opts.selected_folders: List[str] = ['general']

# --- Message filters ---
# Earliest message date to keep.
prepro_opts.min_date: datetime = datetime(year=2018, month=1, day=1)
# Minimum number of words a message must have.
prepro_opts.min_num_words: int = 3
# Maximum number of messages to train on.
prepro_opts.max_messages: int = 100_000

In [None]:
# Model / training configuration.
model_opts = ModelOptions()
model_opts.bert_model_name: str = 'distilbert-base-uncased'  # Bert model name
model_opts.max_len = 150  # Max characters per message
model_opts.val_size = 0.2  # Proportion of messages used for validation
model_opts.num_epochs = 2  # Number of training epochs
model_opts.batch_size = 8  # Batch size for data loader
model_opts.device = 'cuda:0'  # Device used for training. Use 'cuda:0' for training, 'cpu' for deploying
model_opts.learning_rate = 2e-5  # Learning rate of optimizer
model_opts.saved_model_name = 'username/model-name'  # Saved model name for HuggingFace

# HuggingFace access token. Never paste the token into the notebook: a saved or
# shared notebook would leak it. Read it from the HF_TOKEN environment variable
# when set, otherwise prompt for it (the input is not echoed or stored in the file).
model_opts.auth_token = os.environ.get('HF_TOKEN') or getpass('HuggingFace access token: ')

## Pick users and Data

In [None]:
# run_pick_users returns two values: the first is stored on the preprocessing
# options and the second on the model options. Unpacking straight onto the
# option objects avoids the throwaway globals `a` and `b` and keeps the step
# re-runnable as a single cell.
# NOTE(review): judging by the attribute names, these are presumably a
# user-id -> int mapping and an int -> user-name mapping; confirm in run_pick_users.
prepro_opts.user_id_int_dict, model_opts.user_int_name_dict = run_pick_users(prepro_opts)

In [None]:
# Build the message set used for training from the preprocessing options.
messages = run_select_data(prepro_opts)
# TODO: may want to get separate sets of training and test messages here.

## Train

In [None]:
# Train on the selected messages with the model options; the returned object
# is what the prediction and upload cells below operate on.
model = run_training(messages, model_opts)

In [None]:
# Smoke test: run a prediction on a sample message with the model that was
# just trained in this session.
test_messages = [ 'This is a test' ]
preds = run_prediction(model, test_messages, model_opts)
print(preds)

## Save the model

In [None]:
# Log in to the HuggingFace Hub from the notebook before pushing the model below.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Unwrap the trained model down to its base model and push that to the
# HuggingFace Hub under model_opts.saved_model_name.
# TODO: maybe the tokenizer should be saved as well?
base_model = model.get_inner_model().get_base_model()
base_model.push_to_hub(model_opts.saved_model_name)

In [None]:
# Repeat the smoke test without the in-memory model: None is passed in place
# of `model`.
# NOTE(review): presumably run_prediction then loads the pushed model using
# model_opts.saved_model_name / auth_token — confirm in run_prediction.
test_messages = ['This is a test']
preds = run_prediction(None, test_messages, model_opts)
print(preds)

In [None]:
## TODO: Add an accuracy/metrics calculation here that I can use to verify the performance of the loaded model.
## This would require splitting a test set from the data...