In [24]:
from datasets import load_dataset, load_metric, Dataset
# from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
from transformers import BertTokenizer, BertForSequenceClassification, EarlyStoppingCallback
#BertForSequenceRegression https://github.com/ceshine/pytorch-pretrained-BERT/blob/master/notebooks/Sequence%20Regression%20Model.ipynb
from transformers import Trainer, TrainingArguments, EvalPrediction
import pandas as pd
import torch
import wandb
torch.cuda.empty_cache()

In [2]:
train_data = pd.read_csv('data/si630w22-hw3-train.csv')
dev_data = pd.read_csv('data/si630w22-hw3-dev.csv')
# test_data = pd.read_csv('data\si630w22-hw3-test.public.csv')
q_and_a_data = pd.read_csv('data/si630w22-hw3-data.csv')

In [3]:
#combine the the different ratings
train_df = train_data.groupby('id').mean().round()
train_df = train_df.reset_index()
dev_df = dev_data.groupby('id').mean().round()
dev_df = dev_df.reset_index()

In [33]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)

train_df.rename(columns={'id':'question_id'}, inplace = True)
dev_df.rename(columns={'id':'question_id'}, inplace = True)
train_df.rename(columns={'rating':'labels'}, inplace = True)
dev_df.rename(columns={'rating':'labels'}, inplace = True)

q_and_a_data['text'] = q_and_a_data.question_text + '[SEP]' + q_and_a_data.reply_text

merged_train_data = pd.merge(train_df,q_and_a_data[['text','question_id']], on='question_id', how='left')
merged_train_data.dropna(subset=['labels'], inplace = True)
merged_train_data.drop(columns=['question_id'], inplace = True)
#merged_train_data.to_csv('merged_train_data.csv', index=False)

# One hot encode labels
merged_train_data_dict = merged_train_data.to_dict(orient='list')
merged_train_labels_transformed = ohe.fit_transform(merged_train_data.labels.to_numpy().reshape(-1,1))
merged_train_data_dict['ohe_labels'] = merged_train_labels_transformed
train_data_transformed = {'text': merged_train_data_dict['text'], 'labels': merged_train_data_dict['ohe_labels']}

merged_dev_data = pd.merge(dev_df, q_and_a_data[['text','question_id']], on='question_id', how='left')
merged_dev_data.dropna(subset=['labels'], inplace = True)
merged_dev_data.drop(columns=['question_id'], inplace = True)
#merged_dev_data.to_csv('merged_dev_data.csv', index=False)

# one hot encode labels
merged_dev_data_dict = merged_dev_data.to_dict(orient='list')
merged_dev_labels_transformed = ohe.transform(merged_dev_data.labels.to_numpy().reshape(-1,1))
merged_dev_data_dict['ohe_labels'] = merged_dev_labels_transformed
dev_data_transformed = {'text': merged_dev_data_dict['text'], 'labels': merged_dev_data_dict['ohe_labels']}

# merged_test_data = pd.merge(q_and_a_data, test_data, on='question_id')
# merged_test_data.to_csv('data\merged_test_data.csv', index=False)

In [34]:
print(train_data_transformed.keys())
print(train_data_transformed['labels'][0])

dict_keys(['text', 'labels'])
[0. 0. 0. 0. 1.]


[0.0, 0.0, 0.0, 0.0, 1.0]

In [39]:
# could not read in dictionaries like csv files
# filenames = {"train": filename, "dev": filename}
# dataset = load_dataset('csv', filenames)

# instead used simple dictionary object with two separate dataset objects
# MUST FIX FOR DOWNSTREAM TASKS
dataset_map = {"train": Dataset.from_dict(train_data_transformed),
              "dev": Dataset.from_dict(dev_data_transformed),}
print("train", dataset_map["train"])
print("dev", dataset_map["dev"])

train Dataset({
    features: ['text', 'labels'],
    num_rows: 3779
})
dev Dataset({
    features: ['text', 'labels'],
    num_rows: 811
})


In [40]:
#https://discuss.huggingface.co/t/mismatched-target-and-input-size-for-bce-using-multi-label-classification/8706
model = BertForSequenceClassification.from_pretrained("microsoft/MiniLM-L12-H384-uncased",problem_type="multi_label_classification",num_labels=5)

# tokenizer = BertTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
tokenizer = BertTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased", padding = True, truncation=True ,max_length =512)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
tokenized_train_dataset = dataset_map['train'].map(lambda x: tokenizer(x['text'],padding = 'max_length', max_length =512))

  0%|          | 0/3779 [00:00<?, ?ex/s]

In [42]:
tokenized_train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3779
})

In [44]:
tokenized_train_dataset['labels']

[[0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 0.0

In [46]:
# tokenized_dataset
len(tokenized_train_dataset['input_ids'][2])
# tokenized_dataset['train'][0]

512

In [48]:
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask','labels'])

dataloader = torch.utils.data.DataLoader(tokenized_train_dataset, batch_size=8)
# next(iter(dataloader))
# tokenized_dataset['train'][0]

In [49]:
# training_args = TrainingArguments("test-trainer")
training_args = TrainingArguments(
    output_dir = 'BERTSeq',
    num_train_epochs = 1,
    
#     do_train = True,
#     do_eval = True,
    evaluation_strategy = 'steps',
    eval_steps = 500,
    learning_rate=1e-4,
#     logging_strategy = 'epoch',
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
#     warmup_steps = 250,
#     weight_decay = 0.01,
    seed =0,
#     metric_for_best_model = 'eval_loss',
    load_best_model_at_end = True
)

In [50]:
tokenized_train_dataset['labels'][0]

tensor([0., 0., 0., 0., 1.])

In [51]:
# more compute metrics
# https://discuss.huggingface.co/t/why-do-i-get-this-error-running-tokenizer/780
# transformers.EvalPrediction https://huggingface.co/docs/transformers/internal/trainer_utils

# from https://theaisummer.com/hugging-face-vit/
# from datasets import load_metric
# metric = load_metric("accuracy")
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     return metric.compute(predictions=predictions, references=labels)

# from https://huggingface.co/transformers/v3.0.2/training.html#trainer
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# METRICS FOR REGRESSION
# from sklearn.metrics import mean_squared_error
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     mse = mean_squared_error(labels, preds)
#     return {
#         'mse': mse
#     }

In [53]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    #eval_dataset=tokenized_dataset["dev"],
    compute_metrics=compute_metrics,
    #compute_mse(tokenized_dataset['dev']['labels'],tokenized_dataset['dev']['labels']),
#     data_collator=data_collator,
#     tokenizer=tokenizer
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [54]:
trainer.train()

# ValueError thread, item 8/13 https://discuss.pytorch.org/t/target-size-torch-size-10-must-be-the-same-as-input-size-torch-size-2/72354/10
# ValueError thread, define num_labels in model https://discuss.huggingface.co/t/mismatched-target-and-input-size-for-bce-using-multi-label-classification/8706

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3779
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 945
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33ms-ryanlee[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss


RuntimeError: stack expects each tensor to be equal size, but got [586] at entry 0 and [512] at entry 1