In [1]:
# Clear the interactive namespace before anything else in the notebook runs.
%reset -f

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [3]:
# Read the labelled sentences; fields in this file are separated by semicolons.
df = pd.read_csv('redacted_training_data.csv', sep=';')

def func(s):
    """Encode a goal flag as an integer: 1 for exactly 'Y', 0 for anything else."""
    return 1 if s == 'Y' else 0

# Turn the 'Y'/other flag column into 1/0 integers.
df['is_goal'] = df['is_goal'].apply(func)

# Shuffle the DataFrame rows. A fixed random_state keeps the row order identical
# across kernel restarts, so a training run can be reproduced.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

# The cells further down read the target from a column named 'label'.
df = df.rename(columns={'is_goal': 'label'})

In [4]:
# Checkpoint that both the tokenizer and the classifier are loaded from.
base_model = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Sequence classifier with two output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=2,
)

# Collator built around the same tokenizer; it is handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
# Pull the raw sentences and their labels out of the frame as plain Python lists.
X = df['sentence'].tolist()
y = df['label'].tolist()

# Tokenize the whole corpus in a single call, truncating at 512 tokens.
# NOTE(review): padding=True presumably pads every row to the longest sentence in X,
# which would leave DataCollatorWithPadding nothing to do — confirm this is intended.
X_tokenized = tokenizer(
    X,
    truncation=True,
    max_length=512,
    padding=True,
)

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
# X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values. It must contain an 'input_ids' entry, whose
            length defines the dataset size.
        labels: Optional indexable collection with one label per example.
            Leave as None for unlabelled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, adding 'labels' when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: truth-testing a NumPy array, pandas Series
        # or multi-element tensor raises instead of answering "are there labels?".
        # The length check keeps an empty container behaving like "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the 'input_ids' entry."""
        return len(self.encodings['input_ids'])


# No split is made here: every tokenized sentence goes into the training set.
train_dataset = Dataset(encodings=X_tokenized, labels=y)
# train_dataset = Dataset(X_train_tokenized, y_train)
# val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
# Metric function handed to the Trainer
def compute_metrics(p):
    """Score one batch of predictions against the reference labels.

    `p` unpacks into (raw model scores, reference labels). The predicted class
    of each row is the column index holding the largest score.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

# Define Trainer
# args = TrainingArguments(
#     output_dir='output',
#     evaluation_strategy='steps',
#     eval_steps=250,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=10,
#     learning_rate=2e-5,
#     seed=0,
#     load_best_model_at_end=True,
#     report_to='none'
# )

# Hyper-parameters for the fine-tuning run; checkpoints are written under 'feb15'.
training_args = TrainingArguments(
    output_dir='feb15',
    num_train_epochs=11,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to='none',
)

# NOTE(review): no eval_dataset is handed over here, so compute_metrics is
# presumably never invoked during trainer.train() — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 2210
  Num Epochs = 11
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1529
  Number of trainable parameters = 109483778
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.1314
1000,0.011


Saving model checkpoint to feb15/checkpoint-500
Configuration saved in feb15/checkpoint-500/config.json
Model weights saved in feb15/checkpoint-500/pytorch_model.bin
tokenizer config file saved in feb15/checkpoint-500/tokenizer_config.json
Special tokens file saved in feb15/checkpoint-500/special_tokens_map.json
Saving model checkpoint to feb15/checkpoint-1000
Configuration saved in feb15/checkpoint-1000/config.json
Model weights saved in feb15/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in feb15/checkpoint-1000/tokenizer_config.json
Special tokens file saved in feb15/checkpoint-1000/special_tokens_map.json


In [None]:

# # ----- 3. Predict -----#
# # # Load test data
# # test_data = df
# # X_test = list(test_data['label'])
# # X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# # Create torch dataset
# test_dataset = Dataset(X_val_tokenized)

# # Loading fine-tuned model
# model_path = 'test-trainer/checkpoint-2000'
# model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
# test_trainer = Trainer(model)


# raw_pred, _, _ = test_trainer.predict(test_dataset)
# y_pred = np.argmax(raw_pred, axis=1)

# y_pred = pd.Series(y_pred)
# goal_indices = list(t[t==1].index)

# c = 1
# for i in goal_indices:
#     print(f'Goal{c}')
#     print(X_val[i])
#     print('')
#     c += 1
