In [1]:
# Clear every user-defined name from the interactive namespace before the
# notebook runs; -f forces the reset without asking for confirmation.
%reset -f

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [3]:
# df = pd.read_csv('redacted_training_data.csv', sep=';')

# def func(s):
#     if s == 'Y':
#         return 1
#     else:
#         return 0

# df['is_goal'] = df['is_goal'].apply(func)

# # shuffle the DataFrame rows
# df = df.sample(frac=1)

# df = df.rename(columns={'is_goal': 'label'})

In [4]:
# Read the train / test splits from CSV files in the working directory.
# Previously used file pair, kept for reference:
#   train_dataset_4500.csv / test_dataset_208.csv
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_dataset_1800.csv', 'test_dataset_208_new.csv')
)

In [5]:
# Checkpoint name shared by the model and its tokenizer.
base_model = 'bert-base-uncased'

# Sequence-classification model configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

# Matching tokenizer, plus a collator built on that tokenizer for batching.
tokenizer = AutoTokenizer.from_pretrained(base_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:


# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
# X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It
            must contain an 'input_ids' entry, which defines the dataset
            length.
        labels: Optional per-example labels, indexable by the same integer
            positions as the encodings. ``None`` or an empty container means
            the dataset is unlabeled (e.g. for prediction).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with 'labels' if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None / length check rather than `if self.labels:`. Plain
        # truthiness raises for multi-element numpy arrays, pandas Series and
        # tensors, and silently drops the label for a one-element array
        # holding 0. An empty container still means "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the 'input_ids' encoding."""
        return len(self.encodings['input_ids'])

# Pull the raw sentences and their labels out of the DataFrames as plain lists.
train_X = list(train_df['sentence'])
train_y = list(train_df['label'])
test_X = list(test_df['sentence'])
test_y = list(test_df['label'])

# Truncate only; do not pad here. Padding at tokenization time stretches every
# example to the longest sequence in the whole split, whereas the
# DataCollatorWithPadding handed to the Trainer pads each batch to its own
# longest member, which is all the model needs.
train_X_tokenized = tokenizer(train_X, truncation=True, max_length=512)
test_X_tokenized = tokenizer(test_X, truncation=True, max_length=512)

# Wrap encodings + labels so the Trainer can index individual examples.
train_dataset = Dataset(train_X_tokenized, train_y)
test_dataset = Dataset(test_X_tokenized, test_y)

In [8]:
# Define Trainer parameters
def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    Args:
        p: A two-element pair that unpacks into (predictions, labels).
            Predictions are reduced to class ids with an argmax over axis 1.

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1', each computed
        by the corresponding scikit-learn scorer with its default settings.
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)

    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {
        metric_name: scorer(y_true=y_true, y_pred=y_pred)
        for metric_name, scorer in scorers.items()
    }

# Define Trainer
# args = TrainingArguments(
#     output_dir='output',
#     evaluation_strategy='steps',
#     eval_steps=250,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=10,
#     learning_rate=2e-5,
#     seed=0,
#     report_to='none'
# )

# training_args = TrainingArguments(
#     output_dir='mar9',
#     evaluation_strategy='steps',
#     eval_steps=25,
#     save_strategy='steps',
#     save_steps=25,
#     logging_steps=25,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=6,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     report_to='none'
# )

# Evaluate, checkpoint and log every 10 optimizer steps.
training_args = TrainingArguments(
    output_dir='mar9',
    evaluation_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=10,
    # A checkpoint is written every 10 steps; without a cap every one of them
    # stays on disk under `output_dir`. Keep only the two most recent.
    save_total_limit=2,
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to='none'
)

# Wire the model, data, batching and metric hooks into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # NOTE(review): the test split is also the evaluation set here, so the
    # step-wise metrics are measured on the test data - confirm this is intended.
    eval_dataset=test_dataset,
)

# Run fine-tuning; as the cell's final expression the return value is displayed.
trainer.train()

***** Running training *****
  Num examples = 1800
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 678
  Number of trainable parameters = 109483778
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.6954,0.672596,0.543269,1.0,0.086538,0.159292
20,0.645,0.622467,0.629808,0.575419,0.990385,0.727915
30,0.5884,0.521643,0.807692,0.766667,0.884615,0.821429
40,0.5226,0.476878,0.8125,0.751938,0.932692,0.832618
50,0.5015,0.459415,0.778846,0.845238,0.682692,0.755319
60,0.4283,0.445664,0.817308,0.775,0.894231,0.830357
70,0.5026,0.438141,0.822115,0.75969,0.942308,0.841202
80,0.3229,0.396349,0.841346,0.838095,0.846154,0.842105
90,0.3877,0.528985,0.759615,0.684932,0.961538,0.8
100,0.4004,0.418385,0.836538,0.906977,0.75,0.821053


***** Running Evaluation *****
  Num examples = 208
  Batch size = 16
Saving model checkpoint to mar9/checkpoint-10
Configuration saved in mar9/checkpoint-10/config.json
Model weights saved in mar9/checkpoint-10/pytorch_model.bin
tokenizer config file saved in mar9/checkpoint-10/tokenizer_config.json
Special tokens file saved in mar9/checkpoint-10/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 208
  Batch size = 16
Saving model checkpoint to mar9/checkpoint-20
Configuration saved in mar9/checkpoint-20/config.json
Model weights saved in mar9/checkpoint-20/pytorch_model.bin
tokenizer config file saved in mar9/checkpoint-20/tokenizer_config.json
Special tokens file saved in mar9/checkpoint-20/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 208
  Batch size = 16
Saving model checkpoint to mar9/checkpoint-30
Configuration saved in mar9/checkpoint-30/config.json
Model weights saved in mar9/checkpoint-30/pytorch_model.bin
tokenizer config file 

TrainOutput(global_step=678, training_loss=0.18508839112181755, metrics={'train_runtime': 4677.7374, 'train_samples_per_second': 2.309, 'train_steps_per_second': 0.145, 'total_flos': 1171049751864000.0, 'train_loss': 0.18508839112181755, 'epoch': 6.0})

In [9]:

# # ----- 3. Predict -----#
# # # Load test data
# # test_data = df
# # X_test = list(test_data['label'])
# # X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# # Create torch dataset
# test_dataset = Dataset(X_val_tokenized)

# # Loading fine-tuned model
# model_path = 'test-trainer/checkpoint-2000'
# model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
# test_trainer = Trainer(model)


# raw_pred, _, _ = test_trainer.predict(test_dataset)
# y_pred = np.argmax(raw_pred, axis=1)

# y_pred = pd.Series(y_pred)
# goal_indices = list(t[t==1].index)

# c = 1
# for i in goal_indices:
#     print(f'Goal{c}')
#     print(X_val[i])
#     print('')
#     c += 1
