In [None]:
! pip install transformers datasets ray

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[K     |████████████████████████████████| 342 kB 68.7 MB/s 
[?25hCollecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 70.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 60.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import cuda, nn
import random
import os
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import optuna

import datasets
from datasets import Dataset, load_metric

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

In [None]:
# Pre-trained checkpoint shared by the tokenizer and the classifier.
model_checkpoint = 'facebook/bart-base'
# Use the GPU when one is visible, otherwise fall back to CPU instead of crashing on .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# num_labels=2: binary classification head on top of the pre-trained encoder/decoder.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Base BART on raw data

In [None]:
# Labelled data: keep only the tweet text and its target, exposed under the name `label`.
train_path = './train.csv'
train_data = (
    pd.read_csv(train_path)[['text', 'target']]
    .rename(columns={"target": "label"})
)

# Unlabelled data: only the text column is kept, so there is no `target` column to rename.
test_path = './test.csv'
test_data = pd.read_csv(test_path)[['text']]

In [None]:
# Hold out 10% of the labelled rows as a dev split, stratified on the label.
# A fixed random_state makes the split (and therefore the reported dev F1) reproducible.
train_data, dev_data = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data['label'],
    random_state=42,
)

print("Train dataset labels count = ", Counter(train_data['label']))
print("Dev dataset labels count = ", Counter(dev_data['label']))

Train dataset labels count =  Counter({0: 3907, 1: 2944})
Dev dataset labels count =  Counter({0: 435, 1: 327})


In [None]:
def preprocess_function(examples):
    """Tokenize the 'text' field of `examples` with truncation enabled.

    Uses the notebook-level `tokenizer` and returns its output unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

In [None]:
# Convert the pandas frames to `datasets.Dataset` objects.
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column that the model cannot consume.
train_data = Dataset.from_pandas(train_data, preserve_index=False)
dev_data = Dataset.from_pandas(dev_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)

# Tokenize every split in batches.
encoded_dataset_train = train_data.map(preprocess_function, batched=True)
encoded_dataset_dev = dev_data.map(preprocess_function, batched=True)
encoded_dataset_test = test_data.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Columns handed back when indexing each split: labelled splits include `label`,
# the test split has none.
columns_to_return = ['input_ids', 'label', 'attention_mask']
columns_to_return_test = ['input_ids', 'attention_mask']

for labelled_split in (encoded_dataset_train, encoded_dataset_dev):
    labelled_split.set_format(columns=columns_to_return)
encoded_dataset_test.set_format(columns=columns_to_return_test)

In [None]:
# Settings shared by the metric function, the training arguments and the output path.
task = 'tweet'
metric_name = "f1"
batch_size = 8
# Last path component of the checkpoint id, e.g. "org/name" -> "name".
model_name = model_checkpoint.rsplit("/", 1)[-1]

def compute_metrics(p):
    """Score one evaluation pass with the metric named by `metric_name`.

    Args:
        p: object exposing `predictions` (an array of per-class scores, or a
            tuple whose first element is that array) and `label_ids`.

    Returns:
        Whatever the loaded metric's `compute` returns for the arg-max
        predictions against `p.label_ids`.
    """
    # Load each metric only once; the function is called at every evaluation.
    loaded = compute_metrics.__dict__.setdefault("_loaded_metrics", {})
    if metric_name not in loaded:
        loaded[metric_name] = load_metric(metric_name)
    metric = loaded[metric_name]

    # Some models return a tuple of outputs; the class scores come first.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(scores, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# Training configuration for the raw-text fine-tuning run.
args = TrainingArguments(
    output_dir=f"./save_model/{model_name}-finetuned-{task}",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.01,
    # After training, reload the checkpoint with the best dev value of `metric_name`.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    logging_dir='./logs',
    # Spell out the current default (all installed integrations); the library
    # warns that the implicit default will change in v5.
    report_to="all",
)

# Trainer for the raw-text run: trains on the train split, scores on the dev split.
trainer1 = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune on the raw-text train split; the returned TrainOutput is displayed as the cell result.
trainer1.train()

The following columns in the training set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6851
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6856


Epoch,Training Loss,Validation Loss,F1
1,0.432,0.380233,0.796178
2,0.3904,0.469073,0.804805
3,0.3666,0.553508,0.806697
4,0.3298,0.732933,0.793605
5,0.2986,0.661576,0.803545
6,0.2646,0.758963,0.79822
7,0.2544,0.830839,0.797059
8,0.2356,0.818155,0.793462


The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Saving model checkpoint to ./save_model/bart-base-finetuned-tweet/checkpoint-857
Configuration saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/config.json
Model weights saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/pytorch_model.bin
tokenizer config file saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/tokenizer_config.json
Special tokens file saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
Saving model checkpoint to ./save_model/bart-base-finetuned-tweet/checkpoint-1714
Configuration saved in ./save_model/bart-base-finetuned-tweet/checkpoint-

TrainOutput(global_step=6856, training_loss=0.3222639535661339, metrics={'train_runtime': 875.5383, 'train_samples_per_second': 62.599, 'train_steps_per_second': 7.831, 'total_flos': 1652956837080360.0, 'train_loss': 0.3222639535661339, 'epoch': 8.0})

### Base BART on pre-processed data

In [None]:
# Labelled pre-processed data: keep the cleaned text and its target, exposed under the name `label`.
train_path = './clean_train.csv'
train_data = (
    pd.read_csv(train_path)[['text with sw', 'target']]
    .rename(columns={"target": "label"})
)

# Unlabelled pre-processed data: only the text column is kept, so there is no `target` column to rename.
test_path = './clean_test.csv'
test_data = pd.read_csv(test_path)[['text with sw']]

In [None]:
# Hold out 10% of the labelled rows as a dev split, stratified on the label.
# A fixed random_state makes the split (and therefore the reported dev F1) reproducible.
train_data, dev_data = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data['label'],
    random_state=42,
)

print("Train dataset labels count = ", Counter(train_data['label']))
print("Dev dataset labels count = ", Counter(dev_data['label']))
# The test dataset does not contain the target label, so no label count is printed for it.

Train dataset labels count =  Counter({0: 3907, 1: 2944})
Dev dataset labels count =  Counter({0: 435, 1: 327})


In [None]:
def preprocess_function(examples):
    """Tokenize the 'text with sw' field of `examples` with truncation enabled.

    Uses the notebook-level `tokenizer` and returns its output unchanged.
    """
    cleaned_texts = examples['text with sw']
    return tokenizer(cleaned_texts, truncation=True)

In [None]:
# Convert the pandas frames to `datasets.Dataset` objects.
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column that the model cannot consume.
train_data = Dataset.from_pandas(train_data, preserve_index=False)
dev_data = Dataset.from_pandas(dev_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)

# Tokenize every split in batches.
encoded_dataset_train = train_data.map(preprocess_function, batched=True)
encoded_dataset_dev = dev_data.map(preprocess_function, batched=True)
encoded_dataset_test = test_data.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Columns handed back when indexing each split: labelled splits include `label`,
# the test split has none.
columns_to_return_test = ['input_ids', 'attention_mask']
columns_to_return = ['input_ids', 'label', 'attention_mask']

encoded_dataset_test.set_format(columns=columns_to_return_test)
for labelled_split in (encoded_dataset_train, encoded_dataset_dev):
    labelled_split.set_format(columns=columns_to_return)

In [None]:
# Settings shared by the metric function, the training arguments and the output path.
task = 'tweet'
metric_name = "f1"
batch_size = 8
# Last path component of the checkpoint id, e.g. "org/name" -> "name".
model_name = model_checkpoint.rsplit("/", 1)[-1]

def compute_metrics(p):
    """Score one evaluation pass with the metric named by `metric_name`.

    Args:
        p: object exposing `predictions` (an array of per-class scores, or a
            tuple whose first element is that array) and `label_ids`.

    Returns:
        Whatever the loaded metric's `compute` returns for the arg-max
        predictions against `p.label_ids`.
    """
    # Load each metric only once; the function is called at every evaluation.
    loaded = compute_metrics.__dict__.setdefault("_loaded_metrics", {})
    if metric_name not in loaded:
        loaded[metric_name] = load_metric(metric_name)
    metric = loaded[metric_name]

    # Some models return a tuple of outputs; the class scores come first.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(scores, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# Training configuration for the pre-processed-text fine-tuning run.
args = TrainingArguments(
    # Separate directory from the raw-text run: both runs produce the same
    # checkpoint step numbers, so a shared directory overwrites the earlier checkpoints.
    output_dir=f"./save_model/{model_name}-finetuned-{task}-preprocessed",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.01,
    # After training, reload the checkpoint with the best dev value of `metric_name`.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    logging_dir='./logs',
    # Spell out the current default (all installed integrations); the library
    # warns that the implicit default will change in v5.
    report_to="all",
)

# Reload the pre-trained checkpoint before this run. The existing `model` object
# has already been fine-tuned on the raw text, so reusing it would not measure
# base BART on the pre-processed text, and rows of this dev split may already
# have been seen during that earlier training.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Trainer for the pre-processed-text run: trains on the train split, scores on the dev split.
trainer2 = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune on the pre-processed train split; the returned TrainOutput is displayed as the cell result.
trainer2.train()

The following columns in the training set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw, __index_level_0__. If text with sw, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6851
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6856


Epoch,Training Loss,Validation Loss,F1
1,0.3919,0.460909,0.806052
2,0.3716,0.476734,0.827586
3,0.3274,0.387631,0.848
4,0.3068,0.496287,0.839763
5,0.2632,0.54457,0.828614
6,0.2535,0.580324,0.832827
7,0.2684,0.594724,0.835098
8,0.2076,0.637833,0.834575


The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw, __index_level_0__. If text with sw, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
Saving model checkpoint to ./save_model/bart-base-finetuned-tweet/checkpoint-857
Configuration saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/config.json
Model weights saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/pytorch_model.bin
tokenizer config file saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/tokenizer_config.json
Special tokens file saved in ./save_model/bart-base-finetuned-tweet/checkpoint-857/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been i

TrainOutput(global_step=6856, training_loss=0.2963775681503456, metrics={'train_runtime': 949.0163, 'train_samples_per_second': 57.752, 'train_steps_per_second': 7.224, 'total_flos': 1086120325688580.0, 'train_loss': 0.2963775681503456, 'epoch': 8.0})

In [None]:
# Get predictions for the unlabelled test split.
predictions = trainer2.predict(encoded_dataset_test)
# The model output may be a tuple with the class scores first (same handling as compute_metrics).
raw_outputs = predictions.predictions
logits = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs
preds = np.argmax(logits, axis=-1)

# Print up to the first 100 examples next to their predicted label.
# The text column is fetched once, not once per printed row.
n_preview = min(100, len(preds))
preview_texts = encoded_dataset_test['text with sw'][:n_preview]
for text, pred in zip(preview_texts, preds):
    print(text, pred, sep='\t')

The following columns in the test set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw. If text with sw are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3262
  Batch size = 8


just happened a terrible car crash	1
heard about earthquake is different cities  stay safe everyone 	1
there is a forest fire at spot pond  geese are fleeing across the street  i cannot save them all	1
apocalypse lighting  spokane wildfires	1
typhoon soudelor kills  in china and taiwan	1
we are shaking  it s an earthquake	1
they would probably still show more life than arsenal did yesterday  eh  eh 	0
hey  how are you 	0
what a nice hat 	0
fuck off 	0
no i do not like cold 	0
noo  do not do that 	0
no do not tell me that 	0
awesome 	0
birmingham wholesale market is ablaze bbc news   fire breaks out at birmingham s wholesale market	1
will you wear shorts for race ablaze  	0
previouslyondoyintv  toke makinwa ûªs marriage crisis sets nigerian twitter ablaze  	0
check these out  nsfw	0
psa  i ûªm splitting my personalities     techies follow co    burners follow	0
beware world ablaze sierra leone  amp  guap 	0
burning man ablaze  by turban diva via	1
not a diss song  people will take  thin