In [None]:
! pip install transformers datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.2.0-py3-none-any.whl (342 kB)
[K     |████████████████████████████████| 342 kB 50.8 MB/s 
[?25hCollecting ray[tune]
  Downloading ray-1.12.0-cp37-cp37m-manylinux2014_x86_64.whl (53.2 MB)
[K     |████████████████████████████████| 53.2 MB 90 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 32.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 35.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
import random
import os
import torch
from torch import nn
from transformers import Trainer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import datasets
from datasets import Dataset, load_metric
import transformers
from transformers import BartTokenizer, BartForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

In [None]:
# Load the BART-large tokenizer and a 2-label sequence-classification model.
model_checkpoint = 'facebook/bart-large'
# Fall back to CPU when no GPU is visible instead of crashing on a hard-coded 'cuda'.
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
# The classification head is not part of the checkpoint and is newly initialised,
# so the model has to be fine-tuned before its predictions mean anything.
model = BartForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/971M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Large pre-trained BART on raw data

In [None]:
# Load the raw tweets. The training target column is renamed to 'label',
# which is the column name the later formatting cells select.
train_path = './train.csv'
train_data = (
    pd.read_csv(train_path)[['text', 'target']]
    .rename(columns={"target": "label"})
)

# Only the text column is kept for the test file, so there is no 'target'
# column left to rename (the previous rename here was a no-op).
test_path = './test.csv'
test_data = pd.read_csv(test_path)[['text']]

In [None]:
# Hold out 10% of the labelled data as a dev set, stratified on the label.
# A fixed random_state makes the split (and therefore the reported dev F1)
# reproducible across kernel restarts.
train_data, dev_data = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data['label'],
    random_state=42,
)

print("Train dataset labels count = ", Counter(train_data['label']))
print("Dev dataset labels count = ", Counter(dev_data['label']))
# The test dataset does not contain the target label, so there is no label count to report for it.

Train dataset labels count =  Counter({0: 3907, 1: 2944})
Dev dataset labels count =  Counter({0: 435, 1: 327})


In [None]:
# (Re-)instantiate the tokenizer and a fresh 2-label classifier from the
# pre-trained checkpoint right before fine-tuning on the raw data.
model_checkpoint = 'facebook/bart-large'
# Fall back to CPU when no GPU is visible instead of crashing on a hard-coded 'cuda'.
device = 'cuda' if cuda.is_available() else 'cpu'
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)
model = BartForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/971M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Args:
        examples: mapping that provides the raw strings under the 'text' key.

    Returns:
        Whatever the tokenizer returns for those strings with truncation enabled.
    """
    raw_texts = examples['text']
    encoded = tokenizer(raw_texts, truncation=True)
    return encoded

In [None]:
# Wrap the pandas frames as Hugging Face Datasets.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column (the Trainer log otherwise reports it as ignored).
train_data = Dataset.from_pandas(train_data, preserve_index=False)
dev_data = Dataset.from_pandas(dev_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)

# Tokenize each split in batches; the tokenizer outputs are added as new columns.
encoded_dataset_train = train_data.map(preprocess_function, batched=True)
encoded_dataset_dev = dev_data.map(preprocess_function, batched=True)
encoded_dataset_test = test_data.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Columns each encoded split should return: the labelled splits include
# 'label', the test split has none.
columns_to_return_test = ['input_ids', 'attention_mask']
columns_to_return = ['input_ids', 'label', 'attention_mask']

for _labelled_split in (encoded_dataset_train, encoded_dataset_dev):
    _labelled_split.set_format(columns=columns_to_return)
encoded_dataset_test.set_format(columns=columns_to_return_test)

In [None]:
# Settings shared by the metric function, the training arguments and the output path.
task = 'tweet'
metric_name = "f1"
batch_size = 8
# Last path component of the checkpoint id, i.e. the part after the final '/'.
model_name = model_checkpoint.rsplit("/", 1)[-1]

def compute_metrics(p):
    """Score predictions with the metric named by `metric_name`.

    Args:
        p: object exposing `predictions` (an array of per-class scores, or a
            tuple whose first element is that array) and `label_ids`.

    Returns:
        The result of the metric's `compute` call on the arg-max class ids
        versus `p.label_ids`.
    """
    scorer = load_metric(metric_name)
    # Some models return extra outputs; the class scores are then the first item.
    if isinstance(p.predictions, tuple):
        scores = p.predictions[0]
    else:
        scores = p.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return scorer.compute(predictions=predicted_ids, references=p.label_ids)

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best `metric_name` score when training finishes.
args = TrainingArguments(
    output_dir=f"./save_model/{model_name}-finetuned-{task}",
    logging_dir='./logs',
    push_to_hub=False,
    # optimisation
    num_train_epochs=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

# Tie the model, the training configuration, the encoded splits and the metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune on the raw-text training split. Per the TrainingArguments above this
# runs 4 epochs, evaluating on the dev split and saving a checkpoint after each one.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6851
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3428


Epoch,Training Loss,Validation Loss,F1
1,0.4686,0.426675,0.808642
2,0.3704,0.55996,0.801262
3,0.3212,0.561971,0.806349
4,0.2891,0.68846,0.794349


The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Saving model checkpoint to ./save_model/bart-large-finetuned-tweet/checkpoint-857
Configuration saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/config.json
Model weights saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/pytorch_model.bin
tokenizer config file saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/tokenizer_config.json
Special tokens file saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
Saving model checkpoint to ./save_model/bart-large-finetuned-tweet/checkpoint-1714
Configuration saved in ./save_model/bart-large-finetuned-tweet/chec

TrainOutput(global_step=3428, training_loss=0.36328108975600754, metrics={'train_runtime': 1765.7468, 'train_samples_per_second': 15.52, 'train_steps_per_second': 1.941, 'total_flos': 2925590038130520.0, 'train_loss': 0.36328108975600754, 'epoch': 4.0})

In [None]:
# Get predictions for the unlabelled test set.
predictions = trainer.predict(encoded_dataset_test)
# The raw output may be a tuple with the class scores first (same guard as in
# compute_metrics); otherwise it already is the score array.
raw_output = predictions.predictions
logits = raw_output[0] if isinstance(raw_output, tuple) else raw_output
preds = np.argmax(logits, axis=-1)
# Print (up to) the first 100 examples. The text column is read once rather than
# on every iteration, and zip() stops early if there are fewer than 100 rows.
test_texts = encoded_dataset_test['text']
for text, pred in zip(test_texts[:100], preds):
    print(text, pred, sep='\t')

The following columns in the test set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3263
  Batch size = 8


Just happened a terrible car crash	1
Heard about #earthquake is different cities, stay safe everyone.	1
there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all	1
Apocalypse lighting. #Spokane #wildfires	1
Typhoon Soudelor kills 28 in China and Taiwan	1
We're shaking...It's an earthquake	1
They'd probably still show more life than Arsenal did yesterday, eh? EH?	0
Hey! How are you?	0
What a nice hat?	0
Fuck off!	0
No I don't like cold!	0
NOOOOOOOOO! Don't do that!	0
No don't tell me that!	0
What if?!	0
Awesome!	0
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU	1
@sunkxssedharry will you wear shorts for race ablaze ?	0
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI	0
Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw	0
PSA: IÛªm splitting my personali

In [None]:
# Optional, disabled: archive checkpoint-857 (the checkpoint saved after the first
# epoch of the raw-data run) into a single zip file.
# NOTE(review): the absolute /content/... paths presumably target a Colab runtime — confirm before enabling.
#!zip -r /content/bart-notuning.zip /content/save_model/bart-large-finetuned-tweet/checkpoint-857

### Large pre-trained BART on pre-processed data

In [None]:
# Load the pre-processed tweets. The training target column is renamed to 'label',
# which is the column name the later formatting cells select.
# NOTE(review): 'text with sw' presumably means "text with stop words" — confirm against the cleaning step.
train_path = './clean_train.csv'
train_data = (
    pd.read_csv(train_path)[['text with sw', 'target']]
    .rename(columns={"target": "label"})
)

# Only the text column is kept for the test file, so there is no 'target'
# column left to rename (the previous rename here was a no-op).
test_path = './clean_test.csv'
test_data = pd.read_csv(test_path)[['text with sw']]

# Stratified 90/10 train/dev split; the fixed random_state makes it reproducible.
train_data, dev_data = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data['label'],
    random_state=42,
)

In [None]:
def preprocess_function(examples):
    """Tokenize the 'text with sw' field of a batch with the notebook-level tokenizer.

    Args:
        examples: mapping that provides the strings under the 'text with sw' key.

    Returns:
        Whatever the tokenizer returns for those strings with truncation enabled.
    """
    cleaned_texts = examples['text with sw']
    encoded = tokenizer(cleaned_texts, truncation=True)
    return encoded

In [None]:
# Wrap the pandas frames as Hugging Face Datasets.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column (the Trainer log otherwise reports it as ignored).
train_data = Dataset.from_pandas(train_data, preserve_index=False)
dev_data = Dataset.from_pandas(dev_data, preserve_index=False)
test_data = Dataset.from_pandas(test_data, preserve_index=False)

# Tokenize each split in batches; the tokenizer outputs are added as new columns.
encoded_dataset_train = train_data.map(preprocess_function, batched=True)
encoded_dataset_dev = dev_data.map(preprocess_function, batched=True)
encoded_dataset_test = test_data.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Columns each encoded split should return: the labelled splits include
# 'label', the test split has none.
columns_to_return_test = ['input_ids', 'attention_mask']
columns_to_return = ['input_ids', 'label', 'attention_mask']

for _labelled_split in (encoded_dataset_train, encoded_dataset_dev):
    _labelled_split.set_format(columns=columns_to_return)
encoded_dataset_test.set_format(columns=columns_to_return_test)

In [None]:
# Settings shared by the metric function, the training arguments and the output path.
batch_size = 8
metric_name = "f1"
model_name = model_checkpoint.split("/")[-1]
# Use a task tag distinct from the raw-data run ('tweet'): the tag is part of the
# output directory, and with the same tag this run overwrites the raw-data
# run's checkpoints (both write .../bart-large-finetuned-tweet/checkpoint-857 etc.).
task = 'tweet-clean'

def compute_metrics(p):
    """Score predictions with the metric named by `metric_name`.

    Args:
        p: object exposing `predictions` (an array of per-class scores, or a
            tuple whose first element is that array) and `label_ids`.

    Returns:
        The result of the metric's `compute` call on the arg-max class ids
        versus `p.label_ids`.
    """
    scorer = load_metric(metric_name)
    # Some models return extra outputs; the class scores are then the first item.
    if isinstance(p.predictions, tuple):
        scores = p.predictions[0]
    else:
        scores = p.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return scorer.compute(predictions=predicted_ids, references=p.label_ids)

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best `metric_name` score when training finishes.
args = TrainingArguments(
    output_dir=f"./save_model/{model_name}-finetuned-{task}",
    logging_dir='./logs',
    push_to_hub=False,
    # optimisation
    num_train_epochs=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

# Start this experiment from the pre-trained checkpoint again. Without this, `model`
# is still the object that was fine-tuned on the raw data in the previous section,
# so the "pre-processed data" scores would come from a twice-trained model that may
# already have seen this run's dev examples during the earlier training.
# Drop the references to the previous run first so its memory can be reclaimed
# before the fresh copy is allocated.
trainer = None
model = None
if cuda.is_available():
    cuda.empty_cache()
model = BartForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(
    'cuda' if cuda.is_available() else 'cpu'
)

# Tie the fresh model, the training configuration, the encoded splits and the metric together.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune on the pre-processed training split. Per the TrainingArguments above this
# runs 4 epochs, evaluating on the dev split and saving a checkpoint after each one.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw, __index_level_0__. If text with sw, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6851
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3428


Epoch,Training Loss,Validation Loss,F1
1,0.4937,0.380164,0.810127
2,0.3782,0.412318,0.805687
3,0.3434,0.464731,0.824615
4,0.3024,0.508325,0.81141


The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw, __index_level_0__. If text with sw, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Saving model checkpoint to ./save_model/bart-large-finetuned-tweet/checkpoint-857
Configuration saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/config.json
Model weights saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/pytorch_model.bin
tokenizer config file saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/tokenizer_config.json
Special tokens file saved in ./save_model/bart-large-finetuned-tweet/checkpoint-857/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw, __index_level_0__. If text with sw, __index_level_0__ are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
Saving model checkpoint to ./save_model/bart-large-finetuned-tweet/checkpoint-1714
Configuration saved in ./save_model/bart-large-fine

TrainOutput(global_step=3428, training_loss=0.3782220304081849, metrics={'train_runtime': 1002.9143, 'train_samples_per_second': 27.324, 'train_steps_per_second': 3.418, 'total_flos': 1933465440424380.0, 'train_loss': 0.3782220304081849, 'epoch': 4.0})

In [None]:
# Get predictions for the unlabelled, pre-processed test set.
predictions = trainer.predict(encoded_dataset_test)
# The raw output may be a tuple with the class scores first (same guard as in
# compute_metrics); otherwise it already is the score array.
raw_output = predictions.predictions
logits = raw_output[0] if isinstance(raw_output, tuple) else raw_output
preds = np.argmax(logits, axis=-1)
# Print (up to) the first 100 examples. The text column is read once rather than
# on every iteration, and zip() stops early if there are fewer than 100 rows.
test_texts = encoded_dataset_test['text with sw']
for text, pred in zip(test_texts[:100], preds):
    print(text, pred, sep='\t')

The following columns in the test set  don't have a corresponding argument in `BartForSequenceClassification.forward` and have been ignored: text with sw. If text with sw are not expected by `BartForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3262
  Batch size = 8


just happened a terrible car crash	1
heard about earthquake is different cities  stay safe everyone 	1
there is a forest fire at spot pond  geese are fleeing across the street  i cannot save them all	1
apocalypse lighting  spokane wildfires	1
typhoon soudelor kills  in china and taiwan	1
we are shaking  it s an earthquake	1
they would probably still show more life than arsenal did yesterday  eh  eh 	0
hey  how are you 	0
what a nice hat 	0
fuck off 	0
no i do not like cold 	0
noo  do not do that 	0
no do not tell me that 	0
awesome 	0
birmingham wholesale market is ablaze bbc news   fire breaks out at birmingham s wholesale market	1
will you wear shorts for race ablaze  	0
previouslyondoyintv  toke makinwa ûªs marriage crisis sets nigerian twitter ablaze  	0
check these out  nsfw	0
psa  i ûªm splitting my personalities     techies follow co    burners follow	0
beware world ablaze sierra leone  amp  guap 	1
burning man ablaze  by turban diva via	1
not a diss song  people will take  thin