In [27]:
import numpy as np
import pandas as pd

import spacy
nlp = spacy.load("en_core_web_sm")
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import AutoModelForSequenceClassification 

In [212]:
train_df = pd.read_csv('events_set1/canada_wildfires_2016/canada_wildfires_2016_train.tsv',sep='\t')
test_df = pd.read_csv('events_set1/canada_wildfires_2016/canada_wildfires_2016_test.tsv',sep='\t')

In [213]:
train_df.head()

Unnamed: 0,tweet_id,tweet_text,class_label
0,735891446960623616,RT @DonBradshawNTV: How @MarshallAmpsUK came t...,other_relevant_information
1,731202020296818688,Red Cross distributes $30M to Fort McMurray wi...,displaced_people_and_evacuations
2,733665357236342784,Interesting insights on the shifting communica...,other_relevant_information
3,731963038429929472,RT @globeandmail: Oil sands producers helping ...,rescue_volunteering_or_donation_effort
4,728674838034944001,Ottawa to match Red Cross donations for Fort M...,rescue_volunteering_or_donation_effort


In [214]:
disasters = ["caution_and_advice",
             "displaced_people_and_evacuations",
             "infrastructure_and_utility_damage",
             "injured_or_dead_people"]
for i in range(len(train_df)):
    if(train_df.loc[i,'class_label'] in disasters):
        train_df.loc[i,'labels'] = 1
    else:
        train_df.loc[i,'labels'] = 0
        
for i in range(len(test_df)):
    if(test_df.loc[i,'class_label'] in disasters):
        test_df.loc[i,'labels'] = 1
    else:
        test_df.loc[i,'labels'] = 0



In [215]:
train_df['labels'] = train_df['labels'].astype(int)
test_df['labels'] = test_df['labels'].astype(int)

In [216]:
train_df.rename(columns={'tweet_text': 'text'}, inplace=True)
test_df.rename(columns={'tweet_text': 'text'}, inplace=True)
train_df.rename(columns={'tweet_id': 'input_ids'}, inplace=True)
test_df.rename(columns={'tweet_id': 'input_ids'}, inplace=True)

train_df.drop('class_label', axis=1, inplace=True)
test_df.drop('class_label', axis=1, inplace=True)


In [217]:
test_df.head()

Unnamed: 0,input_ids,text,labels
0,728674116773904384,RT @FoothillsFCU23: In response the to the #Fo...,0
1,729787427829612544,Redcross is offering charitable donation recei...,0
2,730510385544085505,RT @globeandmail: Red Cross to transfer $50-mi...,0
3,733705874594746368,Live: Emergency operations briefing on north A...,0
4,730606066023665665,"$9bn fire damage to Fort McMurray, â€˜the beastâ€™...",1


In [218]:
train_df.head()

Unnamed: 0,input_ids,text,labels
0,735891446960623616,RT @DonBradshawNTV: How @MarshallAmpsUK came t...,0
1,731202020296818688,Red Cross distributes $30M to Fort McMurray wi...,1
2,733665357236342784,Interesting insights on the shifting communica...,0
3,731963038429929472,RT @globeandmail: Oil sands producers helping ...,0
4,728674838034944001,Ottawa to match Red Cross donations for Fort M...,0


In [219]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [220]:
import torch
from datasets import Dataset

In [221]:
train_dataset = Dataset.from_pandas(train_df, split="train")
test_dataset = Dataset.from_pandas(test_df, split="test")

In [222]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

In [223]:
# This function tokenizes the input text using the RoBERTa tokenizer. 
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

Map:   0%|          | 0/1569 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

In [224]:
print(test_dataset)

Dataset({
    features: ['input_ids', 'text', 'labels', 'attention_mask'],
    num_rows: 445
})


In [225]:
# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

class_names = ["caution_and_advice",
             "displaced_people_and_evacuations",
             "Dont know cant judge",
             "infrastructure_and_utility_damage",
             "injured_or_dead_people",
             "missing_or_found_people",
             "not_humanitarian",
             "other_relevant_information",
             "requests_or_urgent_needs",
             "rescue_volunteering_or_donation_effort",
             "sympathy_and_support",
            ]
id2label = {i: label for i, label in enumerate(class_names)}

In [226]:
model_id = "roberta-base"

In [129]:
for i, label in enumerate(class_names):
    print(i)
    print(label)

0
caution_and_advice
1
displaced_people_and_evacuations
2
Dont know cant judge
3
infrastructure_and_utility_damage
4
injured_or_dead_people
5
missing_or_found_people
6
not_humanitarian
7
other_relevant_information
8
requests_or_urgent_needs
9
rescue_volunteering_or_donation_effort
10
sympathy_and_support


In [227]:
config = AutoConfig.from_pretrained(model_id, num_labels=2)

In [228]:
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [229]:
repository_id = "tweets_analysis_test"

In [233]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    remove_unused_columns=False,
)

In [244]:
from datasets import load_metric

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [245]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [246]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0002,0.77519,0.91236,0.868687
2,0.0012,1.249159,0.81573,0.774725
3,0.4155,0.495425,0.876404,0.798535
4,0.676,0.634636,0.67191,0.0
5,0.6083,0.642982,0.67191,0.0


  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Checkpoint destination directory tweets_analysis_test/checkpoint-197 already exists and is non-empty.Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory tweets_analysis_test/checkpoint-394 already exists and is non-empty.Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.


In [248]:
trainer.evaluate()