<a href="https://colab.research.google.com/github/respwill/disaster_tweet/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchmetrics -q
!pip install transformers -q
!pip install ray[tune] -q
!pip install wandb -q
!pip install lightning -q
!pip install contractions -q
!pip install unidecode -q

In [None]:
import wandb
import contractions
from bs4 import BeautifulSoup
import pandas as pd
import torch
from torch.utils.data import Dataset

from unidecode import unidecode
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk
from transformers import T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import lightning as pl

In [None]:
# The run is created with mode='disabled'; later cells read these values back
# through `wandb.config[...]` (model name, batch size, learning rate).
wandb.init(
    mode='disabled',
    project="disaster_tweet_classification2",
    name="Test46",
    config=dict(
        learning_rate=0.00001,
        data_size=1,
        batch_size=32,
        epochs=30,
        weight_decay=1e-3,
        model_name="t5-base",
    ),
)



In [None]:
# Fetch the NLTK resources used by the text-cleaning dataset defined below:
# the stop-word lists and the WordNet data loaded by the lemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

class textDataset(Dataset):
    """Map-style dataset that cleans tweets and tokenizes them as text-to-text pairs.

    Every item is the cleaned tweet prefixed with ``PROMPT_PREFIX`` (the model
    input) together with its label string (the generation target), both
    tokenized and padded to a fixed length.
    """

    # Instruction placed in front of every cleaned tweet before tokenization.
    PROMPT_PREFIX = 'Classify it either disaster or non-disaster: '
    # Fixed token length the label string is padded/truncated to.
    TARGET_MAX_LENGTH = 64

    def __init__(self, texts, labels, tokenizer, max_length, train_flag=True):
        """Clean the tweets once up front and store them next to their labels.

        Args:
            texts: DataFrame with a ``'tweet'`` column of raw strings. It is
                copied before cleaning, so the caller's frame is not modified.
            labels: Labels written to the ``'target'`` column of the cleaned
                frame (pandas aligns them with ``texts`` on the index).
            tokenizer: Tokenizer called on the prompt and on the label. When
                ``None``, one is loaded with ``AutoTokenizer`` from
                ``wandb.config['model_name']``.
            max_length: Token length the prompt is padded/truncated to.
            train_flag: Unused; accepted so existing callers keep working.
        """
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        self.texts = texts
        self.labels = labels
        self.max_length = max_length

        # Honour the tokenizer handed in by the caller. It used to be silently
        # replaced by a freshly loaded AutoTokenizer, which made the argument
        # meaningless; that loader is now only the fallback.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(wandb.config['model_name'])
        self.tokenizer = tokenizer

        # Clean a copy: callers usually pass an `.iloc` slice of a bigger
        # frame, and writing into it in place both mutates their data and
        # triggers pandas' SettingWithCopyWarning.
        self.tweets = self.text_preprocessing(self.texts.copy())
        self.tweets['target'] = self.labels

    def rem_urls(self, data):
        """Return ``data`` with http/https URLs removed."""
        return re.sub(r"https?://(www\.)?(\w+)(\.\w+)(/\w*)?", "", data)

    def rem_emails(self, data):
        """Return ``data`` with e-mail-like substrings removed."""
        # NOTE(review): `[.-_]` is a character *range* from '.' to '_' (it also
        # covers digits, upper-case letters, '@', '/', ...), probably meant as
        # `[._-]`. Left unchanged because the checkpoint was presumably trained
        # on text cleaned with this exact pattern - confirm before tightening.
        return re.sub(r"([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+", "", data)

    def rem_mention(self, data):
        """Return ``data`` with ``@name`` mentions removed."""
        return re.sub(r'@\w+', '', data)

    def rem_accent(self, data):
        """Transliterate ``data`` with ``unidecode``."""
        return unidecode(data)

    def rem_unicode(self, data):
        """Drop every character that cannot be encoded as ASCII."""
        return data.encode("ascii", "ignore").decode()

    def rem_punc(self, data):
        """Replace punctuation characters with a single space each."""
        # NOTE(review): inside the character class the `\]` from
        # string.punctuation is read as an escaped bracket, so a literal
        # backslash is not replaced. Kept as is for parity with the existing
        # cleaning; `re.escape(string.punctuation)` would cover it.
        return re.sub(f"[{string.punctuation}]", " ", data)

    def clean_numbers(self, data):
        """Mask digit runs with '#': 5+ digits -> '#####', then 4, 3 and 2 digits."""
        # Order matters: longer runs must be masked before shorter ones.
        data = re.sub('[0-9]{5,}', '#' * 5, data)
        data = re.sub('[0-9]{4}', '#' * 4, data)
        data = re.sub('[0-9]{3}', '#' * 3, data)
        data = re.sub('[0-9]{2}', '#' * 2, data)
        return data

    def rem_stopwords(self, data):
        """Drop whitespace-separated tokens that are in ``self.stop_words``."""
        return " ".join(word for word in str(data).split() if word not in self.stop_words)

    def rem_extra_space(self, data):
        """Collapse runs of spaces and strip leading/trailing whitespace."""
        return re.sub(' +', ' ', data).strip()

    def lemmatize_data(self, data):
        """Lemmatize every whitespace-separated token with ``self.lemmatizer``."""
        return ' '.join(self.lemmatizer.lemmatize(word) for word in data.split())

    def text_preprocessing(self, data):
        """Run the full cleaning pipeline over ``data['tweet']``.

        The column is overwritten on ``data`` itself and the same frame is
        returned, so pass a copy if the original text must be preserved.
        """
        tweet = data['tweet'].str.lower()
        tweet = tweet.apply(contractions.fix)
        tweet = tweet.apply(self.rem_urls)
        tweet = tweet.apply(self.rem_emails)
        tweet = tweet.apply(lambda x: BeautifulSoup(x).get_text())
        tweet = tweet.apply(self.rem_mention)
        # Emoticons -> words. `regex=True` is spelled out because pandas 2.0
        # changed the default to literal matching, which would turn these
        # three replacements into no-ops.
        tweet = tweet.str.replace(r':\(', 'sadness ', regex=True)
        tweet = tweet.str.replace(r':\)[$|\s]*', 'happiness ', regex=True)
        tweet = tweet.str.replace(r'\;\)[$|\s]*', 'happiness ', regex=True)
        tweet = tweet.apply(self.rem_accent)
        tweet = tweet.apply(self.rem_unicode)
        tweet = tweet.apply(self.rem_punc)
        tweet = tweet.apply(self.clean_numbers)
        tweet = tweet.apply(self.rem_stopwords)
        tweet = tweet.apply(self.rem_extra_space)
        tweet = tweet.apply(self.lemmatize_data)

        data['tweet'] = tweet
        return data

    def __len__(self):
        """Number of examples."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns:
            dict with ``input_ids`` / ``attention_mask`` for the prompt,
            ``labels_ids`` / ``label_attention_mask`` for the label string,
            and the raw label under ``true_label``.
        """
        # input
        # Build the prompt on the fly. Writing it back into `self.tweets` (the
        # previous behaviour) stacked one more prefix on every repeated access
        # of the same row, e.g. across epochs.
        text = self.PROMPT_PREFIX + self.tweets['tweet'].iloc[idx]
        encoding = self.tokenizer([text], return_tensors="pt", max_length=self.max_length,
                                  padding="max_length", truncation=True)
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # output
        label = self.tweets['target'].iloc[idx]
        targets = self.tokenizer([label], return_tensors="pt", max_length=self.TARGET_MAX_LENGTH,
                                 padding="max_length", truncation=True)
        labels = targets['input_ids'].flatten()
        label_attention_mask = targets['attention_mask'].flatten()

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels_ids': labels,
                'label_attention_mask': label_attention_mask, 'true_label': label}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
class textClassifier(pl.LightningModule):
    """Lightning module that wraps a pretrained T5 model and predicts the class name as text.

    Only the prediction path is implemented here: ``predict_step`` generates a
    short sequence per example and decodes it to a string.
    """

    def __init__(self, vocab_size=10_000, learning_rate=3e-4, cosine_t_max=100):
        """Load the pretrained model and tokenizer named in ``wandb.config['model_name']``.

        Args:
            vocab_size: Recorded through ``save_hyperparameters``; not otherwise used here.
            learning_rate: Recorded through ``save_hyperparameters``; not otherwise used here.
            cosine_t_max: Recorded as a hyperparameter and copied onto ``self.config``.
        """
        super().__init__()
        self.save_hyperparameters()
        model_name = wandb.config['model_name']
        # `self.config` is kept as an attribute only; the model below is built
        # directly from the pretrained name, not from this config object.
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.num_labels = 1
        self.config.cosine_t_max = cosine_t_max
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
        """Delegate to the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _shared_step(self, batch):
        """Generate and decode predictions for one batch.

        Args:
            batch: Mapping with ``input_ids`` and ``true_label``; when an
                ``attention_mask`` entry is present it is forwarded to
                ``generate``.

        Returns:
            Tuple ``(true_labels, predicted_labels, avg_prob_list)``. The third
            element is an empty-string placeholder kept so callers can still
            unpack three values.
        """
        generate_kwargs = {"max_new_tokens": 20}
        # The inputs are padded to a fixed length, so pass the mask explicitly
        # rather than relying on `generate` to infer it from the pad token id.
        if "attention_mask" in batch:
            generate_kwargs["attention_mask"] = batch["attention_mask"]
        pred = self.model.generate(batch["input_ids"], **generate_kwargs)

        predicted_labels = self.tokenizer.batch_decode(pred, skip_special_tokens=True)
        true_labels = batch["true_label"]

        # Placeholder: per-token probabilities are not computed.
        avg_prob_list = ''

        return true_labels, predicted_labels, avg_prob_list

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Lightning prediction hook; returns the tuple produced by ``_shared_step``."""
        true_labels, predicted_labels, avg_prob = self._shared_step(batch)
        return true_labels, predicted_labels, avg_prob

# Load data set

In [None]:
# Read the labelled tweets and derive the two single-column frames used below:
# `texts` (keyword + tweet text) and `labels` (class name as a string).
raw_tweets = pd.read_csv('/content/drive/MyDrive/ColoradoBoulder/Machine Learning/Deep learning/week4/nlp-getting-started/train.csv')

# A missing keyword becomes an empty string so the concatenation never yields NaN.
raw_tweets['keyword'] = raw_tweets['keyword'].fillna('')
raw_tweets['tweet'] = raw_tweets['keyword'] + ' ' + raw_tweets['text']

# Numeric target -> the literal class name the model is asked to generate.
raw_tweets['target'] = raw_tweets['target'].apply(lambda x: 'disaster' if (x==1) else 'non-disaster')

labels = raw_tweets[['target']]
texts = raw_tweets[['tweet']]

In [None]:
from torch.utils.data import DataLoader, random_split, Dataset, Subset
from sklearn.model_selection import train_test_split

In [None]:
# Tokenizer and evaluation subset. The split is a seeded 80/20 partition of the
# row positions; only the 20% part is evaluated in this notebook.
# NOTE(review): presumably the same split that was used for validation during
# training - confirm against the training notebook.
max_length = 128
tokenizer = T5Tokenizer.from_pretrained(wandb.config['model_name'], model_max_length=max_length)

row_positions = list(range(len(texts)))
train_idx, val_idx = train_test_split(row_positions, train_size=0.8, test_size=0.2, random_state=42)

test_texts = texts.iloc[val_idx]
test_labels = labels.iloc[val_idx]
test_dataset = textDataset(test_texts, test_labels, tokenizer, max_length)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  data.l

In [None]:
class textDatamodule(pl.LightningDataModule):
    """Data module that serves a single dataset for prediction."""

    def __init__(self, test_dataset, batch_size):
        """Keep the dataset to predict on and the batch size of its loader."""
        super().__init__()
        self.batch_size = batch_size
        self.test_dataset = test_dataset

    def predict_dataloader(self):
        """Return a DataLoader over ``self.test_dataset`` with ``self.batch_size``."""
        loader = DataLoader(dataset=self.test_dataset, batch_size=self.batch_size)
        return loader

In [None]:
# Wrap the evaluation dataset; the batch size comes from the run config.
data_module = textDatamodule(
    test_dataset=test_dataset,
    batch_size=wandb.config['batch_size'],
)

# Load trained model

In [None]:
# Drop a model left over from an earlier run of the cells below. On a fresh
# kernel the name does not exist yet, which is the only error worth ignoring;
# a bare `except:` would also have swallowed KeyboardInterrupt and real bugs.
try:
    del text_encoder
except NameError:
    pass

In [None]:
# Build the module, then overwrite its weights with the fine-tuned checkpoint
# (tensors are mapped to CPU while loading).
text_encoder = textClassifier(
    learning_rate=wandb.config['learning_rate'],
    vocab_size=tokenizer.vocab_size,
)
checkpoint = torch.load(
    "/content/drive/MyDrive/ColoradoBoulder/Machine Learning/Deep learning/week4/Test52-epoch=2-train_loss=0.0728-val_loss=0.0847.ckpt",
    map_location=torch.device('cpu'),
)
# strict=False tolerates missing/unexpected keys, so check the displayed result
# of this call before trusting the metrics further down.
text_encoder.load_state_dict(checkpoint['state_dict'], strict=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

In [None]:
# Single-device CPU trainer; it is only used to drive `predict`.
trainer = pl.Trainer(devices=1, accelerator='cpu')

INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
from torch.utils.data import DataLoader, TensorDataset, Subset

In [None]:
# test_data_loader = DataLoader(Subset(data_module.test_dataset, [0,1,2,3,4,5]), batch_size=3)

In [None]:
# prediction = trainer.predict(text_encoder, test_data_loader)

In [None]:
# Run the prediction loop over the data module's predict dataloader; the result
# holds one `predict_step` return value per batch.
prediction = trainer.predict(model=text_encoder, datamodule=data_module)

Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
import pickle

# Persist the raw prediction output so the metric cells below can be re-run
# without repeating the prediction pass.
with open('prediction.pkl', 'wb') as out_file:
    pickle.dump(prediction, out_file)

In [None]:
# Sanity check: number of examples in the dataset served by the predict dataloader.
len(data_module.test_dataset)

In [None]:
# Reload the predictions written by the cell above. Unpickling executes
# arbitrary code, so only ever load a file this notebook produced itself.
with open('prediction.pkl', 'rb') as in_file:
    prediction = pickle.load(in_file)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Flatten the per-batch (true_labels, predicted_labels, placeholder) tuples
# into two flat lists that stay aligned element by element.
labels = [true for batch_true, _, _ in prediction for true in batch_true]
preds = [guess for _, batch_pred, _ in prediction for guess in batch_pred]
# Stays empty: the third tuple element is only a placeholder.
prob_list = []

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 of the generated label strings.
report = classification_report(y_true=labels, y_pred=preds)

In [None]:
# The report is a pre-formatted multi-line string, so print it to keep the layout.
print(report)

              precision    recall  f1-score   support

    disaster       0.90      0.64      0.75       649
non-disaster       0.78      0.95      0.86       874

    accuracy                           0.82      1523
   macro avg       0.84      0.79      0.80      1523
weighted avg       0.83      0.82      0.81      1523



In [None]:
# Side-by-side table of true and predicted labels for inspecting errors.
pred_df = pd.DataFrame(data=dict(labels=labels, preds=preds))

# Summary
|Model|Trial |Accuracy|F1 score|
|-----|------|--------|--------|
|T5   |test48|0.78    |0.77    |
|T5   |test49|0.83    |0.83    |
|T5   |test52|0.82    |0.81    |