In [1]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, random_split, Dataset, Subset
# from torchtext import data
# from torchvision import transforms, models
from torchmetrics import Accuracy
import pytorch_lightning as pl
from pytorch_lightning.utilities.model_summary import ModelSummary
from pytorch_lightning.loggers import WandbLogger
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
import wandb
import torchmetrics
import sys
from ray.tune.integration.pytorch_lightning import TuneReportCallback
from ray import tune
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

  warn(f"Failed to load image Python extension: {e}")


In [2]:
wandb.init(
      mode='disabled',
      # Set the project where this run will be logged
      project="disaster_tweet_classification", 
      # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
      name=f"Test7", 
      # Track hyperparameters and run metadata
      config={
      "learning_rate": 0.000003,
      "data_size": 1,
      "batch_size":32,
        "model_name":"distilbert-base-uncased"#"cardiffnlp/twitter-xlm-roberta-base-sentiment" #
      })
wandb_logger = WandbLogger()

  rank_zero_warn(


# Lightning moduel

In [3]:
class textClassifier(pl.LightningModule):
    def __init__(self, vocab_size=10_000, learning_rate=3e-4):
        super().__init__()
        self.save_hyperparameters()
        self.accuracy = Accuracy(task='binary')
        self.config = AutoConfig.from_pretrained(wandb.config['model_name'])
        self.config.num_labels = 1
        self.hugging_face_model = AutoModelForSequenceClassification.from_config(self.config)
#         self.hugging_face_model = AutoModelForSequenceClassification.from_pretrained(wandb.config['model_name'], num_labels=1)
        
    def forward(self, **inputs):
        return self.hugging_face_model(**inputs)
    
    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self(**batch)
    
    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        logits = outputs[1]
        pred = logits.squeeze()
        labels = batch['labels'].flatten().to(torch.int64)
        acc = self.accuracy(pred, labels)
        self.log("train_loss", loss, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_epoch=True, prog_bar=True)
        
        return {'loss':loss, 'pred':pred}
        
    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        logits = outputs[1]
        pred = logits.squeeze()
        labels = batch['labels'].flatten().to(torch.int64)
        acc = self.accuracy(pred, labels)
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        
        return {'val_loss':loss, 'pred':pred}
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adamax(self.parameters(), lr=self.hparams.learning_rate)
        return [optimizer]    

# Dataset

In [4]:
import contractions
from bs4 import BeautifulSoup
from unidecode import unidecode
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

class textDataset(Dataset):
    def __init__(self, data_dir):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tweets = pd.read_csv(data_dir)
        self.tweets['keyword'] = self.tweets['keyword'].fillna('')
        self.tweets['tweet'] = self.tweets['keyword'] + ' ' + self.tweets['text']
        self.tweets = self.text_preprocessing(self.tweets)
        self.tokenizer = AutoTokenizer.from_pretrained(wandb.config['model_name'])
    
    def rem_urls(self, data):
        regex = re.compile(f"https?://(www\.)?(\w+)(\.\w+)(/\w*)?")
        data = re.sub(regex, "", data)
        return data
    
    def rem_emails(self, data):
        regex = re.compile("([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+")
        data = re.sub(regex, "", data)
        return data
    
    def rem_mention(self, data):
        regex = re.compile('@\w+')
        data = re.sub(regex, '', data)
        return data
    
    def rem_accent(self, data):
        data = unidecode(data)
        return data
    
    def rem_unicode(self, data):
        data = data.encode("ascii", "ignore").decode()
        return data
    
    def rem_punc(self, data):
        data = re.sub(f"[{string.punctuation}]", " ", data)
        return data
    
    def clean_numbers(self, data):
        data = re.sub('[0-9]{5,}', '#'*5, data)
        data = re.sub('[0-9]{4}', '#'*4, data)
        data = re.sub('[0-9]{3}', '#'*3, data)
        data = re.sub('[0-9]{2}', '#'*2, data)
        return data
    
    
    def rem_stopwords(self, data):
        return " ".join([word for word in str(data).split() if word not in self.stop_words])
    
    def rem_extra_space(self, data):
        data = re.sub(' +', ' ', data).strip()
        return data
    
    
    def lemmatize_data(self, data):
        words = [self.lemmatizer.lemmatize(word) for word in data.split()]
        data = ' '.join(words)
        return data
    
    def text_preprocessing(self, data):
        data['tweet'] = data['tweet'].str.lower()
        data['tweet'] = data['tweet'].apply(contractions.fix)
        data['tweet'] = data['tweet'].apply(self.rem_urls)
        data['tweet'] = data['tweet'].apply(self.rem_emails)
        data['tweet'] = data['tweet'].apply(lambda x: BeautifulSoup(x).get_text())
        data['tweet'] = data['tweet'].apply(self.rem_mention)
        data['tweet'] = data['tweet'].str.replace(':\(', 'sadness ')
        data['tweet'] = data['tweet'].str.replace(r':\)[$|\s]*', 'happiness ')
        data['tweet'] = data['tweet'].str.replace(r'\;\)[$|\s]*', 'happiness ')
        data['tweet'] = data['tweet'].apply(self.rem_accent)
        data['tweet'] = data['tweet'].apply(self.rem_unicode)
        data['tweet'] = data['tweet'].apply(self.rem_punc)
        data['tweet'] = data['tweet'].apply(self.clean_numbers)
        data['tweet'] = data['tweet'].apply(self.rem_stopwords)
        data['tweet'] = data['tweet'].apply(self.rem_extra_space)
        data['tweet'] = data['tweet'].apply(self.lemmatize_data)
        return data
        
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, idx):
        text = self.tweets.iloc[idx, -1]
        encoding = self.tokenizer.batch_encode_plus([text], return_tensors="pt", max_length=128, pad_to_max_length=True)
        input_ids = encoding['input_ids'].flatten()#.to(torch.float32)
        attention_mask = encoding['attention_mask'].flatten()
        label = torch.tensor([self.tweets.iloc[idx, -2]]).to(torch.float32)
        
#         label = torch.tensor(label, dtype=torch.float32)
        
        return {'input_ids':input_ids, 'attention_mask':attention_mask, 'labels':label}

In [5]:
def train_val_dataset(dataset, train_split=0.75, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), train_size=train_split, test_size=val_split)
    datasets = {'train':Subset(dataset, train_idx), 'val':Subset(dataset, val_idx)}
    return datasets

In [6]:
class textDatamodule(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.tokenizer = None
    def setup(self, stage=None):
        data = textDataset("./nlp-getting-started/train.csv")
        self.tokenizer = data.tokenizer
        data = train_val_dataset(data)
        self.train_dataset = data['train']
        self.val_dataset = data['val']
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)
    
    def predict_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

In [7]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='./check_point/',
    filename='{epoch}-{train_loss:.4f}-{val_loss:.4f}',
    monitor='val_loss',
    mode='min',
    save_top_k=2
)

In [8]:
trainer = pl.Trainer(accelerator='gpu',
                    devices=1,
                    max_epochs=1000,
                    logger=wandb_logger,
                    callbacks=[checkpoint_callback]
                    )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [9]:
dm = textDatamodule(batch_size=wandb.config['batch_size'])
dm.setup()

  data['tweet'] = data['tweet'].str.replace(':\(', 'sadness ')
  data['tweet'] = data['tweet'].str.replace(r':\)[$|\s]*', 'happiness ')
  data['tweet'] = data['tweet'].str.replace(r'\;\)[$|\s]*', 'happiness ')


In [10]:
text_encoder = textClassifier(vocab_size=dm.tokenizer.vocab_size, 
                              learning_rate=wandb.config['learning_rate'])
checkpoint = torch.load("./check_point/distil_bert_uncased/epoch=35-train_loss=0.1043-val_loss=0.1219.ckpt")
text_encoder.load_state_dict(checkpoint['state_dict'], strict=False)

_IncompatibleKeys(missing_keys=[], unexpected_keys=['lstm1.weight_ih_l0', 'lstm1.weight_hh_l0', 'lstm1.bias_ih_l0', 'lstm1.bias_hh_l0', 'lstm1.weight_ih_l0_reverse', 'lstm1.weight_hh_l0_reverse', 'lstm1.bias_ih_l0_reverse', 'lstm1.bias_hh_l0_reverse', 'lstm1.weight_ih_l1', 'lstm1.weight_hh_l1', 'lstm1.bias_ih_l1', 'lstm1.bias_hh_l1', 'lstm1.weight_ih_l1_reverse', 'lstm1.weight_hh_l1_reverse', 'lstm1.bias_ih_l1_reverse', 'lstm1.bias_hh_l1_reverse', 'lstm2.weight_ih_l0', 'lstm2.weight_hh_l0', 'lstm2.bias_ih_l0', 'lstm2.bias_hh_l0', 'lstm2.weight_ih_l0_reverse', 'lstm2.weight_hh_l0_reverse', 'lstm2.bias_ih_l0_reverse', 'lstm2.bias_hh_l0_reverse', 'lstm2.weight_ih_l1', 'lstm2.weight_hh_l1', 'lstm2.bias_ih_l1', 'lstm2.bias_hh_l1', 'lstm2.weight_ih_l1_reverse', 'lstm2.weight_hh_l1_reverse', 'lstm2.bias_ih_l1_reverse', 'lstm2.bias_hh_l1_reverse', 'lstm3.weight_ih_l0', 'lstm3.weight_hh_l0', 'lstm3.bias_ih_l0', 'lstm3.bias_hh_l0', 'lstm3.weight_ih_l0_reverse', 'lstm3.weight_hh_l0_reverse', 'lst

In [11]:
prediction = trainer.predict(text_encoder, dm)

  data['tweet'] = data['tweet'].str.replace(':\(', 'sadness ')
  data['tweet'] = data['tweet'].str.replace(r':\)[$|\s]*', 'happiness ')
  data['tweet'] = data['tweet'].str.replace(r'\;\)[$|\s]*', 'happiness ')
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
val_labels = []
for t in dm.val_dataloader():
    val_labels.extend(t['labels'])
val_labels = torch.cat(val_labels)

In [13]:
val_preds = []
for p in prediction:
    val_preds.extend(p['logits'])
val_preds = torch.cat(val_preds)

In [14]:
val_preds = torch.round(val_preds)

# Accuray on validation set

In [15]:
from sklearn.metrics import accuracy_score, f1_score

In [16]:
acc = accuracy_score(val_labels, val_preds)
f1 = f1_score(val_labels, val_preds)

In [17]:
acc, f1

(0.8581932773109243, 0.8333333333333334)

# AUC curve

# Result Summary

|Model|Accuracy|F1 score|Dataset|Threshold|
|----|---------|--------|-------|---------|
|distilbert-base-uncased|0.86|0.83|Validation|0.5|
|cardiffnlp/twitter-xlm-roberta-base-sentiment|0.85|0.81|Validation|0.5|
