In [1]:
from imports import *
from utils import *
import torch.nn as nn
from transformers import RobertaModel
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
import pytorch_lightning as pl
from transformers import RobertaTokenizerFast
from torch.utils.data import DataLoader
import random
import transformers

import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)

In [2]:
class Config:
    """Notebook-wide settings, read everywhere as class attributes (``Config.X``)."""

    # --- data -------------------------------------------------------------
    PATH = 'training.1600000.processed.noemoticon.csv'
    MAX_LEN = 64        # handed to the tokenizer as max_length
    test_ratio = 0.1    # fraction of shuffled indices held out when the loaders are split
    train_ratio = 0.9   # not referenced by the cells shown here
    num_workers = 8     # not referenced by the cells shown here

    # --- optimisation -----------------------------------------------------
    LR = 1e-5
    BATCH_SIZE = 64
    SEED = 42           # default seed for seed_everything()

    # --- model ------------------------------------------------------------
    roberta_model = "roberta-base"
    # Instantiated once, when the class body is executed, and shared from then on.
    tokenizer = RobertaTokenizerFast.from_pretrained(roberta_model)

def log_execution(func):
    """Decorator that emits an INFO log line before and after each call to ``func``.

    The returned callable carries ``func``'s metadata (``functools.wraps``) and
    passes ``func``'s return value through unchanged. If ``func`` raises, the
    exception propagates and the "Finished" line is not logged.
    """
    def logged_call(*args, **kwargs):
        logging.info(f"Executing {func.__name__}")
        outcome = func(*args, **kwargs)
        logging.info(f"Finished executing {func.__name__}")
        return outcome

    # Same effect as decorating logged_call with @functools.wraps(func).
    return functools.wraps(func)(logged_call)

def timing_decorator(func):
    """Decorator that prints how long each call to ``func`` took.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with ``func``'s metadata that calls ``func``, prints
        ``"Function <name> took <seconds> seconds to run."`` and returns
        ``func``'s result. If ``func`` raises, the exception propagates and
        nothing is printed.
    """
    @functools.wraps(func)  # keep __name__/__doc__, consistent with log_execution
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() differences can.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Function {func.__name__} took {elapsed} seconds to run.")
        return result
    return wrapper

def seed_everything(seed=Config.SEED):
    """Seed every random number generator this notebook touches.

    Args:
        seed: Seed value applied to all generators. The default is read from
            ``Config.SEED`` once, when this function is defined.
    """
    # Interpreter-level sources of randomness.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Array and tensor libraries.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    tf.random.set_seed(seed)
    
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

Device: cuda


In [3]:
class Sentiment140Dataset(Dataset):
    """Map-style dataset over the Sentiment140 CSV, yielding tokenized tweets.

    The CSV at ``Config.PATH`` is read and de-duplicated on construction. Text
    cleaning is deferred: it runs once, on the first ``__getitem__`` call, and
    is slow on the full file (see the timings noted in ``deep_clean``).

    Each item is a dict with keys ``text``, ``input_ids``, ``attention_mask``
    and ``labels`` (a float scalar tensor, suitable for ``BCEWithLogitsLoss``).
    """

    def __init__(self):
        self.path = Config.PATH
        self.tokenizer = Config.tokenizer
        self.cleaned = False  # set by apply_cleaning() once the one-off clean has run
        self.load_data()

    @log_execution
    def load_data(self):
        """Read the CSV, remap the targets, drop duplicate tweets and build X / y.

        After this call ``self.data`` holds the de-duplicated frame and
        ``self.X`` / ``self.y`` are plain Python lists of texts and targets.
        """
        self.data = pd.read_csv(
            self.path,
            header=None,
            names=['targets', 'ids', 'date', 'flag', 'user', 'text'],
            encoding='latin-1',
        )
        # Target value 4 becomes 1; every other target value is left as-is.
        self.data.targets = self.data.targets.replace({4: 1})
        self.check_for_dups()
        # Lists were kept after benchmarking against Series / NumPy (see deep_clean).
        self.X, self.y = self.data.text.tolist(), self.data.targets.tolist()

    @timing_decorator
    def deep_clean(self):
        """Run ``TextPreprocessor.preprocess_text`` over every text in ``self.X``.

        Timings recorded by the author for the three container types tried:
            list:   370.07 s
            Series: 372.03 s
            NumPy:  371.68 s
        """
        # TODO (from the original notes): add stop-word removal.
        self.X = list(map(TextPreprocessor.preprocess_text, self.X))

    def apply_cleaning(self):
        """Clean the texts exactly once; subsequent calls are no-ops."""
        if not self.cleaned:
            self.deep_clean()
            print("Done cleaning data")
            self.cleaned = True

    def find_max_len(self):
        """Store and print the character length of the longest raw text in ``self.data``."""
        self.max_len = self.data['text'].str.len().max()
        print("Maximum Length: ", self.max_len)

    def check_targets(self):
        """Print how many rows carry each target value."""
        print("Target value counts:", self.data.targets.value_counts())

    def check_for_dups(self):
        """Drop rows whose ``text`` repeats an earlier row (first occurrence is kept)."""
        if self.data.text.duplicated().any():
            # Reassign rather than mutate in place so the operation is explicit.
            self.data = self.data.drop_duplicates('text')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        """Return the tokenized example at position ``i``.

        The first call triggers the one-off text cleaning (see ``apply_cleaning``).
        """
        self.apply_cleaning()
        X, y = self.X[i], self.y[i]
        encoding = self.tokenizer(
            X,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation='longest_first',
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch axis of size 1; drop it so the
        # DataLoader can stack examples into a batch itself.
        input_ids = encoding["input_ids"][0]
        attention_mask = encoding["attention_mask"][0]
        labels = torch.tensor(y, dtype=torch.float)
        return {'text': X,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
                }

In [55]:
def prepare_loaders():
    """Build train / validation DataLoaders over one shared Sentiment140Dataset.

    The dataset indices are shuffled with NumPy after re-seeding, the first
    ``Config.test_ratio`` fraction is held out for validation and the rest is
    used for training. Both loaders draw from the same dataset object through a
    ``SubsetRandomSampler``.

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    ds = Sentiment140Dataset()
    seed_everything()
    n_examples = len(ds)
    indices = list(range(n_examples))
    n_held_out = int(np.floor(Config.test_ratio * n_examples))
    # Re-seed immediately before shuffling so the split is reproducible.
    seed_everything()
    np.random.shuffle(indices)
    held_out, kept = indices[:n_held_out], indices[n_held_out:]

    # One sampler per split, one loader per sampler.
    loader_kwargs = {'batch_size': Config.BATCH_SIZE}
    train_loader = DataLoader(ds, sampler=SubsetRandomSampler(kept), **loader_kwargs)
    val_loader = DataLoader(ds, sampler=SubsetRandomSampler(held_out), **loader_kwargs)
    return train_loader, val_loader

# Builds the loaders at notebook level (this reads the whole CSV).
# NOTE(review): nothing in the cells shown reads train_loader / val_loader
# afterwards — RoBERTaModel builds its own loaders in its constructor — so this
# call looks redundant; confirm before removing.
train_loader, val_loader = prepare_loaders()

INFO:root:Executing load_data
INFO:root:Finished executing load_data


In [7]:
class RoBERTaModel(pl.LightningModule):
    """RoBERTa encoder with a single-logit head for binary sentiment classification.

    ``forward`` returns one raw logit per example (shape ``(batch, 1)``); the
    loss is ``BCEWithLogitsLoss`` and a label is obtained by rounding the
    sigmoid of the logit.

    NOTE(review): the constructor builds the data loaders, so every
    instantiation — including ``load_from_checkpoint`` — reads the full CSV.
    """

    def __init__(self) -> None:
        super().__init__()
        self.prepare_loaders()
        self.roberta = RobertaModel.from_pretrained(Config.roberta_model)
        # NOTE(review): constructed but not applied in forward().
        self.dropout = nn.Dropout(p=0.2)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def prepare_loaders(self):
        """Create ``self.train_loader`` / ``self.val_loader`` over one shared dataset.

        Indices are shuffled after re-seeding; the first ``Config.test_ratio``
        fraction is used for validation and the remainder for training.
        """
        ds = Sentiment140Dataset()
        seed_everything()
        dataset_size = len(ds)
        indices = list(range(dataset_size))
        split = int(np.floor(Config.test_ratio * dataset_size))
        seed_everything()
        np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]

        train_sampler = SubsetRandomSampler(train_indices)
        val_sampler = SubsetRandomSampler(val_indices)

        self.train_loader = DataLoader(ds, batch_size=Config.BATCH_SIZE, sampler=train_sampler)
        self.val_loader = DataLoader(ds, batch_size=Config.BATCH_SIZE, sampler=val_sampler)

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def forward(self, input_ids, attention_mask) -> torch.Tensor:
        """Return one raw logit per example.

        Args:
            input_ids: Token ids with a leading batch axis.
            attention_mask: Mask matching ``input_ids``.
        """
        output = self.roberta(input_ids=input_ids,
                              attention_mask=attention_mask)
        pooled_output = output.pooler_output
        return self.classifier(pooled_output)

    def accuracy(self, preds, labels):
        """Compute binary accuracy from raw logits ``preds`` and 0/1 ``labels``."""
        rounded_preds = torch.round(torch.sigmoid(preds))
        acc = (rounded_preds == labels).float().mean()
        return acc

    def _loss_and_accuracy(self, batch):
        """Run the model on ``batch`` and return ``(loss, accuracy)``."""
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        outputs = self(input_ids, attention_mask)
        # Flatten both sides so the (batch, 1) logits line up with (batch,) labels.
        loss = self.loss_fn(outputs.view(-1), labels.view(-1))
        acc = self.accuracy(outputs.view(-1), labels.view(-1))
        return loss, acc

    def training_step(self, batch, batch_idx):
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        return {"loss": loss,
                "acc": acc}

    def validation_step(self, batch, batch_idx):
        loss, acc = self._loss_and_accuracy(batch)
        # Callbacks that monitor the validation loss must use the key "valid_loss".
        self.log("valid_loss", loss)
        self.log('valid_acc', acc, prog_bar=True)
        return {"loss": loss,
                "acc": acc}

    def configure_optimizers(self):
        """Return AdamW with weight decay disabled for biases and LayerNorm parameters."""
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        # transformers.AdamW is deprecated in favour of the PyTorch implementation.
        # eps=1e-6 keeps the epsilon that transformers.AdamW used by default.
        return torch.optim.AdamW(optimizer_parameters, lr=Config.LR, eps=1e-6)

    def predict(self, text):
        """Classify a single string and return its label as an int.

        Args:
            text: The raw text to classify.

        Returns:
            ``1`` when the rounded sigmoid of the model's logit is 1, else ``0``
            (the same rule ``accuracy`` uses).
        """
        encoded_text = Config.tokenizer(
            text,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation='longest_first',
            return_tensors="pt",
        )
        # Keep the leading batch axis (the encoder expects batched input) and put
        # the tensors on the same device as the model weights.
        input_ids = encoded_text['input_ids'].to(self.device)
        attention_mask = encoded_text['attention_mask'].to(self.device)

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logit = self(input_ids, attention_mask).view(-1)
        finally:
            # Leave the module in whatever mode the caller had it in.
            self.train(was_training)

        # forward() returns a plain tensor holding ONE logit per example, so a
        # softmax/argmax over it would always yield 0; threshold the sigmoid instead.
        return int(torch.round(torch.sigmoid(logit)).item())

In [8]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Early stopping on the validation loss. The monitored key has to match the name
# logged in RoBERTaModel.validation_step ("valid_loss"); "val_loss" is never logged.
# This callback is defined but, as before, not passed to the Trainer below.
early_stop_callback = EarlyStopping(
    monitor='valid_loss',
    min_delta=0.00,
    patience=2,
    verbose=False,
    mode='min'
)

# NOTE(review): the filename template references `val_loss`, which is not a logged
# metric (the model logs `valid_loss`) — presumably why the checkpoint loaded in
# later cells is named "...val_loss=0.00.ckpt". The template is left unchanged
# because those cells load that exact file name; change both together.
checkpoint_callback = ModelCheckpoint(
    monitor='valid_loss',
    dirpath='checkpoints',
    filename='model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)

torch.set_float32_matmul_precision('medium')

model = RoBERTaModel()
trainer = pl.Trainer(accelerator='gpu',
                     max_epochs=1,
                     callbacks=[checkpoint_callback]
)
trainer.fit(model)
best_model_path = checkpoint_callback.best_model_path
# load_from_checkpoint is a classmethod that returns a NEW model; call it on the
# class rather than on the already-trained instance.
best_model = RoBERTaModel.load_from_checkpoint(best_model_path)
torch.save(best_model.state_dict(), 'best_model.pt')

INFO:root:Executing load_data
INFO:root:Finished executing load_data
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type            

Sanity Checking: 0it [00:00, ?it/s]

Function deep_clean took 582.6112864017487 seconds to run.
Done cleaning data


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
INFO:root:Executing load_data
INFO:root:Finished executing load_data
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: not enough values to unpack (expected 2, got 1)

In [21]:
# load_from_checkpoint is a classmethod that builds a fresh model; calling it on
# the class (not on the `model` instance) also removes the need for the training
# cell to have been run first.
pred_model = RoBERTaModel.load_from_checkpoint('checkpoints/model-epoch=00-val_loss=0.00.ckpt')

INFO:root:Executing load_data
INFO:root:Finished executing load_data
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
def replace_sentiment(pred):
    """Translate a predicted class id into its display name.

    Returns "Negative" for 0, "Positive" for 1 and "Unknown" for anything else.
    """
    if pred == 0:
        return "Negative"
    return "Positive" if pred == 1 else "Unknown"

In [60]:
# Tokenize one example sentence exactly as the dataset does (fixed-length padding).
encoded_text = Config.tokenizer(
    "I hate you",
    add_special_tokens=True,
    max_length=Config.MAX_LEN,
    # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
    padding="max_length",
    truncation='longest_first',
    return_tensors="pt",
)
# Keep the leading batch axis of size 1: the model is called on batched input.
input_ids = encoded_text["input_ids"]
attention_mask = encoded_text["attention_mask"]
# NOTE(review): this rebinds pred_model to the in-memory `model` (switched to eval
# mode), replacing any model previously loaded from a checkpoint under that name.
pred_model = model.eval()

In [61]:

# Inference only: no gradients needed, and inputs must sit on the model's device.
with torch.no_grad():
    output = pred_model(input_ids.to(pred_model.device), attention_mask.to(pred_model.device))
# The model emits ONE logit per example, so argmax over it is always 0 (i.e. always
# "Negative"). Threshold the sigmoid instead, as RoBERTaModel.accuracy does.
pred = int(torch.round(torch.sigmoid(output.view(-1))).item())
sentiment = replace_sentiment(pred)
sentiment

Passed output


'Negative'

In [62]:
# load_from_checkpoint is a classmethod that builds a fresh model; call it on the
# class rather than on the `model` instance.
pred_model = RoBERTaModel.load_from_checkpoint('checkpoints/model-epoch=00-val_loss=0.00.ckpt')
# NOTE(review): same logic as the replace_sentiment defined in an earlier cell;
# this redefinition shadows it.
def replace_sentiment(pred):
    """Translate a predicted class id into its display name.

    Returns "Negative" for 0, "Positive" for 1 and "Unknown" for anything else.
    """
    if pred == 0:
        return "Negative"
    return "Positive" if pred == 1 else "Unknown"

def predict(model, text):
    """Classify one piece of text with ``model`` and return its sentiment name.

    Args:
        model: A module whose forward pass takes ``(input_ids, attention_mask)``
            and returns a single logit per example (as RoBERTaModel does).
        text: The raw text to classify.

    Returns:
        ``"Positive"`` or ``"Negative"`` (via ``replace_sentiment``).

    Side effects:
        Puts ``model`` into eval mode.
    """
    encoded_text = Config.tokenizer(
        text,
        add_special_tokens=True,
        max_length=Config.MAX_LEN,
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
        truncation='longest_first',
        return_tensors="pt",
    )
    # Follow the model's own device instead of assuming 'cuda', so this also
    # works when the model lives on the CPU.
    target_device = next(model.parameters()).device
    input_ids = encoded_text["input_ids"].to(target_device)
    attention_mask = encoded_text["attention_mask"].to(target_device)

    model.eval()
    with torch.no_grad():
        logit = model(input_ids, attention_mask).view(-1)

    # One logit per example: argmax over it is always 0 ("Negative" for every
    # input). Threshold the sigmoid instead, matching RoBERTaModel.accuracy.
    pred = int(torch.round(torch.sigmoid(logit)).item())
    return replace_sentiment(pred)

y_pred = predict(pred_model, "I hate you")
print(y_pred)

INFO:root:Executing load_data
INFO:root:Finished executing load_data
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entered no Grad
Passed output
Negative
