In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tokenizers
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
MODEL = "FacebookAI/roberta-base"

# df = df[1:10]
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
## Test Dataset
class SarcasmTestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item
    def __len__(self):
        return len(self.encodings)
    
    
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    f1 = f1_score(labels, pred)

    return {"accuracy": accuracy,"f1_score":f1}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, add_prefix_space=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
df = pd.read_csv("data/isarcasm2022.csv")
df = df[['tweet', 'sarcastic']]
df.dropna(inplace=True)
    
x_train, x_val, y_train, y_val = train_test_split(df['tweet'], df['sarcastic'], test_size=0.2, random_state=34)

x_train = x_train.values.tolist()
y_train = y_train.values.tolist() 
x_val = x_val.values.tolist()
y_val = y_val.values.tolist()

In [34]:
len(x_train)

2773

In [None]:
train_encodings = tokenizer(text=x_train, truncation=True, padding=True, is_split_into_words=True, return_tensors='pt')

{'input_ids': tensor([[    0,   787,   100, 29631,  9335, 28908,  3006,  1952,    33,    10,
           169,     9,   442,    82,  1032,   101,  7105,     7,    28,  2638,
             8,  5324,    30,   106, 50121, 50118, 50121, 50118,   243,    18,
           761,     9,  3444,     6,    11,    10, 42647,    62,   169,   787,
         20780, 41925, 10494, 16911,    85,   269, 19230,   162,    77,  2948,
         16511,  1115,     8, 18241,   239,   462, 38361,   130,    50,   237,
           498,   228,  1040,    77, 10032,   111, 11253,    86,  1052,   328,
           849, 33939,    38,  2813,   939,  2638,   127,   809,   787,  3952,
           293, 12010,   787, 22026, 28612,  2596,  3303,  5314,    19,    70,
            14,    37,    26,    19,     5,  1198,  9578,   139,    14,  6321,
             8,  3701,   989,   137,     5,    94,     9,     5,  3517,   314,
         17220,  6025,    21, 15478,    18,  1307,  5849,    20,  2105,    12,
         21693,   841,    11,    70,  