In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Hugging Face checkpoint used for both the tokenizer and the classifier.
model_name = 'FacebookAI/roberta-base'

# `num_labels` configures the classification head, so it belongs to the model
# only; the tokenizer has no use for it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Four output classes, matching the label ids 0-3 assigned by Labeling().
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

# Use the GPU when one is visible, otherwise fall back to the CPU. Kept as a
# plain string so every later `.to(device)` call works unchanged.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
import pandas as pd

# Locations of the three tweet splits inside the Kaggle input directory.
train_url = '/kaggle/input/exist-2024-tweets-dataset/EXIST 2024 Tweets Dataset/training/EXIST2024_training.json'
val_url = '/kaggle/input/exist-2024-tweets-dataset/EXIST 2024 Tweets Dataset/dev/EXIST2024_dev.json'
test_url = '/kaggle/input/exist-2024-tweets-dataset/EXIST 2024 Tweets Dataset/test/EXIST2023_test_clean.json'

# Read each split and flip rows/columns in one step.
# NOTE(review): presumably the JSON is keyed by tweet id, so the transpose
# yields one row per tweet - confirm against the raw files.
df_train = pd.read_json(train_url).transpose()
df_val = pd.read_json(val_url).transpose()
df_test = pd.read_json(test_url).transpose()

# The merge key must be an integer on the tweet side before joining.
df_train = df_train.astype({'id_EXIST': int})
df_val = df_val.astype({'id_EXIST': int})

# Gold annotations for the labelled splits (the test split has none loaded here).
train_labels = pd.read_json('/kaggle/input/exist-2024-tweets-dataset/golds/EXIST2024_training_task2_gold_hard.json')
val_labels = pd.read_json('/kaggle/input/exist-2024-tweets-dataset/golds/EXIST2024_dev_task2_gold_hard.json')

# Inner join: only tweets that have a gold annotation are kept.
df_train = df_train.merge(train_labels, left_on='id_EXIST', right_on='id', how='inner')
df_val = df_val.merge(val_labels, left_on='id_EXIST', right_on='id', how='inner')

# Store the gold value as text so it can be compared against class names later.
df_train = df_train.astype({'value': str})
df_val = df_val.astype({'value': str})

df_train.head()

In [None]:
# Create the integer 'Label' column on both labelled splits, initialised to 0.
# Labeling() (defined below) then overwrites it row by row from 'value'.
df_train['Label'] = 0
df_val['Label'] = 0

def Labeling(df):
    """Encode the textual ``value`` column as an integer ``Label`` column.

    Mapping: ``'DIRECT'`` -> 0, ``'REPORTED'`` -> 1, ``'JUDGEMENTAL'`` -> 2,
    and every other value (including missing ones) -> 3.

    Args:
        df: DataFrame with a ``value`` column. It is modified in place; the
            ``Label`` column is created if absent and overwritten otherwise.

    Returns:
        The same DataFrame object, with ``Label`` filled in as int64.
    """
    label_ids = {'DIRECT': 0, 'REPORTED': 1, 'JUDGEMENTAL': 2}
    # Vectorized lookup instead of a per-row iterrows()/df.at loop: values not
    # present in the mapping come back as NaN and are sent to the catch-all 3.
    encoded = df['value'].map(label_ids).fillna(3).astype('int64')
    # Assign positionally (plain array) so rows are labelled one-to-one even
    # when the index contains duplicate labels.
    df['Label'] = encoded.to_numpy()
    return df


# Fill in 'Label' on both labelled splits from their 'value' column.
# Labeling() edits the frame it receives and returns that same frame.
df_train = Labeling(df_train)
df_val= Labeling(df_val)

In [None]:
data_no = 5

# Extract tweet text and integer class ids from each labelled split as plain
# Python lists, ready for tokenization and tensor conversion.
train_texts, train_labels = df_train['tweet'].tolist(), df_train['Label'].tolist()
val_texts, val_labels = df_val['tweet'].tolist(), df_val['Label'].tolist()

In [None]:
# Encode every training tweet to a fixed length of 128 tokens: longer inputs
# are truncated, shorter ones padded, and the result is returned as tensors.
train_encodings = tokenizer(
    train_texts,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt',
)

# Class ids as a tensor so they can sit next to the encodings in the dataset.
train_labels = torch.tensor(train_labels)

# Each dataset item is the triple (input_ids, attention_mask, label).
train_dataset = torch.utils.data.TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)

# Shuffled mini-batches of 32 examples for training.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=32)

# Put the model on the device the batches will be sent to.
model = model.to(device)

In [None]:
# from tqdm.notebook import tqdm

# for batch in tqdm(train_loader):

#     print(f"batch_0 {batch[0]} batch_1 {batch[1]} batch_2 {batch[2]}")

In [None]:
from tqdm.notebook import tqdm
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import accuracy_score

# Fine-tune the classifier on `train_loader`, tracking mean loss and training
# accuracy per epoch, then save the resulting weights to 'model.pth'.

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Not used in the loop below: the model returns its own loss when `labels`
# is passed to it. Kept so the name stays defined for any other cell.
loss_fn = torch.nn.CrossEntropyLoss()

losses = []      # mean training loss, one entry per epoch
accuracies = []  # training accuracy, one entry per epoch
num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    # Re-assert training mode every epoch; an evaluation cell run earlier in
    # the session would otherwise leave the model in eval mode.
    model.train()

    running_loss = 0.0
    predicted_labels = []  # predictions accumulated over this epoch
    true_labels = []       # matching ground-truth labels

    for batch in tqdm(train_loader):
        # Batches are (input_ids, attention_mask, labels) triples.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        # Apply the gradients. Without this call the weights never change and
        # the checkpoint saved below is just the untrained model.
        optimizer.step()

        running_loss += loss.item()

        # Highest-scoring class per example.
        _, predicted = torch.max(logits, dim=1)
        predicted_labels.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

    epoch_loss = running_loss / len(train_loader)
    losses.append(epoch_loss)

    accuracy = accuracy_score(true_labels, predicted_labels)
    accuracies.append(accuracy)

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {accuracy:.4f}')

# Persist the fine-tuned weights.
torch.save(model.state_dict(), 'model.pth')

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def predict_labels(text):
    """Classify one text with the notebook's global model and tokenizer.

    The text is encoded to 128 tokens (truncated / padded), run through the
    model in evaluation mode without gradient tracking, and the logits are
    turned into probabilities with a softmax over the class dimension.

    Args:
        text: Input passed to the tokenizer. Both results are read with
            ``.item()``, so the encoded batch must contain exactly one row
            (presumably a single string - confirm against callers).

    Returns:
        A tuple ``(predicted_class, prob_class_1)``: the index of the most
        probable class and the probability assigned to class index 1.
    """
    encoded = tokenizer(text, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Evaluation mode, and no autograd bookkeeping during the forward pass.
    model.eval()
    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits

    class_probs = torch.softmax(logits, dim=1)
    best_class = torch.argmax(class_probs, dim=1)

    return best_class.item(), class_probs[:, 1].item()
# Score every validation tweet one at a time; each call returns the pair
# (predicted class, probability of class 1).
predictions = [predict_labels(text) for text in tqdm(val_texts)]
predicted_labels = [label for label, _ in predictions]
predicted_probs = [prob for _, prob in predictions]

# Overall accuracy on the validation split.
accuracy = accuracy_score(val_labels, predicted_labels)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1, headed by the run configuration.
from sklearn.metrics import roc_auc_score, classification_report
print(model_name + f"    epoch = {num_epochs}")
print('\nThe Classification Report is as follows\n')
print(classification_report(val_labels, predicted_labels, digits = 4))