In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.nn import Embedding
import numpy as np
from torch.utils.data import DataLoader

class PersuasionStrategyDataset(Dataset):
    def __init__(
            self, data, tokenizer):
        self.data = data.reset_index(drop=True)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.embedding = Embedding(2,1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
      row = self.data.iloc[idx]
      text = row.text
      label = torch.tensor(row.binary)
      return {
          'label': F.one_hot(label, num_classes=len(self.data.binary.unique())),
          'inputs' : self.tokenizer.encode_plus(
              text,
              max_length=64,
              padding='max_length',
              return_tensors='pt',
              truncation=True,
              return_attention_mask=True
          )
      }


In [2]:
from torch import nn
from transformers import RobertaModel
from torch.nn import functional as F

class RobertaClassifier(nn.Module):
    def __init__(self, tokeniser, num_classes):
        super(RobertaClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained(tokeniser)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = self.dropout(outputs.pooler_output)
        return self.fc(x)

def init_roberta(tokeniser, classification_type, class_weights=None):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if classification_type == 'binary':
        criterion = nn.BCEWithLogitsLoss()
        num_classes = 2
    else:
        num_classes = 8
        criterion = nn.CrossEntropyLoss()

    model = RobertaClassifier(tokeniser, num_classes)

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    return model, criterion, optimizer, device

In [3]:
from collections import Counter
from torch.utils.data import WeightedRandomSampler

def build_sampler(training_dataset):
  labels = training_dataset.data.binary # Assuming your dataset returns (input_vector, label) pairs
  class_distribution = Counter(labels)
  class_weights = {class_label: len(training_dataset) / (len(class_distribution) * class_count) for class_label, class_count in class_distribution.items()}
  weights = [class_weights[label] for label in labels]
  weights_tensor = torch.tensor(weights, dtype=torch.float)
  sampler = WeightedRandomSampler(weights_tensor, len(weights_tensor))
  return sampler


In [5]:
train_df = pd.read_csv('/content/train_df.csv').drop_duplicates(subset='text')
test_df = pd.read_csv('/content/test_df.csv').drop_duplicates(subset='text')

print(train_df.binary.value_counts())

# augmented_df = pd.read_csv('/content/augmented_df_3-EMOTION.csv').drop_duplicates(subset='text')
# augmented_df = augmented_df[~augmented_df['text'].isin(train_df['text'])]
# augmented_df = augmented_df.rename(columns={'label' : 'binary'})
# augmented_df['binary'] = augmented_df['binary'].apply(lambda x: 1 if x == '3-EMOTION' else 0)

train_df=train_df.rename(columns={'multiclass': 'binary'})
test_df=test_df.rename(columns={'multiclass': 'binary'})

# train_df = pd.concat([train_df, augmented_df])
print(train_df.binary.value_counts())
# pos = train_df[train_df.binary == 1]
# neg = train_df[train_df.binary == 0]

# train_df = pd.concat([pos, pos.sample(n=round(len(pos)*0.5)), neg.sample(n=round(len(neg)*0.7), random_state=42)])

binary
0    10756
1      419
Name: count, dtype: int64
binary
0    10756
1      419
Name: count, dtype: int64


In [6]:
tokenizer = 'roberta-base'
train_dataset = PersuasionStrategyDataset(train_df, tokenizer)
test_dataset = PersuasionStrategyDataset(test_df, tokenizer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
sampler = build_sampler(train_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size = 64,
    shuffle=sampler
)

testing_dataloader = DataLoader(
    test_dataset,
    batch_size=1
)


In [8]:
train_df.binary.value_counts()

Unnamed: 0_level_0,count
binary,Unnamed: 1_level_1
0,10756
1,419


In [9]:
from torch.optim import lr_scheduler
from tqdm.auto import tqdm
from transformers import RobertaForSequenceClassification, AdamW
from transformers import get_scheduler

def init_model(tokenizer):
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss for binary classification

  model = RobertaForSequenceClassification.from_pretrained(tokenizer, num_labels=2)
  # model = BertClassifier(tokenizer, 1)
  model.to(device)

  optimizer = AdamW(model.parameters(), lr=5e-5)
  return model, criterion, optimizer, device

In [10]:
def train_model(model, criterion, optimizer, device, train_dataloader, num_epochs):
  num_training_steps = num_epochs * len(train_dataloader)

  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  model.to(device)

  model.train()
  with tqdm(range(num_epochs)) as t1:
    epoch_loss = 0
    for epoch in range(1, num_epochs+1):
        t1.set_description('Training Epoch: ' + str(epoch))
        t1.set_postfix(loss=epoch_loss)
        with tqdm(range(len(train_dataloader))) as t2:
          batch_loss = 0
          for batch_num, batch in enumerate(train_dataloader):
              batch_num+=1
              t2.set_description('Batch: ' + str(batch_num))
              t2.set_postfix(loss=batch_loss)
              batch = {k: v.to(device) for k, v in batch.items()}
              outputs = model(batch['inputs']['input_ids'].squeeze(1),
                              batch['inputs']['attention_mask'].squeeze(1))
              loss = criterion(outputs, batch['label'].float())
              loss.backward()

              optimizer.step()
              lr_scheduler.step()
              optimizer.zero_grad()

              batch_loss += round(loss.item()/batch_num,4)
              t2.update(1)
        epoch_loss = round(batch_loss/len(train_dataloader),4)
        t1.update(1)
  return model


In [11]:
def eval_model(model, test_dataloader):
  model.eval()

  true_labels = []
  preds = []
  for batch in tqdm(testing_dataloader):
      batch = {k: v.to(device) for k, v in batch.items()}
      true_labels.append(batch['label'].argmax().cpu())
      with torch.no_grad():
          outputs = model(batch['inputs']['input_ids'].squeeze(1),
                          batch['inputs']['attention_mask'].squeeze(1))


      logits = outputs
      predictions = torch.argmax(logits, dim=-1).cpu()
      preds.append(predictions)
  return true_labels, preds



In [12]:
num_epochs = 10
model, criterion, optimizer, device = init_roberta(tokenizer, 'binary')
model = train_model(model, criterion, optimizer, device, train_dataloader, num_epochs)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

In [13]:
true_labels, preds = eval_model(model, testing_dataloader)

  0%|          | 0/2857 [00:00<?, ?it/s]

In [14]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score
accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds,average='weighted', zero_division=True) # Assuming multilabel
recall = recall_score(true_labels, preds, average='weighted',zero_division=True)  # Assuming multilabel
f1 = f1_score(true_labels, preds, average='weighted',zero_division=True)  # Assuming multilabel

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9656982849142457
Precision: 0.9692845416576817
Recall: 0.9656982849142457
F1 Score: 0.9673856900963084


In [18]:
report = classification_report(
    true_labels, preds,
    zero_division=True,
    output_dict=True)  # label_names is a list of class names

report = pd.DataFrame(report)
print(report)

                     0          1  accuracy    macro avg  weighted avg
precision     0.985199   0.344828  0.965698     0.665013      0.969285
recall        0.979541   0.422535  0.965698     0.701038      0.965698
f1-score      0.982361   0.379747  0.965698     0.681054      0.967386
support    2786.000000  71.000000  0.965698  2857.000000   2857.000000


In [16]:
def output_results(model, train_dataloader, test_dataloader, output_folder):
  model.eval()
  model.save_pretrained(output_folder)


In [17]:
# model.save_pretrained('emotion_persuasion_strategy_model_binary')
report.to_csv('/content/authority_final_binary_augmented.csv', index=False)

In [None]:
# prompt: script for zipping a folder

import zipfile
import os

def zip_folder(folder_path, output_path):
    """Zips a folder and its contents.

    Args:
        folder_path: The path to the folder to zip.
        output_path: The path to the output zip file.
    """
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, arcname=os.path.relpath(file_path, folder_path))

# Example usage:
zip_folder('/content/persuasion_strategy_model', '/content/persuasion_strategy_model.zip')


In [None]:
torch.save(model, 'multiclass_model.pth')

In [None]:
import torch

# Save model state_dict
torch.save(model.state_dict(), 'multiclass_model_state_dict.pth')