<a href="https://colab.research.google.com/github/roykallaye/TyDiP-for-Colab/blob/main/TyDiP_regressor_colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy
!pip install torch==1.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
!pip install pytorch-lightning==1.5.0
!pip install scikit-learn
!pip install transformers
!pip install pandas
!pip install polyglot
!pip install pyicu
!pip install pycld2
!pip install morfessor
!pip install datasets

Looking in links: https://download.pytorch.org/whl/torch_stable.html
[31mERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu111 (from versions: 1.11.0, 1.11.0+cpu, 1.11.0+cu102, 1.11.0+cu113, 1.11.0+cu115, 1.11.0+rocm4.3.1, 1.11.0+rocm4.5.2, 1.12.0, 1.12.0+cpu, 1.12.0+cu102, 1.12.0+cu113, 1.12.0+cu116, 1.12.0+rocm5.0, 1.12.0+rocm5.1.1, 1.12.1, 1.12.1+cpu, 1.12.1+cu102, 1.12.1+cu113, 1.12.1+cu116, 1.12.1+rocm5.0, 1.12.1+rocm5.1.1, 1.13.0, 1.13.0+cpu, 1.13.0+cu116, 1.13.0+cu117, 1.13.0+cu117.with.pypi.cudnn, 1.13.0+rocm5.1.1, 1.13.0+rocm5.2, 1.13.1, 1.13.1+cpu, 1.13.1+cu116, 1.13.1+cu117, 1.13.1+cu117.with.pypi.cudnn, 1.13.1+rocm5.1.1, 1.13.1+rocm5.2, 2.0.0, 2.0.0+cpu, 2.0.0+cpu.cxx11.abi, 2.0.0+cu117, 2.0.0+cu117.with.pypi.cudnn, 2.0.0+cu118, 2.0.0+rocm5.3, 2.0.0+rocm5.4.2, 2.0.1, 2.0.1+cpu, 2.0.1+cpu.cxx11.abi, 2.0.1+cu117, 2.0.1+cu117.with.pypi.cudnn, 2.0.1+cu118, 2.0.1+rocm5.3, 2.0.1+rocm5.4.2, 2.1.0, 2.1.0+cpu, 2.1.0+cpu.cxx11.abi, 2.1.0+cu118, 2.1.0+cu12

In [None]:
import argparse
from typing import List, Dict
from google.colab import drive
drive.mount('/content/drive')

import os

os.chdir('/content/drive/MyDrive/Colab Notebooks/TyDiP')

import sys
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import seed_everything
import sklearn.metrics
import sklearn.model_selection
import torch
import torch.optim
import torch.utils.data
import transformers
import pandas as pd
import random

Mounted at /content/drive


  from torch.distributed.optim import DistributedOptimizer


In [None]:
try:
    from polyglot.text import Text
except:
    print("polyglot not installed. Cannot use --strategy_words")

In [None]:
class MyDataModule(pl.LightningDataModule):
    def __init__(self, train_file, test_file, binary, tokenizer, max_length, batch_size, strategy_words_replacement_negate=False, strategy_words=None, random_masking_ratio=None):
        super().__init__()
        self.train_file = train_file
        self.test_file = test_file
        self.binary = binary
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = tokenizer

        if strategy_words:
            self.strategy_words = pd.read_csv(strategy_words)
            self.strategy_words = set(list(self.strategy_words.values[:, 1:].reshape(-1)))
        else:
            self.strategy_words = None
        self.strategy_words_replacement_negate = strategy_words_replacement_negate
        self.random_masking_ratio = random_masking_ratio

    @staticmethod
    def read_file(file_name, text_only=False):
        if file_name.split(".")[-1] == "csv":
            df = pd.read_csv(file_name)
            data = [(a, b) for a, b in zip(list(df['sentence']), df['score'])]
            if text_only:
                data = [t[0] for t in data]
        else:
            data = open(file_name).read().strip().split('\n')
        return data

    def setup(self, stage=None):
        if self.train_file:
            self.train_data = MyDataModule.read_file(self.train_file)
            self.train_data, self.val_data = sklearn.model_selection.train_test_split(self.train_data, shuffle=False, test_size=0.2)
        if self.test_file:
            self.test_data = MyDataModule.read_file(self.test_file)

    def prepare_dataloader(self, mode):
        if mode == "train":
            data = self.train_data
        elif mode == "val":
            data = self.val_data
        else:
            data = self.test_data

        # tokenized = self.tokenizer([t[0] for t in data], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
        tokenized = MyDataModule.tokenize([t[0] for t in data], self.tokenizer, self.max_length, self.strategy_words_replacement_negate, self.strategy_words, self.random_masking_ratio)
        if self.binary:
            labels = torch.tensor([t[1] > 0 for t in data], dtype=int)
        else:
            labels = torch.tensor([t[1] for t in data])

        if mode == "train":
            weights = torch.zeros_like(labels)
            weights[labels == 0] = labels.shape[0] - labels.sum()
            weights[labels == 1] = labels.sum()
            return torch.utils.data.DataLoader(torch.utils.data.TensorDataset(tokenized['input_ids'], tokenized['attention_mask'], labels), batch_size=self.batch_size, sampler=torch.utils.data.WeightedRandomSampler(1 / weights, len(weights), replacement=True))
        else:
            return torch.utils.data.DataLoader(torch.utils.data.TensorDataset(tokenized['input_ids'], tokenized['attention_mask'], labels), batch_size=self.batch_size)

    @staticmethod
    def tokenize(data: List[str], tokenizer, max_length, strategy_words_replacement_negate, strategy_words, random_masking_ratio):
        if strategy_words is not None or random_masking_ratio is not None:
            tokenized_data = []
            for sentence in data:
                words = Text(sentence).words
                words = [t.lower() for t in words]
                if strategy_words:
                    words = [t if ((t in strategy_words) != strategy_words_replacement_negate) else tokenizer.mask_token for t in words]
                elif random_masking_ratio:
                    words = [t if random.random() <= random_masking_ratio else tokenizer.mask_token for t in words]
                tokenized_data.append(' '.join(words))
            out = tokenizer(tokenized_data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
            # out['attention_mask'] = torch.tensor(out['input_ids'] != tokenizer.pad_token_id, dtype=int)
            return out
        else:
            return tokenizer(data, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

    def train_dataloader(self):
        return self.prepare_dataloader("train")
        # return torch.utils.data.DataLoader(MyDataModule.CustomDataset1(self.tokenizer, self.train_data, self.max_length), batch_size=self.batch_size)

    def test_dataloader(self):
        return self.prepare_dataloader("test")
        # return torch.utils.data.DataLoader(MyDataModule.CustomDataset1(self.tokenizer, self.test_data, self.max_length), batch_size=self.batch_size)

    def val_dataloader(self):
        return self.prepare_dataloader("val")
        # return torch.utils.data.DataLoader(MyDataModule.CustomDataset1(self.tokenizer, self.val_data, self.max_length), batch_size=self.batch_size)


# New section

In [None]:
class RegressionModel(pl.LightningModule):
    def __init__(self, pretrained_model, binary, learning_rate, num_warmup_steps, tokenizer):
        super(RegressionModel, self).__init__()
        self.save_hyperparameters()
        self.pretrained_model = pretrained_model
        self.binary = binary
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps
        self.tokenizer = tokenizer
        self.model = transformers.AutoModelForSequenceClassification.from_pretrained(self.pretrained_model, num_labels=2 if self.binary else 1)

    def forward(self, **kwargs):
        return self.model(**kwargs)

    def training_step(self, batch, batch_idx):
        outputs = self.forward(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs['loss']
        ret = {"loss": loss}
        if self.binary:
            acc = torch.tensor(batch[2] == torch.argmax(outputs['logits']), dtype=float).mean().item()
            ret["acc"] = acc
        else:
            rmse = (torch.mean((batch[2] - outputs['logits'])**2)**0.5).item()
            ret["rmse"] = rmse

        return {"loss": loss, "log": ret}

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = transformers.get_linear_schedule_with_warmup(optimizer, self.num_warmup_steps, len(self.trainer.datamodule.train_dataloader()) // self.trainer.accumulate_grad_batches)
        return [optimizer], [{"scheduler": scheduler}]

    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx, mode="test")

    def validation_step(self, batch, batch_idx, mode="val"):
        outputs = self.forward(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs['loss']
        self.log("{}_loss".format(mode), loss, prog_bar=True)

        ret = {"loss": loss}
        if self.binary:
            preds = torch.argmax(outputs['logits'], axis=1).tolist()
            gold = batch[2].tolist()
            ret["preds"] = preds
            ret["gold"] = gold
            # f1 = sklearn.metrics.f1_score(gold, preds)
            # acc = sklearn.metrics.accuracy_score(gold, preds)
            # ret["acc"] = acc
            # ret["f1"] = f1
            # self.log("{}_acc".format(mode), acc, prog_bar=True)
            # self.log("{}_f1".format(mode), f1, prog_bar=True)
        else:
            preds = outputs['logits'].tolist()
            gold = batch[2].tolist()
            ret['preds'] = preds
            ret['gold'] = gold
            # rmse = (torch.mean((batch[2] - outputs['logits'])**2)**0.5).item()
            # self.log("{}_rmse".format(mode), rmse, prog_bar=True)
            # ret["rmse"] = rmse

        return {"loss": loss, "log": ret}

    def validation_epoch_end(self, outputs, mode="val"):
        gold = []
        preds = []
        for batch in outputs:
            gold.extend(batch['log']['gold'])
            preds.extend(batch['log']['preds'])
        if self.binary:
            f1 = sklearn.metrics.f1_score(gold, preds)
            acc = sklearn.metrics.accuracy_score(gold, preds)
            self.log("{}_acc".format(mode), acc, prog_bar=True)
            self.log("{}_f1".format(mode), f1, prog_bar=True)
        else:
            rmse = (torch.mean((torch.tensor(gold) - torch.tensor(preds))**2)**0.5).item()
            self.log("{}_rmse".format(mode), rmse, prog_bar=True)

    def test_epoch_end(self, outputs):
        return self.validation_epoch_end(outputs, mode="test")

    def predict_step(self, batch, batch_idx):
        preds = self.forward(input_ids=batch[0], attention_mask=batch[1])
        if self.binary:
            ret = preds['logits'].tolist()
        else:
            ret = preds['logits'].view(-1).tolist()
        return ret

    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = parent_parser.add_argument_group("RegressionModel")
        parser.add_argument('--pretrained_model', type=str)
        parser.add_argument('--learning_rate', type=float, default="5e-6")
        parser.add_argument('--num_warmup_steps', type=float, default="0")
        return parent_parser

In [None]:
def main():
    # Argument Parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--test", action="store_true")
    parser.add_argument("--load_model", type=str)
    parser.add_argument("--train_file", type=str)
    parser.add_argument("--test_file", type=str)
    parser.add_argument("--binary", action="store_true")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--max_length", type=int, default=128)
    parser.add_argument("--model_save_location", type=str)
    parser.add_argument("--preds_save_location", type=str)
    parser.add_argument("--preds_save_logits", action="store_true")
    parser.add_argument("--strategy_words", type=str)
    parser.add_argument("--strategy_words_replacement_negate", action="store_true")
    parser.add_argument("--random_masking_ratio", type=float)

    # Add model-specific arguments and trainer-specific arguments
    parser = RegressionModel.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)

    # Use parse_known_args() to ignore any unrecognized arguments (such as those passed by Jupyter/Colab)
    args, unknown = parser.parse_known_args()

    # Print recognized arguments and unknown arguments for debugging purposes
    print("Recognized arguments:", args)
    print("Unknown arguments:", unknown)

    # Seed everything
    seed_everything(seed=args.seed)

    # Load model or initialize new model
    if args.load_model:
      # Load the tokenizer using the model name
      tokenizer = transformers.AutoTokenizer.from_pretrained(args.load_model)

      # Instantiate the RegressionModel with the necessary parameters
      model = RegressionModel(
          pretrained_model=args.load_model,
          binary=args.binary,
          learning_rate=args.learning_rate,
          num_warmup_steps=args.num_warmup_steps,
          tokenizer=tokenizer
      )

    else:
        # Set the pretrained model to 'xlm-roberta-large' as specified in the README
        pretrained_model = "Genius1237/xlm-roberta-large-tydip"
        tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model)
        model = RegressionModel(
            pretrained_model=pretrained_model,
            binary=True,
            learning_rate=args.learning_rate,
            num_warmup_steps=args.num_warmup_steps,
            tokenizer=tokenizer
        )

    # Initialize Trainer
    trainer = pl.Trainer.from_argparse_args(args)

    # Load dataset
    dataset = MyDataModule(
        train_file=args.train_file,
        test_file=args.test_file,
        binary=model.binary,
        max_length=args.max_length,
        batch_size=args.batch_size,
        tokenizer=tokenizer,
        strategy_words_replacement_negate=args.strategy_words_replacement_negate,
        strategy_words=args.strategy_words,
        random_masking_ratio=args.random_masking_ratio
    )
    dataset.setup()

    # Train model if --train argument is provided
    if args.train:
        trainer.fit(model, dataset)

    # Test model if --test argument is provided
    if args.test:
        trainer.test(model, dataset.test_dataloader())

    # Save predictions if --preds_save_location argument is provided
    if args.preds_save_location:
        data = MyDataModule.read_file(args.test_file, True)
        strategy_words = None
        if args.strategy_words:
            strategy_words = pd.read_csv(args.strategy_words)
            strategy_words = set(list(args.strategy_words.values[:, 1:].reshape(-1)))
        tokenized = MyDataModule.tokenize(data, tokenizer, args.max_length, args.strategy_words_replacement_negate, strategy_words, args.random_masking_ratio)
        input_data = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(tokenized['input_ids'], tokenized['attention_mask']),
            batch_size=args.batch_size
        )
        preds = trainer.predict(model, input_data, return_predictions=True)
        preds = [t for y in preds for t in y]
        preds = torch.tensor(preds)
        if model.binary:
            if args.preds_save_logits:
                preds = torch.softmax(preds, axis=1)[:, 1].tolist()
            else:
                preds = preds.argmax(axis=1).tolist()
        else:
            preds = preds.view(-1).tolist()
        preds = [str(t) for t in preds]

        with open(args.preds_save_location, 'w') as f:
            f.write('\n'.join(preds) + '\n')

    # Save model if --model_save_location argument is provided
    if args.model_save_location:
        trainer.save_checkpoint(args.model_save_location, weights_only=True)

SKIP 3 CELLS

In [None]:
sys.argv = [
    'politeness_regressor.py',  # Your script name, adjust if necessary
    #'--train',                   # Indicates that you want to train the model
    '--train_file', 'data/binary/en_train_binary.csv',  # Path to your training file
    '--test_file', 'data/binary/en_test_binary.csv',    # Path to your testing file
    '--model_save_location', 'model.pt',  # Path to save the trained model
    '--gpus', '1',               # Use 1 GPU for training
    '--batch_size', '4',         # Set your desired batch size
    '--max_epochs', '5',         # Number of training epochs
    '--learning_rate', '5e-6',   # Set learning rate
    '--checkpoint_callback', 'False',  # Disable checkpointing
    '--logger', 'False',         # Disable logging
    '--binary',                  # Set this if your model is binary classification
]

In [None]:
if __name__ == "__main__":
    main()

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Recognized arguments: Namespace(train=False, test=False, load_model=None, train_file='data/binary/en_train_binary.csv', test_file='data/binary/en_test_binary.csv', binary=True, seed=42, batch_size=4, max_length=128, model_save_location='model.pt', preds_save_location=None, preds_save_logits=False, strategy_words=None, strategy_words_replacement_negate=False, random_masking_ratio=None, pretrained_model=None, learning_rate=5e-06, num_warmup_steps=0.0, logger=False, checkpoint_callback=False, enable_checkpointing=True, default_root_dir=None, gradient_clip_val=None, gradient_clip_algorithm=None, process_position=0, num_nodes=1, num_processes=1, devices=None, gpus=1, auto_select_gpus=False, tpu_cores=None, ipus=None, log_gpu_memory=None, progress_bar_refresh_rate=None, enable_progress_bar=True, overfit_batches=0.0, track_grad_norm=-1, check_val_every_n_epoch=1, fast_dev_run=False, accumulate_grad_batches=None, max_epochs=5, min_epochs=None, max_steps=-1, min_steps=None, max_time=None, limit

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs


AttributeError: 'NoneType' object has no attribute 'state_dict'

In [None]:
pkl_file_path = 'archive/data.pkl'

with open(pkl_file_path, 'rb') as f:
    file_content = f.read()
    print(file_content[:500])  # Print the first 500 characters to inspect


b'\x80\x02}q\x00(X\x05\x00\x00\x00epochq\x01K\x05X\x0b\x00\x00\x00global_stepq\x02M\x8b\x07X\x19\x00\x00\x00pytorch-lightning_versionq\x03X\x05\x00\x00\x001.5.0q\x04X\n\x00\x00\x00state_dictq\x05ccollections\nOrderedDict\nq\x06)Rq\x07(X/\x00\x00\x00model.roberta.embeddings.word_embeddings.weightq\x08ctorch._utils\n_rebuild_tensor_v2\nq\t((X\x07\x00\x00\x00storageq\nctorch\nFloatStorage\nq\x0bX\x01\x00\x00\x000q\x0cX\x03\x00\x00\x00cpuq\rJ\x00HB\x0ftq\x0eQK\x00J\x92\xd0\x03\x00M\x00\x04\x86q\x0fM\x00\x04K\x01\x86q\x10\x89h\x06)Rq\x11tq\x12Rq\x13X3\x00\x00\x00model.roberta.embeddings.position_embeddings.weightq\x14h\t((h\nh\x0bX\x01\x00\x00\x001q\x15h\rJ\x00\x08\x08\x00tq\x16QK\x00M\x02\x02M\x00\x04\x86q\x17M\x00\x04K\x01\x86q\x18\x89h\x06)Rq\x19tq\x1aRq\x1bX5\x00\x00\x00model.roberta.embeddings.token_type_embeddings.weightq\x1ch'


RESUME RUNNING CELLS

In [None]:
from pytorch_lightning import LightningModule
from transformers import XLMRobertaForSequenceClassification

# Load the model using your LightningModule class
model = RegressionModel.load_from_checkpoint('model.pt')

  return torch.load(f, map_location=map_location)
  from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [None]:
# Load the test dataset
test_file_path = 'data/binary/fr_test_binary.csv'
#test_data = pd.read_csv(test_file_path)

test_data = pd.read_csv(test_file_path, encoding='ISO-8859-1')


In [None]:
# from datasets import load_dataset

# # Load the test dataset for the desired language (e.g., English)
# test_dataset = load_dataset("path.to.your.TyDiP.py", "en", split="test")


In [None]:
from transformers import XLMRobertaTokenizer
import torch
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

# Load tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples, padding="max_length", truncation=True)  # Use padding and truncation for fixed-size inputs

# Apply tokenization
tokenized_test_dataset = test_data['sentence'].map(tokenize_function)



In [None]:
# Create DataLoader (batch_size can be increased, but 1 is fine for now)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1)

# Load your model
model = RegressionModel.load_from_checkpoint('model.pt')
model.eval()  # Set the model to evaluation mode

  return torch.load(f, map_location=map_location)


RegressionModel(
  (model): XLMRobertaForSequenceClassification(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRoberta

In [None]:
# Prepare to store predictions and true labels
predictions = []
true_labels = test_data['score'].tolist()  # Access 'score' column from the original test_data DataFrame

In [None]:
import torch

# Step 1: Make Predictions
with torch.no_grad():  # Disable gradient calculation
    for batch in test_dataloader:
        # Move input tensors to the same device as the model (if necessary)
        batch = {key: torch.tensor(val).unsqueeze(0).to(model.device) for key, val in batch.items()}

        # Get model outputs
        outputs = model(**batch)

        # Get logits (for sentence, usually first token's logits)
        logits = outputs.logits

        # Get the predicted class label for the entire sentence
        predicted_label = torch.argmax(logits, dim=1).cpu().numpy()

        # Store the predictions
        predictions.extend(predicted_label)

# Check the lengths of true_labels and predictions
print(f"Length of true_labels: {len(true_labels)}")
print(f"Length of predictions: {len(predictions)}")

if len(true_labels) != len(predictions):
    print("WARNING: Lengths of true_labels and predictions do not match!")

Length of true_labels: 250
Length of predictions: 250


In [None]:
# Binarize true labels based on a threshold
threshold = 0.5
true_labels_binary = [1 if score > threshold else 0 for score in true_labels]

In [None]:
# Step 2: Evaluate Performance
if len(true_labels_binary) == len(predictions):
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(true_labels_binary, predictions)
    f1 = f1_score(true_labels_binary, predictions, average='weighted')  # Adjust as needed
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
else:
    print("WARNING: Lengths of true_labels and predictions do not match!")

Accuracy: 0.8840
F1 Score: 0.8830


In [None]:
output_df = pd.DataFrame({
    'Sentence': test_data['sentence'],      # Original sentences
    'True Score': test_data['score'],       # Continuous true scores
    'True Label': true_labels_binary,       # True binary labels
    'Predicted Label': predictions,         # Predicted binary labels
})

# Display rows
print(output_df)


                                              Sentence  True Score  \
0    Allons bon, tu me dis que tu n'acceptes pas qu...   -0.476638   
1    Bonjour Maloq  C'Ã©tait pas la peine de me fil...   -1.064665   
2    Moquez mon Ã©criture. Si cela ne vous plait pa...   -1.664436   
3    C'est pourquoi je pense qu'il est nÃ©cessaire ...    1.192783   
4    Dâautre part, jâai vu que lâon pouvait a...    1.255779   
..                                                 ...         ...   
245  Si la rÃ©ponse Ã©tait, Ã  travers l'histoire, ...   -2.064034   
246  Je ne sais pas comment il faut procÃ©der. Le s...    0.895263   
247  C'est de cette maniÃ¨re qu'ils se sont dÃ©fini...   -0.975411   
248  PeutÃªtre manquetil encore <url>, Ã©galement v...   -0.688790   
249  Selon toi, fautil fusionner? Si oui, fautil, c...    0.933033   

     True Label  Predicted Label  
0             0                0  
1             0                1  
2             0                0  
3             1    

In [None]:
output_df.to_csv('classification_output_with_scores_fr.csv', index=False)


In [None]:
import torch.nn.functional as F

# Prepare to store predictions, probabilities, and true labels
predictions = []
probabilities = []
true_labels_binary = [1 if score > 0.5 else 0 for score in true_labels]

with torch.no_grad():  # Disable gradient calculation
    for batch in test_dataloader:
        # Convert lists of tensors to tensors
        batch = {k: torch.stack(v) for k, v in batch.items()}

        # Move input tensors to the same device as the model (if necessary)
        for key in batch.keys():
            batch[key] = batch[key].to(model.device)

        # Get model outputs
        # outputs = model(**batch)

        # Get the predicted logits
        logits = outputs.logits

        # Apply softmax to get probabilities
        probs = F.softmax(logits, dim=1).cpu().numpy()

        # Get predicted class labels
        predicted_label = torch.argmax(logits, dim=1).cpu().numpy()

        # Store predictions and probabilities
        predictions.extend(predicted_label)
        probabilities.extend(probs)

# Create a DataFrame with sentences, true labels, predictions, and probabilities
output_df = pd.DataFrame({
    'Sentence': test_data['sentence'],
    'True Score': test_data['score'],       # Continuous true scores
    'True Label': true_labels_binary,
    'Predicted Label': predictions,
    'Politeness Probability': [prob[1] for prob in probabilities],  # Probability for the "polite" class
    'Impoliteness Probability': [prob[0] for prob in probabilities] # Probability for the "impolite" class
})

# Display rows
print(output_df)

                                              Sentence  True Label  \
0    Allons bon, tu me dis que tu n'acceptes pas qu...           0   
1    Bonjour Maloq  C'Ã©tait pas la peine de me fil...           0   
2    Moquez mon Ã©criture. Si cela ne vous plait pa...           0   
3    C'est pourquoi je pense qu'il est nÃ©cessaire ...           1   
4    Dâautre part, jâai vu que lâon pouvait a...           1   
..                                                 ...         ...   
245  Si la rÃ©ponse Ã©tait, Ã  travers l'histoire, ...           0   
246  Je ne sais pas comment il faut procÃ©der. Le s...           1   
247  C'est de cette maniÃ¨re qu'ils se sont dÃ©fini...           0   
248  PeutÃªtre manquetil encore <url>, Ã©galement v...           0   
249  Selon toi, fautil fusionner? Si oui, fautil, c...           1   

     Predicted Label  Politeness Probability  Impoliteness Probability  
0                  1                0.982543                  0.017457  
1            

In [None]:
output_df.to_csv('classification_output_with_scores_and_probsfrt.csv', index=False)