In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [3]:
# One seed shared by every random number generator this notebook relies on:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
SEED = 555
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

In [4]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# **1. Understanding BERT and XLM-RoBERTa**

## **Objective: Learn how transformer models work and their role in NLP tasks.**

Each variation of BERT has unique strengths. Depending on the task, the appropriate model might be:

- RoBERTa for large-scale text understanding.
- ALBERT for efficient NLP in resource-constrained environments.
- DistilBERT for fast real-time processing.
- ELECTRA for efficient pretraining.
- XLM-RoBERTa for multilingual applications.

# **2. Tokenizing Text**

## **Objective: Understand how to tokenize text using pre-trained tokenizers.**

In [9]:
from transformers import BertTokenizer, XLMRobertaTokenizer

# Load the pre-trained tokenizers for the two checkpoints compared in this section.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [31]:
def bert_tokenize(premise, hypothesis=None):
    """Encode one sentence, or a premise/hypothesis pair, with the BERT tokenizer.

    Args:
        premise: First (or only) sentence to encode.
        hypothesis: Optional second sentence. When given, both sentences are
            encoded together as one sequence pair.

    Returns:
        The tokenizer output with ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` as PyTorch tensors, padded/truncated to 128 tokens.
    """
    kwargs = {
        "text": premise,
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
        "return_tensors": 'pt',
        "return_token_type_ids": True,
        "return_attention_mask": True,
    }

    # Only pass ``text_pair`` when a second sentence was actually supplied.
    if hypothesis is not None:
        kwargs['text_pair'] = hypothesis

    # ``encode_plus`` is deprecated; calling the tokenizer directly is the
    # supported equivalent and matches how the tokenizer is used elsewhere in
    # this notebook.
    return bert_tokenizer(**kwargs)

In [32]:
def alberta_tokenize(premise, hypothesis=None):
    """Encode one sentence, or a premise/hypothesis pair, with the XLM-RoBERTa tokenizer.

    Note: despite its name, this function uses ``xlm_roberta_tokenizer``
    (XLM-RoBERTa), not an ALBERT tokenizer. The name is kept so existing
    callers keep working.

    Args:
        premise: First (or only) sentence to encode.
        hypothesis: Optional second sentence. When given, both sentences are
            encoded together as one sequence pair.

    Returns:
        The tokenizer output with ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` as PyTorch tensors, padded/truncated to 128 tokens.
    """
    kwargs = {
        "text": premise,
        "padding": 'max_length',
        "truncation": True,
        "max_length": 128,
        "return_tensors": 'pt',
        "return_token_type_ids": True,
        "return_attention_mask": True,
    }

    # Only pass ``text_pair`` when a second sentence was actually supplied.
    if hypothesis is not None:
        kwargs['text_pair'] = hypothesis

    # ``encode_plus`` is deprecated; calling the tokenizer directly is the
    # supported equivalent and matches how the tokenizer is used elsewhere in
    # this notebook.
    return xlm_roberta_tokenizer(**kwargs)

In [33]:
# Single-sentence example: encode one English sentence with BERT and inspect each output field.
premise = 'I love choclate'
bert_encoded = bert_tokenize(premise)

print("BERT input_ids       :", bert_encoded['input_ids'])
print("BERT attention_mask  :", bert_encoded['attention_mask'])
print("BERT token_type_ids  :", bert_encoded['token_type_ids'])
# Round-trip the ids back to text; special tokens are skipped (skip_special_tokens=True).
print("BERT decoded         :", bert_tokenizer.decode(bert_encoded['input_ids'][0], skip_special_tokens=True))

BERT input_ids       : tensor([[  101,  1045,  2293, 16480, 20464,  3686,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,

In [40]:
# Single-sentence example: encode one French sentence with the XLM-RoBERTa tokenizer.
premise = "J'adore le chocolat"
xlm_encoded = alberta_tokenize(premise)

# [0] selects the first (only) row of each returned tensor.
print("XLM-R input_ids      :", xlm_encoded['input_ids'][0])
print("XLM-R attention_mask :", xlm_encoded['attention_mask'][0])
# Round-trip the ids back to text; special tokens are skipped (skip_special_tokens=True).
print("XLM-R decoded        :", xlm_roberta_tokenizer.decode(xlm_encoded['input_ids'][0], skip_special_tokens=True))

XLM-R input_ids      : tensor([     0,    821,     25,  71587,     95, 167714,      2,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,  

In [38]:
# Sentence-pair example: encode a premise and a hypothesis together with BERT.
premise = 'I love choclate'
hypothesis = 'I hate choclate'
bert_encoded = bert_tokenize(premise, hypothesis)

print("BERT input_ids       :", bert_encoded['input_ids'])
print("BERT attention_mask  :", bert_encoded['attention_mask'])
print("BERT token_type_ids  :", bert_encoded['token_type_ids'])
# Round-trip the ids back to text; special tokens are skipped (skip_special_tokens=True).
print("BERT decoded         :", bert_tokenizer.decode(bert_encoded['input_ids'][0], skip_special_tokens=True))

BERT input_ids       : tensor([[  101,  1045,  2293, 16480, 20464,  3686,   102,  1045,  5223, 16480,
         20464,  3686,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,

In [41]:
# Sentence-pair example: encode a French premise and hypothesis together with XLM-RoBERTa.
premise = "J'adore le chocolat"
hypothesis = "Je deteste le chocolat"
xlm_encoded = alberta_tokenize(premise, hypothesis)

# [0] selects the first (only) row of each returned tensor.
print("XLM-R input_ids      :", xlm_encoded['input_ids'][0])
print("XLM-R attention_mask :", xlm_encoded['attention_mask'][0])
# Round-trip the ids back to text; special tokens are skipped (skip_special_tokens=True).
print("XLM-R decoded        :", xlm_roberta_tokenizer.decode(xlm_encoded['input_ids'][0], skip_special_tokens=True))

XLM-R input_ids      : tensor([     0,    821,     25,  71587,     95, 167714,      2,      2,    845,
           149,   4896,     95, 167714,      2,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,  

# **3. Loading and Exploring the Dataset**

## **Objective: Load the dataset and explore its structure.**

In [5]:
!unzip "/content/drive/MyDrive/GENAI/Week7/Day1/Basics of BERT and XLM-RoBERTa - PyTorch - 2.zip"

Archive:  /content/drive/MyDrive/GENAI/Week7/Day1/Basics of BERT and XLM-RoBERTa - PyTorch - 2.zip
  inflating: Basics of BERT and XLM-RoBERTa - PyTorch/sample_submission.csv  
 extracting: Basics of BERT and XLM-RoBERTa - PyTorch/test.csv.zip  
 extracting: Basics of BERT and XLM-RoBERTa - PyTorch/train.csv.zip  


In [6]:
!unzip "/content/Basics of BERT and XLM-RoBERTa - PyTorch/test.csv.zip"
!unzip "/content/Basics of BERT and XLM-RoBERTa - PyTorch/train.csv.zip"

Archive:  /content/Basics of BERT and XLM-RoBERTa - PyTorch/test.csv.zip
  inflating: test.csv                
Archive:  /content/Basics of BERT and XLM-RoBERTa - PyTorch/train.csv.zip
  inflating: train.csv               


In [7]:
# Read the raw training CSV, then derive a de-duplicated frame from it
# (the raw frame is left untouched) and preview the first rows.
file_train = pd.read_csv("/content/train.csv")
df_train = file_train.drop_duplicates()
df_train.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [42]:
# (rows, columns) of the training frame.
df_train.shape

(12120, 6)

In [43]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12120 entries, 0 to 12119
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12120 non-null  object
 1   premise     12120 non-null  object
 2   hypothesis  12120 non-null  object
 3   lang_abv    12120 non-null  object
 4   language    12120 non-null  object
 5   label       12120 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 568.3+ KB


In [47]:
# Number of missing values in each training column.
df_train.isna().sum()

Unnamed: 0,0
id,0
premise,0
hypothesis,0
lang_abv,0
language,0
label,0


In [8]:
# Read the raw test CSV, then derive a de-duplicated frame from it
# (the raw frame is left untouched) and preview the first rows.
file_test = pd.read_csv("/content/test.csv")
df_test = file_test.drop_duplicates()
df_test.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی...",ur,Urdu
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...,ar,Arabic
2,e98005252c,et cela est en grande partie dû au fait que le...,Les mères se droguent.,fr,French
3,58518c10ba,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。,zh,Chinese
4,c32b0d16df,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась.",ru,Russian


In [44]:
# (rows, columns) of the test frame.
df_test.shape

(5195, 5)

In [45]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5195 entries, 0 to 5194
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5195 non-null   object
 1   premise     5195 non-null   object
 2   hypothesis  5195 non-null   object
 3   lang_abv    5195 non-null   object
 4   language    5195 non-null   object
dtypes: object(5)
memory usage: 203.1+ KB


In [48]:
# Number of missing values in each test column.
df_test.isna().sum()

Unnamed: 0,0
id,0
premise,0
hypothesis,0
lang_abv,0
language,0


Pour l'entraînement du modèle, on conserve la colonne premise, qui contient la première phrase, la colonne hypothesis, qui contient la seconde phrase à comparer, et la colonne label, qui indique la relation entre les deux (contradiction, neutralité ou entailment).

In [46]:
# Keep only the columns the model needs: the two sentences, plus the label for training.
# .copy() gives each frame its own data, so adding a column to it later does not
# raise pandas' SettingWithCopyWarning.
df_train_cleaned = df_train[['premise', 'hypothesis', 'label']].copy()
df_test_cleaned = df_test[['premise', 'hypothesis']].copy()

# **4. Preparing Input Data for the Model**

## **Objective: Format input data correctly for transformer models.**

# **5. Creating Cross-Validation Folds**

## **Objective: Implement k-fold cross-validation for training.**

In [71]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import TensorDataset
import torch

# Tokenize the whole training set ONCE. Every pair is padded/truncated to a fixed
# max_length, so a row's encoding does not depend on which other rows are encoded
# with it; re-tokenizing inside each fold would repeat the same work.
enc_all = xlm_roberta_tokenizer(
    text=df_train_cleaned['premise'].tolist(),
    text_pair=df_train_cleaned['hypothesis'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt',
    return_attention_mask=True,
    return_token_type_ids=True
)
labels_all = torch.tensor(df_train_cleaned['label'].tolist())

# Field order inside every TensorDataset: input_ids, attention_mask, token_type_ids, label.
all_tensors = (
    enc_all['input_ids'],
    enc_all['attention_mask'],
    enc_all['token_type_ids'],
    labels_all,
)


def _fold_dataset(row_positions):
    """Build a TensorDataset holding the rows at the given positional indices."""
    idx = torch.as_tensor(row_positions, dtype=torch.long)
    return TensorDataset(*(t[idx] for t in all_tensors))


tokenized_train_folds = []
tokenized_val_folds = []

# Stratified split keeps the label distribution similar across the 5 folds.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = df_train_cleaned[['premise', 'hypothesis']]
y = df_train_cleaned['label']

for fold_idx, (train_index, val_index) in enumerate(kf.split(X, y)):
    # kf.split yields positional indices, which line up with the row order of the
    # tensors built above.
    tokenized_train_folds.append(_fold_dataset(train_index))
    tokenized_val_folds.append(_fold_dataset(val_index))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [74]:
from torch.utils.data import DataLoader

# One loader per fold: training batches are reshuffled, validation batches
# keep their original order.
train_loaders = [
    DataLoader(fold_train, batch_size=32, shuffle=True)
    for fold_train in tokenized_train_folds
]
val_loaders = [
    DataLoader(fold_val, batch_size=32, shuffle=False)
    for fold_val in tokenized_val_folds
]

In [76]:
from transformers import XLMRobertaForSequenceClassification

# Instantiate XLM-RoBERTa with a 3-way classification head (num_labels=3).
# NOTE(review): the cross-validation loop further down builds a fresh model for every
# fold, so this instance looks unused by the training code in this notebook.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=3)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 2e-5.
# NOTE(review): the cross-validation loop further down re-creates its own optimizer
# for every fold, so this instance looks unused by the training code.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [79]:
import torch.nn as nn

# Cross-entropy loss applied to the classifier logits.
# NOTE(review): the cross-validation loop further down re-creates its own loss_fn
# for every fold.
loss_fn = nn.CrossEntropyLoss()

In [84]:
from logging import log
from tqdm import tqdm

# Number of passes over each fold's training data.
N_EPOCHS = 3
# One (train, validation) loader pair exists per fold.
n_folds = len(train_loaders)

# Validation accuracy of each fold, in fold order.
all_fold_accuracies = list()

for fold_idx in range(n_folds):
    print(f"\n🔁 Fold {fold_idx + 1}/{n_folds}")

    # Start every fold from the same pre-trained checkpoint, with its own optimizer,
    # so nothing learned on one fold carries over to the next.
    model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=3)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    train_loader = train_loaders[fold_idx]
    val_loader = val_loaders[fold_idx]

    for epoch in range(N_EPOCHS):
        print(f"  📚 Epoch {epoch+1}/{N_EPOCHS}")
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for batch in tqdm(train_loader, desc=f"    ⏳ Training Progress", leave=False):
            # The model was moved to `device` above, so every batch tensor has to
            # be moved there too; otherwise the forward pass fails with a device
            # mismatch whenever `device` is a GPU.
            input_ids, attention_mask, token_type_ids, labels = [t.to(device) for t in batch]

            optimizer.zero_grad()
            # Labels are not passed to the model: the loss is computed explicitly
            # with loss_fn below, so the model's built-in loss would be discarded.
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            logits = outputs.logits
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            # total_loss is the SUM of the per-batch losses over the epoch.
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        accuracy = correct / total
        print(f"    ✅ Epoch {epoch+1}: loss={total_loss:.4f}, accuracy={accuracy:.4f}")

    # Evaluate on the held-out part of this fold once training has finished.
    model.eval()
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, token_type_ids, labels = [t.to(device) for t in batch]
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_accuracy = val_correct / val_total
    all_fold_accuracies.append(val_accuracy)
    print(f"✅ Fold {fold_idx+1} - Validation accuracy : {val_accuracy:.4f}")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔁 Fold 1/5
  📚 Epoch 1/3




    ✅ Epoch 1: loss=329.5075, accuracy=0.3772
  📚 Epoch 2/3




    ✅ Epoch 2: loss=288.9464, accuracy=0.5436
  📚 Epoch 3/3




    ✅ Epoch 3: loss=220.8579, accuracy=0.6925
✅ Fold 1 - Validation accuracy : 0.7021

🔁 Fold 2/5


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  📚 Epoch 1/3




    ✅ Epoch 1: loss=331.8251, accuracy=0.3617
  📚 Epoch 2/3




    ✅ Epoch 2: loss=288.5960, accuracy=0.5341
  📚 Epoch 3/3




    ✅ Epoch 3: loss=217.3291, accuracy=0.7016
✅ Fold 2 - Validation accuracy : 0.7116

🔁 Fold 3/5


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  📚 Epoch 1/3




    ✅ Epoch 1: loss=324.4819, accuracy=0.4144
  📚 Epoch 2/3




    ✅ Epoch 2: loss=271.9197, accuracy=0.5900
  📚 Epoch 3/3




    ✅ Epoch 3: loss=212.1642, accuracy=0.7090
✅ Fold 3 - Validation accuracy : 0.7059

🔁 Fold 4/5


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  📚 Epoch 1/3




    ✅ Epoch 1: loss=315.1713, accuracy=0.4376
  📚 Epoch 2/3




    ✅ Epoch 2: loss=250.9921, accuracy=0.6349
  📚 Epoch 3/3




    ✅ Epoch 3: loss=196.6114, accuracy=0.7292
✅ Fold 4 - Validation accuracy : 0.6951

🔁 Fold 5/5


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  📚 Epoch 1/3




    ✅ Epoch 1: loss=328.3474, accuracy=0.3899
  📚 Epoch 2/3




    ✅ Epoch 2: loss=270.1563, accuracy=0.5990
  📚 Epoch 3/3




    ✅ Epoch 3: loss=208.0490, accuracy=0.7148
✅ Fold 5 - Validation accuracy : 0.7026


In [85]:
# Tokenize every test premise/hypothesis pair with the same settings used for the
# training folds (fixed length of 128, PyTorch tensors).
enc_test = xlm_roberta_tokenizer(
    text=df_test_cleaned['premise'].tolist(),
    text_pair=df_test_cleaned['hypothesis'].tolist(),
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
    return_token_type_ids=True,
    return_attention_mask=True,
)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [86]:
# Bundle the encoded test tensors (no label tensor is included) and iterate
# them in their original order so predictions line up with the test rows.
test_dataset = TensorDataset(
    *(enc_test[field] for field in ('input_ids', 'attention_mask', 'token_type_ids'))
)

test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [89]:
# As the notebook is written, `model` here is whatever the cross-validation loop
# left behind, i.e. the model trained on the final fold.
model.eval()
test_preds = []

# No gradients are needed for inference.
with torch.no_grad():
    for test_batch in test_loader:
        ids, mask, type_ids = (t.to(device) for t in test_batch)
        batch_logits = model(ids, attention_mask=mask, token_type_ids=type_ids).logits
        # The highest-scoring class per row becomes the prediction; move it back
        # to the CPU before collecting.
        test_preds.extend(batch_logits.argmax(dim=1).cpu().numpy())

In [90]:
# assign() returns a new frame with the extra column instead of writing into a
# slice of df_test, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
df_test_cleaned = df_test_cleaned.assign(predicted_label=test_preds)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_cleaned["predicted_label"] = test_preds


In [91]:
# Preview the first 20 test rows together with their predicted labels.
df_test_cleaned.head(20)

Unnamed: 0,premise,hypothesis,predicted_label
0,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم...,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی...",2
1,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال...,2
2,et cela est en grande partie dû au fait que le...,Les mères se droguent.,0
3,与城市及其他公民及社区组织代表就IMA的艺术发展进行对话&amp,IMA与其他组织合作，因为它们都依靠共享资金。,1
4,Она все еще была там.,"Мы думали, что она ушла, однако, она осталась.",1
5,His family had lost a son and a daughter now.,The son and daughter had lost their father.,0
6,Steps are initiated to allow program board mem...,There's enough room for 35-40 positions on the...,1
7,C'était probablement la première chose dont je...,C'était l'un de mes premiers souvenirs.,0
8,"agencies' operating trust, enterprise and inte...",Agencies in financial trouble are usually audi...,1
9,Hakuna aliyejua walipokwenda.,Mafiko yao ilikuwa ni siri,2
