In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-arabic-tweets-debi-intake-2/sample_submission.csv
/kaggle/input/nlp-arabic-tweets-debi-intake-2/train.csv
/kaggle/input/nlp-arabic-tweets-debi-intake-2/test.csv


In [2]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: Tesla T4
Wed Apr 30 16:25:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8             10W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+------------

### Dependencies Installation

In [3]:
!pip install gdown
!pip install pyarabic
!pip install farasapy
!pip install emoji
!pip install transformers
!git clone https://github.com/aub-mind/arabert.git

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl.metadata (8.9 kB)
Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14
Cloning into 'arabert'...
remote: Enumerating objects: 600, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 600 (delta 38), reused 45 (delta 30), pack-reused 535 (from 1)[K
Receiving objects: 100% (600/600), 9.14 MiB | 36.57 MiB/s, done.
Resolving deltas: 100% (339/339), done.


In [4]:
Data_set = pd.read_csv("/kaggle/input/nlp-arabic-tweets-debi-intake-2/train.csv")
Data_set

Unnamed: 0,tweet,class
0,' #علمتني_الحياه أن الذين يعيشون على الأرض ليس...,pos
1,' #ميري_كرسمس كل سنة وانتم طيبين http://t.co/n...,pos
2,' و انتهى مشوار الخواجة ',neg
3,' مش عارف ابتدى مذاكره منين :/ ',neg
4,' @mskhafagi إختصروا الطريق بدلا من إختيار ال...,neg
...,...,...
2054,' @wasfa_N الجمال مبيحتاح اي مكياج لناعم وله خ...,neu
2055,' @TheMurexDor نتمني وجود الفنانة رنا سماحة اف...,neu
2056,' ولد الهدى فالكائنات ضياء .. وفم الزمان تبسم ...,pos
2057,' @mohamed71944156 @samarroshdy1 انت متناقض جد...,neg


### Dataset Preparation

In [5]:
arabic_stop_words=[]
with open ('/kaggle/input/arabic-stop-words/list.txt',encoding='utf-8') as f :
    for i in f.readlines() :
        arabic_stop_words.append(i)
        arabic_stop_words[-1]=arabic_stop_words[-1][:-1]

In [6]:
import numpy as np
import pandas as pd
import re
import string,emoji, re
import pyarabic.araby as ar
import functools, operator
import logging
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis are matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

def data_cleaning (text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"https\S+", "", text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub("(\s\d+)","",text)
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", text)
    text = re.sub("\d+", " ", text)
    text = ar.strip_tashkeel(text)
    text = ar.strip_tatweel(text)
    text = text.replace("#", " ");
    text = text.replace("@", " ");
    text = text.replace("_", " ");
    
    # Remove arabic signs
    text = text[0:2] + ''.join([text[i] for i in range(2, len(text)) if text[i]!=text[i-1] or text[i]!=text[i-2]])
    text =  re.sub(r'([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+', '', text)
    text =  '' if text in arabic_stop_words else text
    from nltk.stem.isri import ISRIStemmer
    text=ISRIStemmer().stem(text)
    
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    em = text
    em_split_emoji = get_emoji_regexp().split(em)
    em_split_whitespace = [substr.split() for substr in em_split_emoji]
    em_split = functools.reduce(operator.concat, em_split_whitespace)
    text = " ".join(em_split)
    text = re.sub(r'(.)\1+', r'\1', text)
    
    text = text.replace("آ", "ا")
    text = text.replace("إ", "ا")
    text = text.replace("أ", "ا")
    text = text.replace("ؤ", "و")
    text = text.replace("ئ", "ي")
    return text

In [7]:
Data_set['tweet']=Data_set['tweet'].apply(lambda x: data_cleaning(x))
Data_set

Unnamed: 0,tweet,class
0,علمتني الحياه ان الذين يعيشون على الارض ليسوا ...,pos
1,ميري كرسمس كل سنة وانتم طيبين,pos
2,و انتهى مشوار الخواجة,neg
3,مش عارف ابتدى مذاكره منين,neg
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,neg
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,neu
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,neu
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,pos
2057,انت متناقض جدا يا صلاح,neg


### Dataset Tokonization

In [8]:
from arabert.preprocess import ArabertPreprocessor

model_name = "UBC-NLP/MARBERT"
df = Data_set
arabert_prep = ArabertPreprocessor(model_name=model_name)
df['tweet']=Data_set['tweet'].apply(lambda x: arabert_prep.preprocess(x))
df

Unnamed: 0,tweet,class
0,علمتني الحياه ان الذين يعيشون على الارض ليسوا ...,pos
1,ميري كرسمس كل سنة وانتم طيبين,pos
2,و انتهى مشوار الخواجة,neg
3,مش عارف ابتدى مذاكره منين,neg
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,neg
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,neu
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,neu
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,pos
2057,انت متناقض جدا يا صلاح,neg


### Label Encoding

In [9]:
from sklearn import preprocessing
lable_encoder = preprocessing.LabelEncoder()

encoded_labels=lable_encoder.fit_transform(Data_set["class"])
df['class']=encoded_labels
df

Unnamed: 0,tweet,class
0,علمتني الحياه ان الذين يعيشون على الارض ليسوا ...,2
1,ميري كرسمس كل سنة وانتم طيبين,2
2,و انتهى مشوار الخواجة,0
3,مش عارف ابتدى مذاكره منين,0
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0
...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2
2057,انت متناقض جدا يا صلاح,0


In [10]:
seed = 42
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation=train_test_split(df['tweet'], df['class'], test_size=0.2, random_state=seed)


### TF-IDF Embedding

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_ngram(n_gram,X_train,X_val):
    vectorizer = TfidfVectorizer(ngram_range=(n_gram,n_gram))
    x_train_vec = vectorizer.fit_transform(X_train)
    x_test_vec = vectorizer.transform(X_val)
    return x_train_vec,x_test_vec
# Applying tfidf with 1-gram, and 2-gram
tfidf_1g_transformation_train,tfidf_1g_transformation_validation= tfidf_ngram(1,X_train,X_validation)
tfidf_2g_transformation_train,tfidf_2g_transformation_validation= tfidf_ngram(2,X_train,X_validation)


In [13]:
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Models and embeddings setup
models = [
    SVC(),
    KNeighborsClassifier(),
    XGBClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    MultinomialNB()
]

text_embedding = {
    'TF_IDF 1_gram': (tfidf_1g_transformation_train, tfidf_1g_transformation_validation),
    'TF_IDF 2_gram': (tfidf_2g_transformation_train, tfidf_2g_transformation_validation)
}

# Store results
highest_test_accuracy = 0
champion_model_name = ''
champion_model = None
champion_embedding = ''
results_dict = {
    'Model Name': [],
    'Embedding type': [],
    'Training Accuracy': [],
    'Testing Accuracy': []
}

# Evaluate models
for model in models:
    for embedding_vector in text_embedding:
        train = text_embedding[embedding_vector][0]
        test = text_embedding[embedding_vector][1]

        model.fit(train, y_train)
        train_acc = model.score(train, y_train)
        test_acc = model.score(test, y_validation)

        model_name = type(model).__name__.replace("Classifier", "")
        results_dict['Model Name'].append(model_name)
        results_dict['Embedding type'].append(embedding_vector)
        results_dict['Training Accuracy'].append(train_acc)
        results_dict['Testing Accuracy'].append(test_acc)

        if test_acc > highest_test_accuracy:
            highest_test_accuracy = test_acc
            champion_model_name = model_name
            champion_model = model
            champion_embedding = embedding_vector

# Display all accuracies
results_df = pd.DataFrame(results_dict)
print("All Model Accuracies:\n")
print(results_df.to_string(index=False))

# Print the champion model info
print("\nChampion Model:")
print(f"Name      : {champion_model_name}")
print(f"Embedding : {champion_embedding}")
print(f"Accuracy  : {highest_test_accuracy:.4f}")


All Model Accuracies:

        Model Name Embedding type  Training Accuracy  Testing Accuracy
               SVC  TF_IDF 1_gram           0.996357          0.485437
               SVC  TF_IDF 2_gram           0.986642          0.356796
        KNeighbors  TF_IDF 1_gram           0.343048          0.349515
        KNeighbors  TF_IDF 2_gram           0.961141          0.344660
               XGB  TF_IDF 1_gram           0.852459          0.480583
               XGB  TF_IDF 2_gram           0.483303          0.354369
      RandomForest  TF_IDF 1_gram           0.999393          0.456311
      RandomForest  TF_IDF 2_gram           0.987250          0.378641
      DecisionTree  TF_IDF 1_gram           0.999393          0.439320
      DecisionTree  TF_IDF 2_gram           0.987250          0.351942
LogisticRegression  TF_IDF 1_gram           0.962963          0.507282
LogisticRegression  TF_IDF 2_gram           0.986642          0.383495
     MultinomialNB  TF_IDF 1_gram           0.938676  

In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer =AutoTokenizer.from_pretrained('UBC-NLP/MARBERT')
model = AutoModelForSequenceClassification.from_pretrained('UBC-NLP/MARBERT', num_labels=3)
#-----------------------------------
# Tokenize the sentences using bert tokenizer
df["bert_tokens"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["bert_tokens_ids"] = df.tweet.apply(lambda x: tokenizer(x).tokens())
df["encoded"] = df.tweet.apply(lambda x: tokenizer.encode_plus(x,return_tensors='pt')['input_ids'])
df

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,tweet,class,bert_tokens,bert_tokens_ids,encoded
0,علمتني الحياه ان الذين يعيشون على الارض ليسوا ...,2,"[[CLS], علمتني, الحياه, ان, الذين, يعيشون, على...","[[CLS], علمتني, الحياه, ان, الذين, يعيشون, على...","[[tensor(2), tensor(8244), tensor(3946), tenso..."
1,ميري كرسمس كل سنة وانتم طيبين,2,"[[CLS], ميري, كرس, ##مس, كل, سنة, وانتم, طيبين...","[[CLS], ميري, كرس, ##مس, كل, سنة, وانتم, طيبين...","[[tensor(2), tensor(53902), tensor(35685), ten..."
2,و انتهى مشوار الخواجة,0,"[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[CLS], و, انتهى, مشوار, الخواجة, [SEP]]","[[tensor(2), tensor(144), tensor(7609), tensor..."
3,مش عارف ابتدى مذاكره منين,0,"[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[CLS], مش, عارف, ابتدى, مذاكره, منين, [SEP]]","[[tensor(2), tensor(2093), tensor(3323), tenso..."
4,اختصروا الطريق بدلا من اختيار المنصف ثم الانقل...,0,"[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[CLS], اختصر, ##وا, الطريق, بدلا, من, اختيار,...","[[tensor(2), tensor(22181), tensor(1958), tens..."
...,...,...,...,...,...
2054,الجمال مبيحتاح اي مكياج لناعم وله خشن جمل الطا...,1,"[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[CLS], الجمال, مبيح, ##تاح, اي, مكياج, لنا, #...","[[tensor(2), tensor(4770), tensor(68899), tens..."
2055,نتمني وجود الفنانة رنا سماحة افضل فنانة صاعدة ...,1,"[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[CLS], نتمني, وجود, الفنانة, رنا, سماحة, افضل...","[[tensor(2), tensor(39939), tensor(3715), tens..."
2056,ولد الهدى فالكاينات ضياء وفم الزمان تبسم وسناء...,2,"[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[CLS], ولد, الهدى, فالك, ##اينات, ضياء, وف, #...","[[tensor(2), tensor(3735), tensor(4880), tenso..."
2057,انت متناقض جدا يا صلاح,0,"[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[CLS], انت, متناقض, جدا, يا, صلاح, [SEP]]","[[tensor(2), tensor(2030), tensor(27008), tens..."


In [35]:
# Number of training epochs
epochs = 20
# Select the max sentance lenth
MAX_LEN = 80
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size =64

In [None]:
!pip install keras-preprocessing

In [36]:
from keras_preprocessing.sequence import pad_sequences

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in df['bert_tokens']]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [37]:
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [38]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, encoded_labels,
                                                             random_state=seed, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                             random_state=seed, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)


In [39]:
import torch.optim as optim


param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}]

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = optim.AdamW(optimizer_grouped_parameters,lr=.000005)

In [40]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
#---------------------------------------------
from tqdm import trange
import numpy as np

t = []
# Store our loss and accuracy for plotting
train_loss_set = []

if torch.cuda.is_available():
    # Transfer the model to GPU
    model.to("cuda")
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    if torch.cuda.is_available():
        loss = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"), labels=b_labels.to("cuda"))["loss"]
    else:
        loss = model(b_input_ids.to("cpu"), token_type_ids=None, attention_mask=b_input_mask.to("cpu"), labels=b_labels.to("cpu"))["loss"]

    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
  # Validation
  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()
  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    # batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    b_labels = b_labels.type(torch.LongTensor)   # casting to long
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
        if torch.cuda.is_available():
            logits = model(b_input_ids.to("cuda"), token_type_ids=None, attention_mask=b_input_mask.to("cuda"))
        else:
            logits = model(b_input_ids.to("cpu"), token_type_ids=None, attention_mask=b_input_mask.to("cpu"))

    # Move logits and labels to CPU
    logits = logits["logits"].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
  if (eval_accuracy/nb_eval_steps) > 0.78:
    break

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Train loss: 1.0825892851270478


Epoch:   5%|▌         | 1/20 [00:25<08:06, 25.63s/it]

Validation Accuracy: 0.47265625
Train loss: 0.9461242186612097


Epoch:  10%|█         | 2/20 [00:51<07:46, 25.92s/it]

Validation Accuracy: 0.7003348214285714
Train loss: 0.7086904788839405


Epoch:  15%|█▌        | 3/20 [01:16<07:08, 25.21s/it]

Validation Accuracy: 0.7276785714285714
Train loss: 0.5101674651277477


Epoch:  20%|██        | 4/20 [01:41<06:42, 25.14s/it]

Validation Accuracy: 0.7544642857142857
Train loss: 0.3787607478684393


Epoch:  20%|██        | 4/20 [02:06<08:27, 31.72s/it]

Validation Accuracy: 0.7979910714285714





In [41]:
df_submit = pd.read_csv("/kaggle/input/nlp-arabic-tweets-debi-intake-2/test.csv")
df_submit["tweet"] = df_submit.tweet.apply(lambda x: data_cleaning(x))
df_submit['tweet']=df_submit['tweet'].apply(lambda x: arabert_prep.preprocess(x))
# Tokenize the sentences using bert tokenizer
df_submit["bert_tokens"] = df_submit.tweet.apply(lambda x: tokenizer(x).tokens())
#---------------------------------
input_ids_submit = [tokenizer.convert_tokens_to_ids(x) for x in df_submit["bert_tokens"]]
input_ids_submit = pad_sequences(input_ids_submit, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks_submit = []
for seq in input_ids_submit:
    seq_mask = [float(i>0) for i in seq]
    attention_masks_submit.append(seq_mask)
    
inputs_submit = torch.tensor(input_ids_submit)
masks_submit = torch.tensor(attention_masks_submit)
submit_data = TensorDataset(inputs_submit, masks_submit)
submit_dataloader = DataLoader(submit_data, batch_size=batch_size)
model.eval()
if torch.cuda.is_available():
    model.to("cuda")

outputs = []
for input, masks in submit_dataloader:
    torch.cuda.empty_cache() # empty the gpu memory
    if torch.cuda.is_available():
        # Transfer the batch to gpu
        input = input.to('cuda')
        masks = masks.to('cuda')
    # Run inference on the batch
    output = model(input, attention_mask=masks)["logits"]
    # Transfer the output to CPU again and convert to numpy
    output = output.cpu().detach().numpy()
    # Store the output in a list
    outputs.append(output)
# Concatenate all the lists within the list into one list
outputs = [x for y in outputs for x in y]
# Inverse transform the label encoding
pred_flat = np.argmax(outputs, axis=1).flatten()
output_labels = lable_encoder.inverse_transform(pred_flat)

In [42]:
submission = pd.DataFrame({"Id":np.arange(1, len(output_labels)+1), "class":output_labels})
submission.to_csv("/kaggle/working/submission.csv", index=False)