# Imports and Installs

In [None]:
# Installing additional libraries for text preprocessing
!pip install -q preprocessor
!pip install -q contractions
!pip install -q optuna
!pip install holidays
!pip install gensim



In [None]:
!pip install datasets
# Connect to Google Drive
from google.colab import files
import gdown

# Basic Python
import pandas as pd
import numpy as np
from datetime import datetime
import holidays

# Text Preprocessing
import re
import preprocessor
import contractions
from tqdm import tqdm
tqdm.pandas()

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Feature Vectorization
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Optimization
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.metrics import f1_score, classification_report, make_scorer

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='optuna.distributions')

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import TrainerCallback, TrainerState, TrainerControl
from datasets import Dataset, Features, Value, ClassLabel



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Get The Data

In [None]:
# Prompt the user to upload the training file through the Colab upload widget
uploaded = files.upload()

# Use the first uploaded file (expected to be 'trump_train.tsv')
file_name = list(uploaded.keys())[0]

# The file is read with header=None, so the column names are supplied explicitly
column_names = ['tweet id', 'user handle', 'tweet text', 'time stamp', 'device']

# Load the TSV file into a DataFrame with the specified column names
df = pd.read_csv(file_name, sep='\t', names=column_names, header=None)

# Display the first few rows of the DataFrame
print(df.head())

# Display information about the DataFrame.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

# Display summary statistics of the DataFrame
print(df.describe())

Saving trump_train.tsv to trump_train.tsv
             tweet id      user handle  \
0  845974102619906048  realDonaldTrump   
1  846166053663191040  realDonaldTrump   
2  835814988686233601  realDonaldTrump   
3  835817351178301440  realDonaldTrump   
4  835916511944523777  realDonaldTrump   

                                          tweet text           time stamp  \
0  Democrats are smiling in D.C. that the Freedom...  2017-03-26 15:21:58   
1  General Kelly is doing a great job at the bord...  2017-03-27 04:04:42   
2  The race for DNC Chairman was, of course, tota...  2017-02-26 13:33:16   
3  For first time the failing @nytimes will take ...  2017-02-26 13:42:39   
4  Russia talk is FAKE NEWS put out by the Dems, ...  2017-02-26 20:16:41   

    device  
0   iphone  
1   iphone  
2  android  
3  android  
4  android  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156 entries, 0 to 3155
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------      

# Text Preprocess

**Our data cleaning process handles a few modifications:**
1. Define regular expressions for different unwanted patterns and normalize them.

2. Contractions are expanded to their full form (e.g. "don't" → "do not"), and other unwanted characters are removed.

3. Cleans the text to retain only alphanumeric characters and common punctuation.

4. Converts text to lowercase and removes stop words (user's choice).

5. Lemmatization

6. Removal of rows with missing values.

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define regex patterns (compiled once at module level and reused for every tweet)
# http:// or https:// followed by a run of URL characters or %XX escapes
re_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
# Anything that looks like a tag: '<', one or more non-'<' characters (lazy), then '>'
re_html = re.compile(r'<[^<]+?>')
# Numeric dates separated by '/' or '-': day/month/year style or year/month/day style
re_date = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{2,4}[/-]\d{1,2}[/-]\d{1,2}\b')
# Decimal numbers such as 3.14, optionally followed by '%'
re_decimal = re.compile(r'\b\d+\.\d+%?\b')
# Whole numbers, optionally followed by '%'
re_number_percent = re.compile(r'\b\d+%?\b')
# Runs of characters that are NOT letters, digits, whitespace or one of ? ! . , : ' " /
# NOTE: square brackets are not in the allowed set, so this pattern strips them too.
re_char = re.compile(r'[^0-9a-zA-Z\s?!.,:\'\"//]+')

# Placeholder tokens substituted for URLs, dates and numbers. They have to
# survive the character filter, lower-casing and lemmatization untouched.
PLACEHOLDERS = ('[URL]', '[DATE]', '[DECIMAL]', '[NUM]')
_re_placeholder = re.compile(r'(\[(?:URL|DATE|DECIMAL|NUM)\])')


# Function for cleaning text
def PreProcessText(text, lower=False, remove_stopwords=True):
    """
    Normalize a single tweet for downstream vectorization / tokenization.

    Steps: repair mis-encoded quotes, expand contractions, drop HTML tags,
    replace URLs / dates / numbers with placeholder tokens, strip unwanted
    characters, optionally lowercase, optionally remove stop words, lemmatize.

    Parameters:
    text (str): The raw tweet text. Non-string values (e.g. NaN for a missing
        tweet) are returned unchanged so a later dropna() can remove them.
    lower (bool): If True, lowercase every word except the placeholders.
    remove_stopwords (bool): If True, drop English stop words. The comparison
        is case-insensitive, so capitalised stop words ("The") are removed too.

    Returns:
    str: The cleaned text with words separated by single spaces.
    """
    if not isinstance(text, str):
        return text

    # Replacement mappings for common misencoded characters
    replacements = {
        '“': '"', '”': '"', '‘': "'", '’': "'",
        'â\x80\x9c': '"', 'â\x80\x9d': '"', 'â\x80\x99': "'"
    }

    for bad_char, good_char in replacements.items():
        text = text.replace(bad_char, good_char)

    text = contractions.fix(text)  # Expand contractions
    text = re_html.sub('', text)  # Remove HTML tags
    # Placeholders are padded with spaces so they always become separate tokens
    text = re_url.sub(' [URL] ', text)  # Replace URLs with placeholder
    text = re_date.sub(' [DATE] ', text)  # Replace dates with placeholder
    text = re_decimal.sub(' [DECIMAL] ', text)  # Replace decimal numbers with placeholder
    text = re_number_percent.sub(' [NUM] ', text)  # Replace whole numbers with placeholder

    # Remove unwanted characters everywhere except inside the placeholders:
    # re_char would otherwise strip their square brackets and make them
    # indistinguishable from ordinary words.
    pieces = _re_placeholder.split(text)
    text = ''.join(
        piece if piece in PLACEHOLDERS else re_char.sub('', piece)
        for piece in pieces
    )

    words = text.split()

    if lower:
        # Lowercase ordinary words; keep placeholders exactly as they are
        words = [word if word in PLACEHOLDERS else word.lower() for word in words]

    if remove_stopwords:
        # The stop-word list is lowercase, so compare case-insensitively
        words = [word for word in words if word.lower() not in stop_words]

    # Lemmatize ordinary words only
    words = [word if word in PLACEHOLDERS else lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)  # Join words back into a single string

In [None]:
# Clean every tweet with PreProcessText's default options (progress_apply shows a tqdm progress bar)
df["cleaned text"] = df["tweet text"].progress_apply(PreProcessText)
# Drop, in place, every row that has a missing value in any column
df.dropna(inplace=True)

100%|██████████| 3156/3156 [00:06<00:00, 459.25it/s] 


In [None]:
df.head()

Unnamed: 0,tweet id,user handle,tweet text,time stamp,device,cleaned text
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone,"Democrats smiling D.C. Freedom Caucus, help Cl..."
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone,General Kelly great job border. Numbers way do...
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android,"The race DNC Chairman was, course, totally ""ri..."
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android,For first time failing nytimes take ad bad one...
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android,"Russia talk FAKE NEWS put Dems, played media, ..."


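It is known that Trump used an Android phone, while his staff mostly used iPhones. We'll label a tweet as Trump's based on its device and username (only for tweets sent before the cutoff date, after which the device no longer distinguishes them).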

In [None]:
def classify_tweet(device, username, timestamp):
    """
    Label a tweet as written by Trump (0) or not by Trump (1).

    A tweet counts as Trump's when it was sent from the 'realDonaldTrump'
    account, from an Android device, before 2017-04-01. Tweets sent on or
    after that date get no label, because the device no longer tells the
    author apart.

    Parameters:
    device (str): The device used to send the tweet.
    username (str): The username who sent the tweet.
    timestamp (str): The timestamp of the tweet, formatted '%Y-%m-%d %H:%M:%S'.

    Returns:
    int: 0 if the tweet is classified as being from Trump, otherwise 1.
    None: if the tweet is on/after the cutoff date, or if an input is missing
        or malformed (non-string value, unparsable timestamp).
    """
    trump_username = 'realDonaldTrump'
    trump_cutoff_date = datetime(2017, 4, 1)

    try:
        tweet_date = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')  # Adjust the format as per your data

        if tweet_date >= trump_cutoff_date:  # Can't tell anymore whether it is Trump or not
            return None
        if username.lower() == trump_username.lower() and 'android' in device.lower():
            return 0
        return 1
    except (AttributeError, TypeError, ValueError):
        # Missing (NaN/None) fields or a malformed timestamp: leave the row
        # unlabelled. Any other exception is a real bug and should propagate.
        return None

# Label every row from its device, user handle and time stamp
def _label_row(row):
    return classify_tweet(row['device'], row['user handle'], row['time stamp'])


df['notTrump?'] = df.apply(_label_row, axis=1)

# Rows that could not be labelled are discarded
df = df.dropna(subset=['notTrump?'])

# Tally how many tweets fall into each class (Trump or Staff) and report it
class_counts = df['notTrump?'].value_counts()
print("Counts of each class (Trump or Staff):")
print(class_counts)

Counts of each class (Trump or Staff):
notTrump?
0.0    1991
1.0    1142
Name: count, dtype: int64


In [None]:
# Split the data into a training set and a held-out set (80/20, fixed seed).
# Models are optimized (Optuna) on the held-out set, then assessed using
# cross-validation on the training set.
y = df['notTrump?'].copy()
X = df.drop(columns=['notTrump?']).copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Classifier of Choice: BERT

### BERT

In [None]:
!pip install transformers datasets accelerate evaluate
!pip install --upgrade accelerate transformers
!pip install accelerate -U
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, TrainingArguments, Trainer, BertForSequenceClassification
import numpy as np
import evaluate
import torch



Step 1: Prepare the Data - Here we convert the dataframes to Hugging Face datasets

In [None]:
def _text_label_frame(X_part, y_part):
    """Pair the cleaned text with its label in a frame with a fresh 0..n-1 index."""
    text_column = X_part['cleaned text'].reset_index(drop=True)
    label_column = y_part.reset_index(drop=True).rename('label')
    return pd.concat([text_column, label_column], axis=1)


# Two-column frames (cleaned text + label) for each split
train_data = _text_label_frame(X_train, y_train)
test_data = _text_label_frame(X_test, y_test)

# Dataset schema: a string column plus a two-class ClassLabel column
features = Features({
    'cleaned text': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2, names=["0", "1"], id=None),
})

# Wrap both frames as Hugging Face datasets using that schema
train_dataset, test_dataset = (
    Dataset.from_pandas(frame, features=features)
    for frame in (train_data, test_data)
)

Step 2: Tokenize the text data

In [None]:
# Pretrained cased BERT: fast tokenizer plus a two-label classification head
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)


def tokenize_function(examples):
    """Tokenize a batch of cleaned tweets with max-length padding and truncation."""
    texts = examples["cleaned text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize both splits in batches
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/627 [00:00<?, ? examples/s]

Step 3: Metrics Calculation

In [None]:
# Metric objects are loaded once and reused on every evaluation pass
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy, precision, recall and F1 for a (logits, labels) pair."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit
    predictions = np.argmax(logits, axis=-1)

    scorers = (
        ("accuracy", accuracy_metric),
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    return {
        name: metric.compute(predictions=predictions, references=labels)[name]
        for name, metric in scorers
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Step 4: Train

In [None]:
# Pick the best available backend: CUDA first, then Apple MPS, else CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cuda device


In [None]:
class SaveWeightsCallback(TrainerCallback):
    """Trainer callback that saves the model's state dict after each epoch and downloads it."""

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        # state.epoch is truncated to an int for the file name and the message
        epoch_number = int(state.epoch)
        save_path = f"{args.output_dir}/pytorch_model_epoch_{epoch_number}.bin"

        torch.save(model.state_dict(), save_path)
        print(f"Model weights saved to {save_path} at the end of epoch {epoch_number}.")

        # Hand the saved file to the Colab download helper
        files.download(save_path)


# Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir="test_trainer",
    logging_dir='./logs',
    logging_steps=10,                 # log every 10 steps
    # Optimisation schedule
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, keeping at most 6 checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=6,
)

# Trainer wiring: model, data splits, metrics and the weight-saving callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[SaveWeightsCallback],
)

# Run the fine-tuning loop
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.5281,0.334871
2,0.3754,0.404153
3,0.2254,0.535581


Model weights saved to test_trainer/pytorch_model_epoch_1.bin at the end of epoch 1.
Model weights saved to test_trainer/pytorch_model_epoch_2.bin at the end of epoch 2.
Model weights saved to test_trainer/pytorch_model_epoch_3.bin at the end of epoch 3.


## RoBERTa

In [None]:
!pip install transformers datasets accelerate evaluate
!pip install --upgrade accelerate transformers
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value
from transformers import RobertaTokenizerFast, TrainingArguments, Trainer, RobertaForSequenceClassification, DistilBertForSequenceClassification
from transformers import TrainerCallback, TrainerState, TrainerControl
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [None]:
# Custom callback: persist the weights at the end of every epoch
class SaveWeightsCallback(TrainerCallback):
    """Trainer callback that saves the model's state dict after each epoch and downloads it."""

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        # state.epoch is truncated to an int for the file name and the message
        epoch_number = int(state.epoch)
        save_path = f"{args.output_dir}/pytorch_model_epoch_{epoch_number}.bin"

        torch.save(model.state_dict(), save_path)
        print(f"Model weights saved to {save_path} at the end of epoch {epoch_number}.")

        # Hand the saved file to the Colab download helper
        files.download(save_path)

# RoBERTa fine-tuning pipeline: build datasets, tokenize, configure and train.
# Relies on X_train / X_test / y_train / y_test and on compute_metrics, none of
# which are defined in this cell.

# Concatenate cleaned tweet text and labels; both indexes are reset first so the rows line up
train_data = pd.concat([X_train['cleaned text'].reset_index(drop=True), y_train.reset_index(drop=True).rename('label')], axis=1)
test_data = pd.concat([X_test['cleaned text'].reset_index(drop=True), y_test.reset_index(drop=True).rename('label')], axis=1)

# Define the features including ClassLabel for the label column
features = Features({
    'cleaned text': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2, names=["0", "1"], id=None)
})

# Convert to Hugging Face dataset format with the specified features
train_dataset = Dataset.from_pandas(train_data, features=features)
test_dataset = Dataset.from_pandas(test_data, features=features)

# Load the pretrained RoBERTa tokenizer and a two-label classification model
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of cleaned tweets with max-length padding and truncation."""
    return tokenizer(examples["cleaned text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch: only these three columns are exposed, as torch tensors
tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define training arguments with reduced batch size and gradient accumulation
# (4 samples per device x 4 accumulation steps = 16 samples per optimizer step per device)
training_args = TrainingArguments(
    output_dir="test_trainer",                # Directory to save the model and checkpoints.
    evaluation_strategy="epoch",              # Evaluate the model at the end of each epoch.
    logging_dir='./logs',                     # Directory to save the logs.
    logging_steps=10,                         # Log every 10 steps.
    per_device_train_batch_size=4,            # Reduced batch size for training.
    per_device_eval_batch_size=4,             # Reduced batch size for evaluation.
    gradient_accumulation_steps=4,            # Accumulate gradients over 4 steps.
    num_train_epochs=6,                       # Number of epochs to train for.
    weight_decay=0.01,                        # Weight decay for the optimizer.
    save_strategy="epoch",                    # Save model at the end of each epoch.
    save_total_limit=6,                       # Limit the total number of checkpoints.
    fp16=True,                                # Enable mixed precision training.
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,      # The held-out split is used for per-epoch evaluation
    compute_metrics=compute_metrics,          # Add the compute_metrics function
    callbacks=[SaveWeightsCallback]           # Add the custom callback here
)

# Train the model
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/627 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## DistilBERT

In [None]:
!pip install transformers datasets accelerate evaluate
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value
from transformers import AutoTokenizer, TrainingArguments, Trainer, DistilBertForSequenceClassification
import numpy as np
import evaluate
import torch

In [None]:
# DistilBERT setup: build datasets from the RAW tweet text (not 'cleaned text'),
# tokenize, load the model, and define training arguments and metrics.
# No Trainer is created in this cell.

# Concatenate tweet text and labels (aligned on the shared index), then reset the index
train_data = pd.concat([X_train[['tweet text']], y_train.rename('label')], axis=1).reset_index(drop=True)
test_data = pd.concat([X_test[['tweet text']], y_test.rename('label')], axis=1).reset_index(drop=True)

# Define the features including ClassLabel for the label column
# (class index 0 -> "Trump", class index 1 -> "Not Trump")
features = Features({
    'tweet text': Value(dtype='string', id=None),
    'label': ClassLabel(names=["Trump", "Not Trump"], id=None)
})

# Convert to Hugging Face dataset format with the specified features
train_dataset = Dataset.from_pandas(train_data, features=features)
test_dataset = Dataset.from_pandas(test_data, features=features)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

def tokenize_function(examples):
    """Tokenize a batch of raw tweets with max-length padding and truncation."""
    return tokenizer(examples["tweet text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# train_dataset_save_path = "/content/drive/My Drive/distilbert_trump_classifier/tokenized_train_dataset"
# test_dataset_save_path = "/content/drive/My Drive/distilbert_trump_classifier/tokenized_test_dataset"
# tokenized_train_dataset.save_to_disk(train_dataset_save_path)
# tokenized_test_dataset.save_to_disk(test_dataset_save_path)

# Use DistilBertForSequenceClassification with num_labels=2
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=2)

# Training configuration: 3 epochs, batch size 8, evaluation after every epoch
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Metric objects are loaded once and reused by compute_metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy, precision, recall and F1 for a (logits, labels) pair."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
# In this cell the result is only printed.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Run the SaveWeightsCallback code block

## Use Pre-Trained Model

In [None]:
import requests
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch

def download_file_from_google_drive(file_id, destination):
    """
    Download a file from Google Drive by its id and write it to ``destination``.

    If the first response carries a 'download_warning' cookie, the request is
    repeated with that cookie's value as the ``confirm`` parameter.

    Args:
      file_id (str): The Google Drive file id.
      destination (str): Local path the downloaded bytes are written to.

    Raises:
      requests.HTTPError: If the final response has an error status, so an
        error page is never written to ``destination`` as if it were the file.
    """
    URL = "https://drive.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def get_confirm_token(response):
        # Return the value of the 'download_warning*' cookie, if there is one
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        # Stream the body to disk chunk by chunk instead of loading it in memory
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    # The context manager closes the session (and its connections) on exit
    with requests.Session() as session:
        response = session.get(URL, params={'id': file_id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # Release the first streamed response before issuing the confirmed request
            response.close()
            params = {'id': file_id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        try:
            response.raise_for_status()
            save_response_content(response, destination)
        finally:
            response.close()


def load_model_and_tokenizer(model_url, model_path):
    """
    Download a zipped DistilBERT model, extract it, and load model and tokenizer.

    Args:
      model_url (str): The link for the zipped model in Google Drive.
      model_path (str): Directory the archive is extracted into and the model
        files are loaded from. Created if it does not exist.

    Returns:
      model: The loaded DistilBERT model.
      tokenizer: The loaded DistilBERT tokenizer.
    """
    # Imported here because the notebook's import cells provide neither
    # os / zipfile nor DistilBertTokenizer.
    import os
    import zipfile

    from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

    def download_model_files(url, output_path):
        """
        Download and extract model files from a URL.

        Args:
          url (str): URL to the zip file containing model files.
          output_path (str): Path to extract the downloaded zip file.
        """
        # Ensure the output directory exists
        os.makedirs(output_path, exist_ok=True)

        zip_path = os.path.join(output_path, 'model.zip')

        # Download the zip file
        gdown.download(url, zip_path, quiet=False)

        try:
            # Extract the zip file
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(output_path)
        finally:
            # Remove the zip file even when extraction fails
            if os.path.exists(zip_path):
                os.remove(zip_path)

    # Download and extract model files
    download_model_files(model_url, model_path)

    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    model = DistilBertForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer


def bert_predict(model, tokenizer, df, column_name='tweet text', batch_size=128):
    """
    Predict the labels for the tweet texts in the specified column of the DataFrame in batches.

    The model is switched to evaluation mode for the duration of the call (so
    dropout does not randomize predictions) and restored to its previous mode
    afterwards. Inputs are moved to the device the model's parameters live on,
    and predictions are moved back to the CPU before being converted to a list.

    Args:
    model: The sequence-classification model; its output must expose ``.logits``.
    tokenizer: The tokenizer matching the model.
    df (pd.DataFrame): The DataFrame containing the tweet texts.
    column_name (str): The name of the column containing tweet texts.
    batch_size (int): The number of samples per batch.

    Returns:
    List[int]: The list of predicted labels, in row order.
    """
    # Run on whichever device holds the model's parameters (CPU if it has none)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Materialize the texts once; list slicing is positional regardless of df's index
    texts = df[column_name].tolist()

    predictions = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
                tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

                # Make Predictions
                outputs = model(**tokens)
                batch_predictions = torch.argmax(outputs.logits, dim=1)
                predictions.extend(batch_predictions.cpu().tolist())
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)

    return predictions