# Twitter Sentiment Analysis on Datasets for Low-Resource Languages - Nigerian Languages Case Study with Pre-trained Models

Download the train and test datasets from the repository below and place them in the same directory as this notebook:
https://github.com/hausanlp/NaijaSenti/tree/main/data/annotated_tweets

For example, for the Yoruba language:
  Train dataset: https://github.com/hausanlp/NaijaSenti/blob/main/data/annotated_tweets/yor/train.tsv
  Test dataset: https://github.com/hausanlp/NaijaSenti/blob/main/data/annotated_tweets/yor/test.tsv

In [None]:
!pip install accelerate -U
!pip install transformers torch sklearn

## Setup and Load Data

In [6]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

def load_data(file_path):
    """Read a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the TSV file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path, sep='\t')
    return frame

# Load the train and test splits (in that order) from the notebook's working directory.
train_df, test_df = (load_data(split_file) for split_file in ('train.tsv', 'test.tsv'))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Model: camembert-base

In [None]:

# Tokenization and Model Setup for camembert-base
tokenizer = AutoTokenizer.from_pretrained('camembert-base')
model = AutoModelForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

# Prepare dataset
train_encodings = tokenizer(train_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)

# Binary target: 1 for 'positive', 0 for every other label value.
# NOTE(review): any label other than 'positive' is folded into class 0 -- confirm this
# is intended if the label column holds more than two classes.
train_labels = (train_df['label'] == 'positive').astype(int).tolist()
test_labels = (test_df['label'] == 'positive').astype(int).tolist()

# Trainer's default collator needs dict-like examples whose keys match the model's
# forward() arguments; a TensorDataset yields plain tuples, which makes training fail.
# Keeping every tokenizer output (not only input_ids) also passes the attention mask,
# so padding tokens are ignored by the model.
train_dataset = [
    {**{name: values[i] for name, values in train_encodings.items()}, 'labels': label}
    for i, label in enumerate(train_labels)
]
test_dataset = [
    {**{name: values[i] for name, values in test_encodings.items()}, 'labels': label}
    for i, label in enumerate(test_labels)
]

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results_camembert-base',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_camembert-base',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and Evaluate
trainer.train()
# Score the fine-tuned model on the test split; the metrics dict is the cell output.
trainer.evaluate()


## Model: bert-base-cased

In [None]:

# Tokenization and Model Setup for bert-base-cased
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

# Prepare dataset
train_encodings = tokenizer(train_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)

# Binary target: 1 for 'positive', 0 for every other label value.
# NOTE(review): any label other than 'positive' is folded into class 0 -- confirm this
# is intended if the label column holds more than two classes.
train_labels = (train_df['label'] == 'positive').astype(int).tolist()
test_labels = (test_df['label'] == 'positive').astype(int).tolist()

# Trainer's default collator needs dict-like examples whose keys match the model's
# forward() arguments; a TensorDataset yields plain tuples, which makes training fail.
# Keeping every tokenizer output (not only input_ids) also passes the attention mask
# and token type ids, so padding tokens are ignored by the model.
train_dataset = [
    {**{name: values[i] for name, values in train_encodings.items()}, 'labels': label}
    for i, label in enumerate(train_labels)
]
test_dataset = [
    {**{name: values[i] for name, values in test_encodings.items()}, 'labels': label}
    for i, label in enumerate(test_labels)
]

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results_bert-base-cased',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_bert-base-cased',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and Evaluate
trainer.train()
# Score the fine-tuned model on the test split; the metrics dict is the cell output.
trainer.evaluate()


## Model: xlnet-base-cased

In [None]:

# Tokenization and Model Setup for xlnet-base-cased
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

# Prepare dataset
train_encodings = tokenizer(train_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)

# Binary target: 1 for 'positive', 0 for every other label value.
# NOTE(review): any label other than 'positive' is folded into class 0 -- confirm this
# is intended if the label column holds more than two classes.
train_labels = (train_df['label'] == 'positive').astype(int).tolist()
test_labels = (test_df['label'] == 'positive').astype(int).tolist()

# Trainer's default collator needs dict-like examples whose keys match the model's
# forward() arguments; a TensorDataset yields plain tuples, which makes training fail.
# Keeping every tokenizer output (not only input_ids) also passes the attention mask
# and token type ids, so padding tokens are ignored by the model.
train_dataset = [
    {**{name: values[i] for name, values in train_encodings.items()}, 'labels': label}
    for i, label in enumerate(train_labels)
]
test_dataset = [
    {**{name: values[i] for name, values in test_encodings.items()}, 'labels': label}
    for i, label in enumerate(test_labels)
]

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results_xlnet-base-cased',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_xlnet-base-cased',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and Evaluate
trainer.train()
# Score the fine-tuned model on the test split; the metrics dict is the cell output.
trainer.evaluate()


## Model: albert-base-v2

In [None]:

# Tokenization and Model Setup for albert-base-v2
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
model = AutoModelForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Prepare dataset
train_encodings = tokenizer(train_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)

# Binary target: 1 for 'positive', 0 for every other label value.
# NOTE(review): any label other than 'positive' is folded into class 0 -- confirm this
# is intended if the label column holds more than two classes.
train_labels = (train_df['label'] == 'positive').astype(int).tolist()
test_labels = (test_df['label'] == 'positive').astype(int).tolist()

# Trainer's default collator needs dict-like examples whose keys match the model's
# forward() arguments; a TensorDataset yields plain tuples, which makes training fail.
# Keeping every tokenizer output (not only input_ids) also passes the attention mask
# and token type ids, so padding tokens are ignored by the model.
train_dataset = [
    {**{name: values[i] for name, values in train_encodings.items()}, 'labels': label}
    for i, label in enumerate(train_labels)
]
test_dataset = [
    {**{name: values[i] for name, values in test_encodings.items()}, 'labels': label}
    for i, label in enumerate(test_labels)
]

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results_albert-base-v2',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_albert-base-v2',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and Evaluate
trainer.train()
# Score the fine-tuned model on the test split; the metrics dict is the cell output.
trainer.evaluate()


## Model: distilroberta-base

In [None]:

# Tokenization and Model Setup for distilroberta-base
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

# Prepare dataset
train_encodings = tokenizer(train_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['tweet'].tolist(), truncation=True, padding=True, max_length=128)

# Binary target: 1 for 'positive', 0 for every other label value.
# NOTE(review): any label other than 'positive' is folded into class 0 -- confirm this
# is intended if the label column holds more than two classes.
train_labels = (train_df['label'] == 'positive').astype(int).tolist()
test_labels = (test_df['label'] == 'positive').astype(int).tolist()

# Trainer's default collator needs dict-like examples whose keys match the model's
# forward() arguments; a TensorDataset yields plain tuples, which makes training fail.
# Keeping every tokenizer output (not only input_ids) also passes the attention mask,
# so padding tokens are ignored by the model.
train_dataset = [
    {**{name: values[i] for name, values in train_encodings.items()}, 'labels': label}
    for i, label in enumerate(train_labels)
]
test_dataset = [
    {**{name: values[i] for name, values in test_encodings.items()}, 'labels': label}
    for i, label in enumerate(test_labels)
]

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results_distilroberta-base',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_distilroberta-base',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and Evaluate
trainer.train()
# Score the fine-tuned model on the test split; the metrics dict is the cell output.
trainer.evaluate()
