<a href="https://colab.research.google.com/github/prrmzz/NLP-US-Election-RoBERT/blob/main/US_ElectionRoBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets pandas scikit-learn torch



In [None]:
# Mount Google Drive so files under it can be read and written from this runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# The three CSV splits sit side by side in one Drive folder.
data_dir = "/content/drive/My Drive/2024 U.S. Election Sentiment on X"
train_path = f"{data_dir}/train.csv"
val_path = f"{data_dir}/val.csv"
test_path = f"{data_dir}/test.csv"

In [None]:
# Read the train / validation / test splits, then preview the first training rows.
train_data, val_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)
print(train_data.head())

   tweet_id     user_handle            timestamp  \
0         1        @user123  2024-11-03 08:45:00   
1         2    @politicsFan  2024-11-03 09:15:23   
2         3  @greenAdvocate  2024-11-03 10:05:45   
3         4     @indieVoice  2024-11-03 11:20:10   
4         5   @libertyLover  2024-11-03 12:35:55   

                                          tweet_text       candidate  \
0  Excited to see Kamala Harris leading the Democ...   Kamala Harris   
1  Donald Trump's policies are the best for our e...    Donald Trump   
2  Jill Stein's environmental plans are exactly w...      Jill Stein   
3  Robert Kennedy offers a fresh perspective outs...  Robert Kennedy   
4  Chase Oliver's libertarian stance promotes tru...    Chase Oliver   

               party  retweets  likes sentiment  
0   Democratic Party       120    450  positive  
1   Republican Party        85    300  positive  
2        Green Party        60    200  positive  
3        Independent        40    150   neutral  
4  L

In [None]:
# Discard rows without a sentiment label in every split.
train_data, val_data, test_data = [
    frame.dropna(subset=["sentiment"])
    for frame in (train_data, val_data, test_data)
]

In [None]:
# Sanity check on the training labels: remaining null count, then the distinct values.
train_sentiment = train_data["sentiment"]
print(train_sentiment.isna().sum())
print(train_sentiment.unique())

0
['positive' 'neutral' 'negative' 'positive  ']


In [None]:
# Strip stray whitespace from the sentiment labels of *every* split, not just
# train: a padded value such as 'positive  ' in val/test would otherwise miss
# the label mapping, become NaN, and break the integer cast that follows.
train_data["sentiment"] = train_data["sentiment"].str.strip()
val_data["sentiment"] = val_data["sentiment"].str.strip()
test_data["sentiment"] = test_data["sentiment"].str.strip()
print(train_data["sentiment"].unique())

['positive' 'neutral' 'negative']


In [None]:
# Encode the sentiment names as integer class ids in a new "label" column
# on each split.
label_map = {"positive": 2, "neutral": 1, "negative": 0}
for split_frame in (train_data, val_data, test_data):
    split_frame["label"] = split_frame["sentiment"].map(label_map).astype(int)

In [None]:
from datasets import Dataset
from transformers import RobertaTokenizer

In [None]:
# Wrap each pandas split in a `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_data, val_data, test_data)
)

In [None]:
# Load the tokenizer that belongs to the checkpoint being fine-tuned.
checkpoint_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_function(examples):
    """Tokenize the ``tweet_text`` field of a batch of examples.

    Each text is truncated and padded to a fixed length of 128 tokens using
    the notebook-level ``tokenizer``.

    Args:
        examples: A batch mapping that contains a ``"tweet_text"`` entry.

    Returns:
        The tokenizer's encoding for the batch.
    """
    tweets = examples["tweet_text"]
    encoded = tokenizer(tweets, max_length=128, padding="max_length", truncation=True)
    return encoded

In [None]:
# Tokenize all three splits in batched mode (train, then validation, then test).
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Register human-readable class names in the model config. Without this the
# saved model only knows generic names, so inference reports "LABEL_2" instead
# of "positive". The ids must stay in sync with the integer encoding used for
# the "label" column (negative=0, neutral=1, positive=2).
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: class_id for class_id, name in id2label.items()}

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and logging cadence; external experiment trackers are disabled.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [None]:
# Bind the model and configuration to the data: train on the training split,
# evaluate on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning; the bare final expression displays the returned training summary.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.415,0.217497
2,0.0966,0.030196
3,0.0553,0.010973


TrainOutput(global_step=96, training_loss=0.2883988929291566, metrics={'train_runtime': 2465.7996, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.039, 'total_flos': 98667531648000.0, 'train_loss': 0.2883988929291566, 'epoch': 3.0})

In [None]:
# Run the trainer's model over the held-out test split.
predictions = trainer.predict(test_dataset=test_dataset)

In [None]:
# For each test example, take the index of the highest-scoring class.
class_scores = predictions.predictions
predicted_labels = class_scores.argmax(axis=1)

In [None]:
# Translate predicted class ids back into sentiment names.
label_map_reverse = {0: "negative", 1: "neutral", 2: "positive"}
predicted_sentiments = []
for class_id in predicted_labels:
    predicted_sentiments.append(label_map_reverse[class_id])

In [None]:
# Store the predicted sentiment names next to the true labels in the test frame.
test_data["predicted_sentiment"] = predicted_sentiments

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Class names listed explicitly and passed as both `labels` and `target_names`,
# so the report rows never depend on scikit-learn's implicit label sorting and
# the call does not fail when a class is absent from this small test split.
class_names = ["negative", "neutral", "positive"]

true_labels = test_data["sentiment"]
predicted_labels = test_data["predicted_sentiment"]
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")
print(
    classification_report(
        true_labels,
        predicted_labels,
        labels=class_names,
        target_names=class_names,
        zero_division=0,  # report 0 (without a warning) for classes with no samples
    )
)

Accuracy: 1.0000
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         3
     neutral       1.00      1.00      1.00        13
    positive       1.00      1.00      1.00        34

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [None]:
from transformers import pipeline

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one Drive
# folder so they can be reloaded together from that path.
save_dir = "/content/drive/My Drive/fine-tuned-roberta-sentiment"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Build a text-classification pipeline from the artifacts saved on Drive.
sentiment_pipeline = pipeline(
    task="text-classification",
    model="/content/drive/My Drive/fine-tuned-roberta-sentiment",
)

Device set to use cpu


In [None]:
# Smoke test: classify one hand-written tweet with the reloaded pipeline and
# print the raw result.
tweet = "Kamala Harris's new policy on education reform is impressive."
result = sentiment_pipeline(tweet)
print(result)

[{'label': 'LABEL_2', 'score': 0.996626615524292}]
