In [15]:
!pip install transformers datasets scikit-learn -q

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from google.colab import files


In [16]:
# Run this cell to upload files
from google.colab import files

# Upload training.csv, testing.csv
uploaded = files.upload()


Saving training.csv to training (1).csv
Saving testing.csv to testing (1).csv


In [17]:
import zipfile
import os

import pandas as pd

train_df = pd.read_csv("training.csv")
test_df = pd.read_csv("testing.csv")

# Preview
train_df.head()


print("\n📝 Training Set Sample:")
print(train_df.head())


📝 Training Set Sample:
  file name     label                                   Original English  \
0   100.jpg  positive  This is my Valentine's from 1 of my nephews. I...   
1  1001.jpg  positive          Zoe's first love #Rattled @JohnnyHarper15   
2  1002.jpg  positive  Chaotic Love - giclee print ?65 at #art #love ...   
3  1009.jpg  positive  We are so #excited to announce that we have la...   
4  1018.jpg  positive  Found this cool photo, not mine My ? brothers ...   

                Phrase (EN)                  Translated (ES)  \
0                 Valentine                        Valentine   
1                first love                      primer amor   
2              giclee print              Impresión de giclee   
3  affiliate program please  programa de afiliados por favor   
4                   mine My                          Mia Mia   

                                     Spanglish Tweet  
0  This is my Valentine's from 1 of my nephews. I...  
1         Zoe's primer a

In [18]:
# --- 4. Prepare for BERT (Text Only) ---
# Only keep Spanglish Tweet and label
train_df = train_df[['Spanglish Tweet', 'label']].rename(columns={"Spanglish Tweet": "text"})
test_df = test_df[['Spanglish Tweet', 'label']].rename(columns={"Spanglish Tweet": "text"})

# Encode labels
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
train_df['label'] = train_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

In [19]:
# --- 5. Tokenize ---
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/3181 [00:00<?, ? examples/s]

Map:   0%|          | 0/801 [00:00<?, ? examples/s]

In [20]:
# --- 6. Load BERT ---
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# --- 7. Training ---

!pip install --upgrade transformers -q
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics

)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7413,0.80199,0.661673


In [None]:
preds = trainer.predict(test_dataset)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=1)

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, target_names=label_mapping.keys()))

# --- 9. Confusion Matrix ---
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(label_mapping.keys()))
disp.plot(cmap="Blues", xticks_rotation=45)
plt.title("Confusion Matrix")
plt.show()
