In [3]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import set_seed

# Load the CSV file
data = pd.read_csv("trim.csv")  # Replace with your file name
missing_columns = {"title", "label"} - set(data.columns)
assert not missing_columns, f"CSV is missing required columns: {sorted(missing_columns)}"

# Seed every RNG involved, not just Python's `random`: the classification head
# built further down is randomly initialised (see the "newly initialized"
# warning in the output), so it must be seeded before the model is created
# for runs to be repeatable. `set_seed` covers random, NumPy and PyTorch.
SEED = 42
set_seed(SEED)

# Shuffle the dataset (explicit random_state keeps the row order reproducible)
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Manually split the dataset: [0, TRAIN_END) train, [TRAIN_END, VAL_END) validation,
# everything after VAL_END is the test set.
TRAIN_END = 3500
VAL_END = 4000
# Without this check a shorter CSV would silently produce empty validation/test splits.
assert len(data) > VAL_END, (
    f"Expected more than {VAL_END} rows for a non-empty test split, got {len(data)}"
)
train_data = data.iloc[:TRAIN_END]
val_data = data.iloc[TRAIN_END:VAL_END]
test_data = data.iloc[VAL_END:]

# Tokenization helper applied batch-wise when the splits are converted to Hugging Face Datasets
def preprocess_function(examples, tokenizer):
    """Stringify the batch's titles and tokenize them.

    Args:
        examples: Mapping with a "title" entry holding an iterable of values.
            The entry is overwritten in place with the ``str()`` form of each value.
        tokenizer: Callable invoked as ``tokenizer(titles, truncation=True, padding=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the stringified titles.
    """
    # Coerce every title to text first (non-string values would otherwise reach the tokenizer).
    titles_as_text = list(map(str, examples["title"]))
    examples["title"] = titles_as_text
    return tokenizer(titles_as_text, truncation=True, padding=True)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap each pandas split in a Hugging Face Dataset and tokenize it batch-wise,
# in train -> validation -> test order.
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(split_frame).map(
        lambda batch: preprocess_function(batch, tokenizer),
        batched=True,
    )
    for split_frame in (train_data, val_data, test_data)
]

# Define the model: one output per distinct value found in the "label" column
num_classes = data["label"].nunique()
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)

# Define evaluation metrics
def compute_metrics(pred):
    """Compute accuracy for an evaluation result.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to pick a class) and ``label_ids`` (reference labels).

    Returns:
        ``{"accuracy": value}`` where value is the mean of the element-wise
        match between predicted classes and ``label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = predicted_classes == pred.label_ids
    return {"accuracy": is_correct.mean()}

# Define training arguments (grouped by concern; values unchanged)
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer instance: model + arguments first, then data, then helpers
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate on the held-out test split and report the metrics dictionary
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test results:", test_results)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msaahith[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2758,0.297181,0.9
2,0.1434,0.415332,0.892
3,0.0437,0.551274,0.886


Test results: {'eval_loss': 0.222886323928833, 'eval_accuracy': 0.92, 'eval_runtime': 0.7624, 'eval_samples_per_second': 1311.602, 'eval_steps_per_second': 20.986, 'epoch': 3.0}


In [5]:
# Extract [CLS] representations
def extract_cls_representations(model, dataset, batch_size=32):
    """Collect the last-layer hidden state at sequence position 0 for every example.

    Args:
        model: Module callable as ``model(input_ids=..., attention_mask=...,
            output_hidden_states=True)`` whose result exposes ``hidden_states``;
            it must also provide ``device`` and ``training``.
        dataset: Map-style dataset whose rows are mappings with "input_ids",
            "attention_mask" and "label" entries. Rows may hold plain Python
            lists or tensors, and sequence lengths may differ between rows.
        batch_size: Number of rows per forward pass.

    Returns:
        Tuple ``(cls_representations, labels)``: a NumPy array with one row per
        example, and a list of the corresponding labels in dataset order.

    Raises:
        ValueError: From ``np.concatenate`` when ``dataset`` is empty.
    """
    # Prefer the model's own pad id when it advertises one; padded positions are
    # masked out via attention_mask=0 either way.
    pad_id = getattr(getattr(model, "config", None), "pad_token_id", None) or 0

    def _collate(rows):
        """Pad a list of rows to a common length and stack them into tensors."""

        def _padded(key, fill):
            sequences = [torch.as_tensor(row[key], dtype=torch.long) for row in rows]
            return torch.nn.utils.rnn.pad_sequence(
                sequences, batch_first=True, padding_value=fill
            )

        return {
            "input_ids": _padded("input_ids", pad_id),
            "attention_mask": _padded("attention_mask", 0),
            "label": torch.as_tensor([row["label"] for row in rows]),
        }

    # The default collate function turns list-valued columns into a list of
    # per-position tensors (so ``.to(...)`` fails) and cannot batch rows of
    # unequal length; the explicit collate_fn avoids both problems.
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, collate_fn=_collate
    )
    cls_representations = []
    labels = []

    # Dropout must be disabled for deterministic embeddings; restore the
    # caller's train/eval mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                inputs = {
                    key: batch[key].to(model.device)
                    for key in ["input_ids", "attention_mask"]
                }
                outputs = model(**inputs, output_hidden_states=True)
                cls_tokens = outputs.hidden_states[-1][:, 0, :]  # Extract [CLS] representation
                cls_representations.append(cls_tokens.cpu().numpy())
                labels.extend(batch["label"].numpy())
    finally:
        model.train(was_training)

    cls_representations = np.concatenate(cls_representations, axis=0)
    return cls_representations, labels


# Reduce dimensions and plot
def plot_pca(embeddings, labels, save_path="BERT_PCA.png"):
    """Scatter-plot the first two principal components of ``embeddings``.

    Args:
        embeddings: Array passed to ``PCA.fit_transform``; expected to be 2-D
            (one row per sample), since the result is indexed as ``reduced[:, 0]``.
        labels: Per-sample values used to colour the points (``hue``).
        save_path: File the figure is written to before it is shown.
    """
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=reduced[:, 0], y=reduced[:, 1], hue=labels, palette="tab10", alpha=0.7, ax=ax
    )
    ax.set_title("PCA of [CLS] Representations")
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    ax.legend(loc="best", title="Label")
    # The matplotlib API is `savefig`; `save_fig` does not exist and raised
    # AttributeError. Save before show() so the written image is not blank.
    fig.savefig(save_path)
    plt.show()

    
def plot_tsne(embeddings, labels, perplexity=30, save_path="BERT_t-SNE.png"):
    """Scatter-plot a 2-D t-SNE embedding of ``embeddings``.

    Args:
        embeddings: Array passed to ``TSNE.fit_transform``; expected to be 2-D
            (one row per sample), since the result is indexed as ``reduced[:, 0]``.
        labels: Per-sample values used to colour the points (``hue``).
        perplexity: Forwarded to ``TSNE``.
        save_path: File the figure is written to before it is shown.
    """
    # Fixed random_state keeps the layout repeatable between runs.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    reduced = tsne.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x=reduced[:, 0], y=reduced[:, 1], hue=labels, palette="tab10", alpha=0.7, ax=ax
    )
    ax.set_title("t-SNE of [CLS] Representations")
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")
    ax.legend(loc="best", title="Label")
    # The matplotlib API is `savefig`; `save_fig` does not exist and raised
    # AttributeError. Save before show() so the written image is not blank.
    fig.savefig(save_path)
    plt.show()

In [4]:
# Embed the test split once, then render it with each projection in turn.
print("Extracting [CLS] representations...")
cls_representations, cls_labels = extract_cls_representations(model, test_dataset)

for banner, render in (
    ("Visualizing with PCA...", plot_pca),
    ("Visualizing with t-SNE...", plot_tsne),
):
    print(banner)
    render(cls_representations, cls_labels)

Extracting [CLS] representations...


NameError: name 'extract_cls_representations' is not defined