In [57]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the CSV file
data = pd.read_csv("trim.csv")  # Replace with your file name
assert "title" in data.columns and "label" in data.columns, \
    "trim.csv must provide 'title' and 'label' columns"

# Split configuration: the first TRAIN_SIZE shuffled rows are used for training,
# the next VAL_SIZE rows for validation, and every remaining row for testing.
SEED = 42
TRAIN_SIZE = 3500
VAL_SIZE = 500

# Fail fast: with too few rows the positional slices below would silently
# produce a short validation split or an empty test split.
assert len(data) > TRAIN_SIZE + VAL_SIZE, (
    f"Need more than {TRAIN_SIZE + VAL_SIZE} rows to build train/val/test splits, "
    f"got {len(data)}"
)

# Shuffle the dataset (the shuffle itself is driven by random_state)
random.seed(SEED)
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Manually split the dataset by position
train_data = data.iloc[:TRAIN_SIZE]
val_data = data.iloc[TRAIN_SIZE:TRAIN_SIZE + VAL_SIZE]
test_data = data.iloc[TRAIN_SIZE + VAL_SIZE:]

# Convert to Hugging Face Dataset
def preprocess_function(examples, tokenizer):
    """Tokenize the ``"title"`` field of a batch.

    Every title is first converted with ``str`` and the converted list is
    written back into ``examples["title"]``, so the batch is modified in place.

    Args:
        examples: Mapping-like batch holding a ``"title"`` sequence.
        tokenizer: Callable applied to the list of titles with
            ``truncation=True`` and ``padding=True``.

    Returns:
        Whatever ``tokenizer`` returns for the coerced titles.
    """
    # Non-string entries are stringified before they reach the tokenizer.
    examples["title"] = list(map(str, examples["title"]))
    encoded = tokenizer(examples["title"], padding=True, truncation=True)
    return encoded

# Tokenizer for the same checkpoint the classifier below is loaded from
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _to_tokenized_dataset(frame):
    """Wrap a pandas split in a Hugging Face ``Dataset`` and tokenize it in batches."""
    hf_split = Dataset.from_pandas(frame)
    return hf_split.map(lambda batch: preprocess_function(batch, tokenizer), batched=True)


train_dataset = _to_tokenized_dataset(train_data)
val_dataset = _to_tokenized_dataset(val_data)
test_dataset = _to_tokenized_dataset(test_data)

# Define the model: the number of labels is the count of distinct values
# in the full frame's 'label' column
num_labels = data['label'].nunique()
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Define evaluation metrics
def compute_metrics(pred):
    """Compute the accuracy of a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose last axis is arg-maxed to obtain the
            predicted class).

    Returns:
        ``{"accuracy": <fraction of positions where label and prediction agree>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = pred.label_ids == predicted_classes
    return {"accuracy": is_correct.mean()}

# Training configuration, gathered in one mapping so the knobs are easy to scan.
training_config = {
    # where checkpoints and logs go
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 100,
    # evaluate and checkpoint on the same (per-epoch) schedule
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # optimisation settings
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "num_train_epochs": 1,
    "weight_decay": 0.01,
    # best-checkpoint selection is driven by the metric returned by compute_metrics
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
}
training_args = TrainingArguments(**training_config)

# Trainer wired to the train/validation splits prepared above
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Score the held-out test split and report the resulting metrics
test_results = trainer.evaluate(test_dataset)
print("Test results:", test_results)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Count how often each label occurs in the training split
train_label_distribution = train_data['label'].value_counts()

# Count how often each label occurs in the validation split
val_label_distribution = val_data['label'].value_counts()

# Count how often each label occurs in the test split
test_label_distribution = test_data['label'].value_counts()


# Display the distributions
print("Train Data Label Distribution:")
print(train_label_distribution)

print("\nValidation Data Label Distribution:")
print(val_label_distribution)

# "\n" (newline), matching the validation header; the previous "\T" was an
# invalid escape sequence that printed a literal backslash.
print("\nTest Data Label Distribution:")
print(test_label_distribution)


In [None]:
# Summarise which Python types occur in the test titles.
# This deliberately avoids a loop variable called `data`: that name is the
# module-level DataFrame, and rebinding it would break any later re-run of
# cells that read `data`. A per-type count also replaces one printed line per row.
title_type_counts = test_data['title'].map(type).value_counts()
print(title_type_counts)

In [None]:
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from transformers import BertModel

In [None]:
# Function to extract embeddings for the test dataset
def extract_embeddings(dataset=None, text_column="text", max_length=512):
    """Collect the final-layer [CLS] embedding and the label of every sample.

    Each sample is tokenized on its own with the module-level ``tokenizer`` and
    run through the module-level ``model`` with hidden states enabled; the
    first-token vector of the last hidden state is kept.

    Args:
        dataset: Iterable of mapping-like samples providing ``text_column`` and
            ``"label"``. Defaults to the module-level ``test_dataset``.
        text_column: Key of the field to embed. Defaults to ``"text"``.
            NOTE(review): the classifier in this notebook is tokenized on
            ``"title"`` and only ``"title"``/``"label"`` are asserted to exist
            in the CSV - confirm ``"text"`` is the intended field, or pass
            ``text_column="title"``.
        max_length: Length every input is truncated/padded to.

    Returns:
        Tuple ``(embeddings, labels)``: a list with one NumPy vector per sample
        and an index-aligned list of ``int`` labels.
    """
    if dataset is None:
        dataset = test_dataset

    embeddings = []
    labels = []

    # Follow the model's own device instead of hard-coding "cuda", which fails
    # on CPU-only machines or when the model lives on a different device.
    device = model.device

    # Dropout must be disabled for stable embeddings; the model can still be in
    # training mode (e.g. after an interrupted training run). The previous mode
    # is restored afterwards so later training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for sample in dataset:
                # str() mirrors the coercion applied to titles before training
                # tokenization, so non-string values do not crash the tokenizer.
                inputs = tokenizer(
                    str(sample[text_column]),
                    return_tensors="pt",
                    truncation=True,
                    padding="max_length",
                    max_length=max_length,
                ).to(device)
                outputs = model(**inputs, output_hidden_states=True)
                # Last layer, first token ([CLS]); drop the batch axis of size 1.
                cls_embedding = outputs["hidden_states"][-1][:, 0, :].squeeze(0).cpu().numpy()
                embeddings.append(cls_embedding)
                labels.append(int(sample["label"]))  # Store labels for visualization
    finally:
        model.train(was_training)

    return embeddings, labels

In [None]:
# Per-sample [CLS] vectors and labels, produced by extract_embeddings()
test_embeddings, test_labels = extract_embeddings()

# Project the embeddings onto two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(test_embeddings)

# The colouring below only makes sense for a two-class problem
assert len(set(test_labels)) == 2, "Test labels must have exactly two classes for this visualization."

# One colour per class; assumes the labels are exactly 0 and 1 (any other value raises KeyError)
class_colors = {0: 'red', 1: 'blue'}  # Adjust these colors as needed
point_colors = list(map(class_colors.__getitem__, test_labels))

# Scatter plot of the 2-D projection, drawn through an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(pca_result[:, 0], pca_result[:, 1], c=point_colors, alpha=0.7)

# Proxy markers so the legend shows one entry per class.
# NOTE(review): the class-1 label renders with two spaces before the dash
# ("Class 1  - Fake") while class 0 has one - left untouched here.
handles = [
    plt.Line2D(
        [0], [0],
        marker='o',
        color='w',
        markersize=10,
        markerfacecolor=col,
        label=f'Class {cls} {" - Fake" if cls == 1 else "- Real"}',
    )
    for cls, col in class_colors.items()
]
ax.legend(handles=handles, title="Classes")
ax.set_title("PCA Visualization of [CLS] Token Embeddings")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
fig.savefig("PCA_Bert_Viz_Classes.png")
plt.show()

In [None]:
import numpy as np

# The colouring below only makes sense for a two-class problem
assert len(set(test_labels)) == 2, "Test labels must have exactly two classes for this visualization."

# Embed the [CLS] vectors in two dimensions with t-SNE (fixed seed for repeatability)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embedding_matrix = np.array(test_embeddings)
tsne_result = tsne.fit_transform(embedding_matrix)

# One colour per class; assumes the labels are exactly 0 and 1 (any other value raises KeyError)
class_colors = {0: 'red', 1: 'blue'}  # Adjust these colors as needed
point_colors = list(map(class_colors.__getitem__, test_labels))

# Scatter plot of the 2-D embedding, drawn through an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c=point_colors, alpha=0.7)

# Proxy markers so the legend shows one entry per class.
# NOTE(review): the class-1 label renders with two spaces before the dash
# ("Class 1  - Fake") while class 0 has one - left untouched here.
handles = [
    plt.Line2D(
        [0], [0],
        marker='o',
        color='w',
        markersize=10,
        markerfacecolor=col,
        label=f'Class {cls} {" - Fake" if cls == 1 else "- Real"}',
    )
    for cls, col in class_colors.items()
]
ax.legend(handles=handles, title="Classes")
ax.set_title("t-SNE Visualization of [CLS] Token Embeddings")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
fig.savefig("t-SNE_BERT_Viz_Classes.png")
plt.show()
