In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Read the sentiment CSV. The "Unnamed: 0" column is removed when present;
# errors="ignore" turns the drop into a no-op when it is absent.
data = pd.read_csv('datasetofsenti.csv').drop(columns=["Unnamed: 0"], errors="ignore")

# Hold out 37.5% of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"],
    data["label"],
    test_size=0.375,
    random_state=42,
)

# Pretrained GPT-2 tokenizer, with the end-of-sequence token reused as the
# padding token so that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch to a fixed length of 128.

    Args:
        examples: Mapping with a ``"text"`` key, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch, truncated and padded to
        ``max_length``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

# Wrap both splits as Hugging Face datasets, then tokenize them in batches.
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Load GPT-2 with a classification head sized to the number of distinct
# label values found in the full dataset.
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=len(data["label"].unique()),
)
# The model must know which token id is padding; mirror the tokenizer's choice.
model.config.pad_token_id = tokenizer.pad_token_id

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training arguments.
# Evaluation and checkpointing both happen once per epoch. The previous
# `save_steps=500` was dropped: it only applies to step-based saving and
# contradicted `save_strategy="epoch"`.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Reload the checkpoint with the best "accuracy" (a key returned by
    # compute_metrics) once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Keep disk usage bounded by limiting the number of retained checkpoints.
    save_total_limit=1,
    run_name="gpt2_sentiment_analysis",
)

# Metrics computation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: A pair that unpacks to ``(logits, labels)``. ``logits`` holds
            per-class scores along its last axis and ``labels`` the
            integer class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    logits, labels = pred
    # Some models hand back a tuple of outputs; by convention the class
    # scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Stay on the host: scikit-learn needs CPU arrays anyway, so moving the
    # logits to the GPU just to take an argmax would be a wasted round trip.
    predictions = torch.as_tensor(logits).argmax(dim=-1).numpy()

    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted"
    )
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Build the Trainer.
# NOTE(review): `test_dataset` is also the evaluation set that drives
# best-checkpoint selection (`load_best_model_at_end=True`), so the figures
# printed below are not measured on data unseen during model selection.
# NOTE(review): the warning fragment in this cell's output points at this
# call; newer transformers releases appear to prefer `processing_class=` over
# `tokenizer=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then run one evaluation pass over `test_dataset` and show the metrics.
trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)

# Compute accuracy on the test set
def compute_accuracy_on_test():
    """Run the trainer over ``test_dataset`` and print its accuracy as a percentage.

    Reads the module-level ``trainer``, ``test_dataset`` and ``test_labels``;
    returns nothing.
    """
    prediction_output = trainer.predict(test_dataset)
    # The highest-scoring class along the last axis is the predicted label.
    predicted_classes = torch.argmax(torch.tensor(prediction_output.predictions), axis=-1)
    acc = accuracy_score(test_labels.to_numpy(), predicted_classes.cpu().numpy())
    print(f"Accuracy on the Test Set: {acc * 100:.2f}%")


compute_accuracy_on_test()


Map:   0%|          | 0/260505 [00:00<?, ? examples/s]

Map:   0%|          | 0/156304 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0476,0.096901,0.940443,0.940233,0.943205,0.940443
2,0.0857,0.093824,0.939317,0.940191,0.945189,0.939317
3,0.0447,0.092556,0.937225,0.937845,0.939277,0.937225


Evaluation Results: {'eval_loss': 0.09690078347921371, 'eval_accuracy': 0.940442982905108, 'eval_f1': 0.9402328886228506, 'eval_precision': 0.9432051619854357, 'eval_recall': 0.940442982905108, 'eval_runtime': 309.1661, 'eval_samples_per_second': 505.566, 'eval_steps_per_second': 31.598, 'epoch': 3.0}


KeyboardInterrupt: 