In [None]:
# Pick the compute device: the GPU when a CUDA runtime is visible, the CPU otherwise
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Install the necessary libraries
!pip install transformers torch scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, GPT2ForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch

In [None]:
# Read the labelled text corpus from the CSV file in the working directory
csv_path = 'mytextdata.csv'
df = pd.read_csv(csv_path)

In [None]:
# Assign each distinct string label an integer id, numbered in order of first appearance
unique_labels = df['Label'].unique()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))

# Store the integer ids next to the original string labels
df['labels'] = df['Label'].map(label_to_id)

In [None]:
# Split the dataset into training (80%) and validation (20%) sets.
# A fixed random_state makes the split (and every metric computed from it) reproducible
# across kernel restarts; stratifying on the integer labels keeps the class proportions
# the same in both partitions.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels'],
)

In [None]:
# Load the pre-trained GPT-2 tokenizer and a GPT-2 model with a sequence-classification head
model_name = "gpt2"  # Change this to "gpt2-medium" or "gpt2-large" for better performance
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token; reuse the end-of-sequence token so batches can be padded
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),          # one output logit per distinct label
    # The classification head uses pad_token_id to find the last non-padding token of
    # each row; without it the model cannot process padded batches.
    pad_token_id=tokenizer.pad_token_id,
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
from datasets import Dataset
from transformers import GPT2Tokenizer

# Fetch the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 defines no padding token of its own, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the 'Text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: a batch (mapping of column name -> values) that has a 'Text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples['Text']
    encoded = tokenizer(texts, truncation=True, max_length=512, padding='max_length')
    return encoded

def _to_tokenized_dataset(frame):
    """Turn a DataFrame split into a tokenized Hugging Face dataset.

    Args:
        frame: DataFrame holding a 'Text' column and an integer 'labels' column.

    Returns:
        A `Dataset` with the tokenizer outputs added and the target column named 'label'.
    """
    # preserve_index=False: the split frames carry a shuffled pandas index that would
    # otherwise be copied into the dataset as an extra, meaningless column.
    dataset = Dataset.from_pandas(frame[['Text', 'labels']], preserve_index=False)

    # Tokenize in batches
    dataset = dataset.map(tokenize_function, batched=True)

    # The integer targets are exposed under the column name 'label'
    return dataset.rename_column('labels', 'label')


# Build both splits through the same pipeline so they cannot drift apart
train_dataset = _to_tokenized_dataset(train_df)
val_dataset = _to_tokenized_dataset(val_df)


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [None]:
!pip install --upgrade transformers



In [None]:
from transformers import GPT2Tokenizer, Trainer, TrainingArguments, GPT2ForSequenceClassification
from sklearn.metrics import classification_report
import torch

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 has no padding token of its own; reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Build the classifier in a single load. Passing the config overrides straight to
# from_pretrained avoids loading the whole checkpoint a second time just to read its config.
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=len(label_to_id),           # one output logit per distinct label
    pad_token_id=tokenizer.pad_token_id,   # lets the head skip padding when pooling
)

# Kept so that any later code referring to `model_config` still finds the effective config
model_config = model.config

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the evaluation metric
def compute_metrics(p):
    """Compute per-class and aggregate classification metrics as a flat dict of scalars.

    Args:
        p: evaluation result exposing `predictions` (per-class scores, one row per
           example) and `label_ids` (the true integer labels).

    Returns:
        dict mapping metric name to a single number, e.g. '0_precision', 'accuracy',
        'macro_avg_f1-score'. The report from scikit-learn is nested (one dict per class
        or average); it is flattened here because the Trainer can only log scalar values
        and otherwise drops the nested entries with a warning.
    """
    preds = p.predictions.argmax(axis=1)
    report = classification_report(p.label_ids, preds, output_dict=True)

    metrics = {}
    for name, value in report.items():
        key = name.replace(" ", "_")   # 'macro avg' -> 'macro_avg'
        if isinstance(value, dict):
            # Per-class / per-average block: precision, recall, f1-score, support
            for stat, score in value.items():
                metrics[f"{key}_{stat}"] = score
        else:
            # Already a scalar (e.g. overall accuracy)
            metrics[key] = value
    return metrics

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): the recorded run finished after 430 optimizer steps, so with
    # save_steps=1000 no intermediate checkpoint is written — confirm this is intended.
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=8,     # small per-step batch to fit in GPU memory
    per_device_eval_batch_size=32,
    # 10 epochs requested; the recorded run reported epoch ~9.78 at its final step,
    # presumably because the batches per epoch are not a multiple of the accumulation factor.
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    gradient_accumulation_steps=4,     # effective batch size per device = 8 * 4 = 32
    # Mixed precision needs a CUDA device; fall back to full precision on CPU so the
    # notebook still runs when no GPU is attached.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # No evaluation schedule is set in the training arguments, so this is only invoked
    # when trainer.evaluate() / trainer.predict() is called explicitly.
    compute_metrics=compute_metrics,
    # `processing_class` is the replacement for the deprecated `tokenizer=` argument
    # in recent transformers releases.
    processing_class=tokenizer,
)

# Train the model; the returned TrainOutput is displayed as the cell result
trainer.train()



Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msajeeb_ai[0m ([33msajeeb_ai-brac-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,1.8188
200,1.2569
300,1.0442
400,0.906


TrainOutput(global_step=430, training_loss=1.2265492505805438, metrics={'train_runtime': 584.7063, 'train_samples_per_second': 23.944, 'train_steps_per_second': 0.735, 'total_flos': 3576726804234240.0, 'train_loss': 1.2265492505805438, 'epoch': 9.777142857142858})

In [None]:
# Run one evaluation pass over the trainer's validation set now that training has finished;
# metric names in the returned dict carry the "eval" prefix.
results = trainer.evaluate(metric_key_prefix="eval")

Trainer is attempting to log a value of "{'precision': 0.7346938775510204, 'recall': 0.4931506849315068, 'f1-score': 0.5901639344262295, 'support': 73.0}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7619047619047619, 'recall': 0.64, 'f1-score': 0.6956521739130435, 'support': 75.0}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.5301204819277109, 'recall': 0.6875, 'f1-score': 0.5986394557823129, 'support': 64.0}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8392857142857143, 'recall': 0.6811594202898551, 'f1-score

In [None]:
# Show the evaluation metrics under a heading line
print("Evaluation results:", results, sep="\n")

Evaluation results:
{'eval_loss': 0.7634968757629395, 'eval_0': {'precision': 0.7346938775510204, 'recall': 0.4931506849315068, 'f1-score': 0.5901639344262295, 'support': 73.0}, 'eval_1': {'precision': 0.7619047619047619, 'recall': 0.64, 'f1-score': 0.6956521739130435, 'support': 75.0}, 'eval_2': {'precision': 0.5301204819277109, 'recall': 0.6875, 'f1-score': 0.5986394557823129, 'support': 64.0}, 'eval_3': {'precision': 0.8392857142857143, 'recall': 0.6811594202898551, 'f1-score': 0.752, 'support': 69.0}, 'eval_4': {'precision': 0.6262626262626263, 'recall': 0.8985507246376812, 'f1-score': 0.7380952380952381, 'support': 69.0}, 'eval_accuracy': 0.6771428571428572, 'eval_macro avg': {'precision': 0.6984534923863668, 'recall': 0.6800721659718085, 'f1-score': 0.6749101604433647, 'support': 350.0}, 'eval_weighted avg': {'precision': 0.7023601615579732, 'recall': 0.6771428571428572, 'f1-score': 0.6753867910149498, 'support': 350.0}, 'eval_runtime': 4.4218, 'eval_samples_per_second': 79.153, 

In [None]:
# Generate predictions on the validation dataset
predictions = trainer.predict(val_dataset)

# Extract the predicted class ids (highest-scoring class per example)
predicted_labels = predictions.predictions.argmax(axis=1)

# Use the integer labels returned alongside the predictions as ground truth: they are
# guaranteed to be in the same order as the predictions. val_df['Label'] is deliberately
# NOT re-mapped in place here — the integer ids already exist in the 'labels' column, and
# mapping the string column onto itself would turn it into NaN if this cell ran twice.
true_labels = predictions.label_ids

# Pin the id -> name correspondence explicitly so each report row is labelled correctly
# even if some class does not occur in the validation split.
class_names = [str(name) for name in label_to_id.keys()]
class_ids = list(label_to_id.values())

# Print classification report for the validation set
print("Classification report on the validation set:")
print(classification_report(true_labels, predicted_labels, labels=class_ids, target_names=class_names))

Classification report on the validation set:
                   precision    recall  f1-score   support

    agreeableness       0.73      0.49      0.59        73
      neuroticism       0.76      0.64      0.70        75
         openness       0.53      0.69      0.60        64
     extroversion       0.84      0.68      0.75        69
conscientiousness       0.63      0.90      0.74        69

         accuracy                           0.68       350
        macro avg       0.70      0.68      0.67       350
     weighted avg       0.70      0.68      0.68       350

