In [1]:
!pip install transformers dataset accelerate torch

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Downloading SQLAlchemy-1.4.54-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: banal, sqlalchemy, dataset
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 2.0.46
    Uninstalling SQLAlchemy-2.0.46:
      Successfully uninstalled SQLAlchemy-2.0.46
[31mERROR: pi

In [2]:
import torch
from datasets import Dataset

from transformers import (
    GPT2Config, AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)

In [3]:
# Eight finance-themed sentences that make up the language-modelling corpus.
_PRETRAIN_SENTENCES = (
    "The stock market opened higher today due to strong tech earnings.",
    "Investors are cautious about the upcoming inflation report.",
    "Tech stocks rallied significantly in the afternoon session.",
    "The central bank decided to keep interest rates unchanged.",
    "Market volatility has increased over the last quarter.",
    "Analysts predict a bullish trend for the semiconductor sector.",
    "The economy is showing signs of recovery after the recession.",
    "Shareholders voted to approve the merger between the two giants.",
)

# Cycle through the corpus ten times so training runs for more steps.
pretrain_text = [
    sentence
    for _ in range(10)
    for sentence in _PRETRAIN_SENTENCES
]

# (sentence, sentiment) pairs: sentiment 1 marks a positive example, 0 a negative one.
_LABELLED_SENTENCES = (
    ("Stocks rallied significantly.", 1),
    ("The market crashed due to panic.", 0),
    ("Strong earnings reports boosted confidence.", 1),
    ("Inflation fears caused a massive sell-off.", 0),
    ("The bullish trend continues.", 1),
    ("Investors lost money in the downturn.", 0),
)

# One record per example, in the {"text", "label"} layout used for fine-tuning.
finetune_data = [
    {"text": sentence, "label": sentiment}
    for sentence, sentiment in _LABELLED_SENTENCES
]

In [14]:
# Load the GPT-2 tokenizer and use its end-of-sequence token as the padding
# token, since the tokenization step pads every example to a fixed length.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Small GPT-2 configuration: 2 layers, 4 heads, 256-wide embeddings, 128 positions.
# NOTE: this config is not passed to the model loaded below, which starts from the
# published `distilgpt2` checkpoint. To train a small model from scratch with
# these settings, build it with `AutoModelForCausalLM.from_config(config)`.
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=128,
    n_ctx=128,
    n_embd=256,  # GPT2Config spells the embedding-width argument `n_embd`
    n_layer=2,
    n_head=4,
)

# Continue pre-training from the distilgpt2 weights.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Wrap the raw sentences in a Dataset with a single 'text' column.
ds_pretrain = Dataset.from_dict({'text': pretrain_text})
def tokenize_func(examples):
    """Tokenize the ``'text'`` entries of a batch into fixed-length sequences.

    Passes ``examples['text']`` to the notebook-level ``tokenizer`` with
    truncation enabled and padding to ``max_length`` (32), and returns the
    tokenizer's output unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True, max_length=32)
    return encoded

# Tokenize the whole corpus in batches.
tokenizer_pretrain = ds_pretrain.map(tokenize_func, batched=True)

pretrained_args = TrainingArguments(
    output_dir='pretrained_model',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    # No intermediate checkpoints: the run is only about 100 optimisation steps
    # and the final weights are saved explicitly below, so writing a full model
    # copy every 10 steps only costs disk space and time.
    save_strategy='no',
    logging_steps=10,
    seed=42,
)

# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=pretrained_args,
    data_collator=data_collator,
    train_dataset=tokenizer_pretrain,
)

trainer.train()
# Persist the final weights; the fine-tuning step reloads them from this directory.
trainer.save_model('pretrained_model')

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Step,Training Loss
10,3.055583
20,1.717654
30,0.942224
40,0.639717
50,0.452905
60,0.441437
70,0.310822
80,0.332163
90,0.226595
100,0.169436


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Reload the saved 'pretrained_model' weights under a two-class classification head.
model_ft = AutoModelForSequenceClassification.from_pretrained(
    'pretrained_model',
    num_labels=2,
)
# Tell the model which token id is padding, taken from the tokenizer.
model_ft.config.pad_token_id = tokenizer.pad_token_id

# Labelled sentences -> Dataset -> token ids, using the shared tokenize_func.
ds_finetune = Dataset.from_list(finetune_data)
tokenized_finetune = ds_finetune.map(tokenize_func, batched=True)

ft_args = TrainingArguments(
    output_dir='finetuned_model',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    logging_steps=5,
)

trainer_ft = Trainer(model=model_ft, args=ft_args, train_dataset=tokenized_finetune)

# Left as the final expression so the cell displays the returned TrainOutput.
trainer_ft.train()

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

GPT2ForSequenceClassification LOAD REPORT from: pretrained_model
Key          | Status  | 
-------------+---------+-
score.weight | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Step,Training Loss
5,1.420349
10,0.862681
15,0.23843


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=15, training_loss=0.8404865105946858, metrics={'train_runtime': 57.3566, 'train_samples_per_second': 0.523, 'train_steps_per_second': 0.262, 'total_flos': 244974551040.0, 'train_loss': 0.8404865105946858, 'epoch': 5.0})

In [16]:
# Classify one unseen sentence with the fine-tuned model.
test_text = 'The market is showing strong growth.'
# Move the encoded inputs to the same device as the model.
test_input = tokenizer(test_text, return_tensors='pt').to(model_ft.device)

# The model has just been trained, so put it in eval mode before predicting:
# this disables dropout and makes the prediction deterministic.
model_ft.eval()
with torch.no_grad():
    logits = model_ft(**test_input).logits
    # Index of the highest-scoring class for the single input sentence.
    predicted_class = torch.argmax(logits, dim=1).item()

print(f'input: {test_text}')
# Label 1 is positive and 0 is negative, as in the fine-tuning data.
print(f"sentiment: {'positive' if predicted_class == 1 else 'negative'}")

input: The market is showing strong growth.
sentiment: positive
