In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)

# Checkpoint identifier of the base model to fine-tune. The same name is passed
# to `from_pretrained` for both the tokenizer and the model later in the notebook.
BASE_MODEL = "gpt2"



In [None]:
# Dataset identifier handed to `load_dataset`.
ds_name = "StephanAkkerman/crypto-stock-tweets"

# Load only the first 30% of the train split (reduced dataset size) and drop
# the "url" column in the same expression.
dataset = load_dataset(ds_name, split="train[:30%]").remove_columns("url")


In [None]:
from datasets import Dataset
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer

def clean_tweet(tweet):
    """Normalize a raw tweet while keeping ticker-style tokens such as ``$BTC``.

    Steps, in order:
      1. Strip the ``#`` from hashtags but keep the tagged word.
      2. Remove ``http://`` / ``https://`` URLs.
      3. Drop every character that is not a word character, whitespace, or one
         of ``$ % @ . , ! ? & / -``.
      4. Collapse runs of whitespace to a single space and trim both ends.

    Args:
        tweet: The raw tweet text, or ``None`` for a row with missing text.

    Returns:
        The cleaned tweet; an empty string when ``tweet`` is ``None``.
    """
    # A missing value would otherwise make `re.sub` raise TypeError and abort
    # the whole batched `map` call.
    if tweet is None:
        return ""

    # "#Bitcoin" -> "Bitcoin"; "$BTC" is untouched because "$" is not "#".
    tweet = re.sub(r'#(\w+)', r'\1', tweet)

    # Remove URLs
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Whitelist filter: anything outside the allowed set (emoji, quotes,
    # brackets, a bare "#", ...) is deleted.
    tweet = re.sub(r'[^\w\s$%@.,!?&/-]', '', tweet)

    # Normalize whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    return tweet

def format_for_generation(examples):
    """Batched ``map`` callback that rewrites the ``"text"`` column.

    Each entry of ``examples["text"]`` is cleaned with ``clean_tweet``,
    prefixed with ``"Cryptocurrency Tweet: "`` and terminated with a newline.

    Returns:
        A dict with a single ``"text"`` key holding the formatted strings.
    """
    formatted = []
    for txt in examples["text"]:
        cleaned = clean_tweet(txt)
        formatted.append(f"Cryptocurrency Tweet: {cleaned}\n")
    return {"text": formatted}
    
# Apply the prompt formatting batch-wise. NOTE: this rebinds `dataset`, so
# re-running the cell formats already-formatted text a second time; reload the
# dataset first if you need to re-run.
dataset = dataset.map(function=format_for_generation, batched=True)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# GPT-2 models don't have a pad token, use eos_token instead
tokenizer.pad_token = tokenizer.eos_token

# Load plain full-precision weights. The previous 8-bit + fp16 load cannot be
# used for full fine-tuning: Trainer refuses to train a purely quantized model
# without adapters, and mixed precision (`fp16=True` in TrainingArguments)
# needs fp32 master weights -- fp16 weights fail with
# "Attempting to unscale FP16 gradients". Trainer moves the model to the
# training device itself, so no device_map is needed here.
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# Keep the model config in sync with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
def tokenize_function(examples, max_length=96):
    """Tokenize a batch of formatted tweets for causal-LM training.

    Sequences are truncated to ``max_length`` tokens but deliberately NOT
    padded here: the notebook's ``DataCollatorForLanguageModeling`` pads each
    batch dynamically, so padding every sample to ``max_length`` up front only
    wastes compute on pad tokens. ``return_tensors`` is omitted as well,
    because ``Dataset.map`` stores plain lists and unpadded sequences cannot be
    stacked into a tensor.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with a ``"text"``
            list of strings.
        max_length: Maximum number of tokens kept per sample (default 96,
            the reduced sequence length used in this notebook).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )
    
    
# Tokenize the whole dataset in batches of 1000 rows.
tokenized_dataset = dataset.map(function=tokenize_function, batch_size=1000, batched=True)


In [None]:
# train_size = int(0.8 * len(tokenized_dataset))
# train_dataset = tokenized_dataset.select(range(train_size))
# eval_dataset  = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

# print("Train size:", len(train_dataset))
# print("Eval size: ", len(eval_dataset))

In [None]:
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer, 
#     mlm=False  # No masked language modeling for causal models
# )

In [None]:
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir="crypto_gpt2",
    save_total_limit=1,
    logging_steps=50,
    report_to="none",
    # --- schedule ---
    # A positive max_steps overrides num_train_epochs, so training runs for
    # exactly 2000 optimizer steps regardless of the epoch setting.
    max_steps=2000,
    num_train_epochs=1,
    # --- batch size: 16 per device x 2 accumulation steps ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # --- optimization ---
    learning_rate=3e-4,
    optim="adafactor",
    # --- precision / memory ---
    fp16=True,
    gradient_checkpointing=False,
)


In [None]:
# Causal-LM collator: mlm=False disables masked-language-model masking, and
# batches are padded to a multiple of 8.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, pad_to_multiple_of=8, mlm=False)

# No eval dataset is passed; the trainer only sees the tokenized training data.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
trainer.train()

# Save the final model.
# `Trainer.save_model` takes no `safe_serialization` argument (passing it raises
# TypeError after training has already finished). The on-disk format is
# controlled by `TrainingArguments(save_safetensors=...)` instead.
trainer.save_model("tinylama-1.1b-crypto-fullft-checkpoint")
print("Fine-tuning complete. Model saved.")

In [None]:
# Write the trained model to the checkpoint directory that the inference cell
# below loads from.
trainer.save_model(output_dir="tinylama-1.1b-crypto-fullft-checkpoint")


In [None]:
# Reload the fine-tuned weights and put the model on the same device as the
# inputs. `from_pretrained` leaves the model on CPU, so sending only the inputs
# to "cuda" caused a device-mismatch error.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("tinylama-1.1b-crypto-fullft-checkpoint")
model.to(device)
model.eval()  # inference mode: disables dropout

# NOTE(review): training samples were formatted as "Cryptocurrency Tweet: ...";
# a prompt in that format will likely match the fine-tuned distribution better.
prompt = "tell me something about bitcoin"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        # temperature / top_p only take effect when sampling is enabled;
        # without do_sample=True generation is greedy and they are ignored.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # GPT-2 has no pad token; pass EOS explicitly for generation.
        pad_token_id=tokenizer.eos_token_id,
    )

print("Generated text:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))