In [None]:
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
import evaluate


In [None]:
ds_name = "CryptoLM/BTC-USDT"

# Both subsets are carved out of the "train" split with percent slicing.
# The slice specs live in named constants so that the description and the
# code cannot drift apart.
TRAIN_SPLIT = "train[:2%]"    # first 2% of the rows
TEST_SPLIT = "train[2%:3%]"   # the following 1% of the rows

raw_train = load_dataset(ds_name, split=TRAIN_SPLIT)
raw_test = load_dataset(ds_name, split=TEST_SPLIT)

# Show the columns and row count of each subset.
print(raw_train)
print(raw_test)

# Number of consecutive rows encoded into each model input.
WINDOW_SIZE = 3


In [None]:
def create_sequences(examples, window_size=3):
    """Turn a batch of OHLC rows into (text window, next open) pairs.

    For every start position ``i`` with ``i + window_size < len(open)``, the
    rows ``i .. i + window_size - 1`` are rendered as one string and paired
    with the ``open`` value of row ``i + window_size``. Windows are built only
    from the rows contained in ``examples``.

    Args:
        examples: Mapping with equally long ``"open"``, ``"high"``, ``"low"``
            and ``"close"`` sequences.
        window_size: Number of consecutive rows rendered into each text.

    Returns:
        Dict with ``"input_text"`` (list of str) and ``"label"`` (list of
        float), one entry per window.
    """
    opens, highs, lows, closes = (
        examples[column] for column in ("open", "high", "low", "close")
    )

    def render_row(start, offset):
        # The newest row of a window is tagged t-0, the oldest t-(window_size-1).
        row = start + offset
        return (
            f"(t-{window_size - offset - 1}): "
            f"open={opens[row]}, high={highs[row]}, low={lows[row]}, close={closes[row]}"
        )

    input_texts = []
    targets = []
    for start in range(len(opens) - window_size):
        input_texts.append(
            " | ".join(render_row(start, offset) for offset in range(window_size))
        )
        # Target: the open of the row right after the window, as a float.
        targets.append(float(opens[start + window_size]))

    return {"input_text": input_texts, "label": targets}


In [None]:
# create_sequences builds each window from neighbouring rows of the batch it
# receives, so a window can never span two batches. With the default
# batch_size, every batch would silently lose WINDOW_SIZE examples at its
# boundary. batch_size=None passes the whole split as a single batch so that
# every valid window is produced.
train_slid = raw_train.map(
    create_sequences,
    batched=True,
    batch_size=None,
    fn_kwargs={"window_size": WINDOW_SIZE},
    remove_columns=raw_train.column_names,
)
test_slid = raw_test.map(
    create_sequences,
    batched=True,
    batch_size=None,
    fn_kwargs={"window_size": WINDOW_SIZE},
    remove_columns=raw_test.column_names,
)

# Sanity check: first encoded window and its target.
print(train_slid[0]["input_text"])
print(train_slid[0]["label"])


In [None]:
# Checkpoint used for both the tokenizer and, later, the regression model.
model_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize_fn pads every example to a fixed length, which needs a pad token.
# If the loaded tokenizer does not define one, reuse its EOS token as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    """Tokenize the ``input_text`` entry of a batch to a fixed length.

    Texts are truncated and padded to ``max_length=128``, so every encoded
    example has the same length.

    Args:
        examples: Batch mapping that contains an ``"input_text"`` entry.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    texts = examples["input_text"]
    encoded = tokenizer(
        texts,
        max_length=128,  # fixed length; revisit if the window size grows
        padding="max_length",
        truncation=True,
    )
    return encoded

# Run the tokenizer over both splits in batches (train first, then test).
train_tokenized, test_tokenized = (
    split.map(tokenize_fn, batched=True) for split in (train_slid, test_slid)
)


In [None]:
def rename_label_col(example):
    """Return the example's target under the ``labels`` key.

    Args:
        example: Mapping with a ``"label"`` entry.

    Returns:
        A one-item dict ``{"labels": example["label"]}``.
    """
    target = example["label"]
    return {"labels": target}

# Copy each target into a "labels" column, then drop the original "label"
# column and the raw text from both splits.
train_tokenized, test_tokenized = (
    split.map(rename_label_col, remove_columns=["label", "input_text"])
    for split in (train_tokenized, test_tokenized)
)

# Have both datasets return torch tensors when indexed.
train_tokenized.set_format(type="torch")
test_tokenized.set_format(type="torch")


In [None]:
# A single output unit lets the sequence-classification head serve as a
# regression head that predicts one number per input.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)


In [None]:
# Metric implementations used by compute_metrics (MSE first, then MAE).
mse_metric, mae_metric = (
    evaluate.load(metric_id) for metric_id in ("mse", "mae")
)


def compute_metrics(eval_pred):
    """Compute MSE, RMSE and MAE for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Both are flattened to 1-D
            before scoring, so a trailing singleton dimension on the
            predictions is harmless.

    Returns:
        Dict with the keys ``"mse"``, ``"rmse"`` and ``"mae"``.
    """
    raw_predictions, raw_labels = eval_pred
    flat_predictions = raw_predictions.flatten()
    flat_labels = raw_labels.flatten()

    # Each compute() call returns a dict; the value is read under the
    # metric's short name ("mse" / "mae").
    mse = mse_metric.compute(
        predictions=flat_predictions, references=flat_labels
    )["mse"]
    mae = mae_metric.compute(
        predictions=flat_predictions, references=flat_labels
    )["mae"]

    # RMSE is derived from the MSE rather than computed separately.
    return {"mse": mse, "rmse": mse ** 0.5, "mae": mae}


In [None]:
training_args = TrainingArguments(
    # Output location for checkpoints.
    output_dir="./finetuned-btc-regression",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=15,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # After training, reload the checkpoint with the lowest evaluation "mse"
    # reported by compute_metrics (lower is better).
    load_best_model_at_end=True,
    metric_for_best_model="mse",  # "rmse" would rank checkpoints identically
    greater_is_better=False,
    # Logging.
    logging_steps=50,
    report_to="none",  # or "wandb"/"tensorboard"
)


In [None]:
# NOTE(review): eval_dataset is the test slice, and training_args asks for the
# best checkpoint to be reloaded at the end, so the same data drives model
# selection and final reporting. Consider a separate validation slice.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)

trainer.train()


In [None]:
# Predict the target for the last window of the test split.
sample = test_tokenized[-1]

# Switch to inference mode so that dropout is disabled and the prediction is
# deterministic.
model.eval()

# The model expects a leading batch dimension, so each field gets one added.
# torch.as_tensor reuses values that are already tensors (the dataset was put
# into "torch" format) instead of copy-constructing them with torch.tensor,
# which emits a warning.
inputs = {
    k: torch.as_tensor(sample[k]).unsqueeze(0).to(model.device)
    for k in ["input_ids", "attention_mask"]
}

with torch.no_grad():
    outputs = model(**inputs)
prediction = outputs.logits.item()  # logits has shape [1, 1]
print("Predicted next-day open:", prediction)

# The matching ground truth is the label of the same (last) window.
actual = test_slid[-1]["label"]
print("Actual next-day open:", actual)




