In [1]:
!pip install -U pyarrow --quiet
!pip install datasets transformers torch numpy seqeval --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.model_selection import train_test_split
from seqeval.metrics import accuracy_score

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define compute metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Generate synthetic financial data
def generate_financial_data(num_samples=1000):
    np.random.seed(42)
    revenues = np.random.randint(100000, 10000000, num_samples)
    expenses = np.random.randint(50000, 9000000, num_samples)
    profits = revenues - expenses

    data = []
    labels = []

    for i in range(num_samples):
        financial_text = f"Revenue: ${revenues[i]}, Expenses: ${expenses[i]}, Profit: ${profits[i]}"

        if profits[i] > 1000000:
            interpretation = "The company is performing exceptionally well with high profits."
            label = 2
        elif profits[i] > 0:
            interpretation = "The company is profitable but there's room for improvement."
            label = 1
        else:
            interpretation = "The company is operating at a loss and needs immediate attention."
            label = 0

        data.append(financial_text + " " + interpretation)
        labels.append(label)

    return data, labels

# Create a custom dataset
class FinancialDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Generate synthetic data
texts, labels = generate_financial_data()

# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Create datasets
train_dataset = FinancialDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = FinancialDataset(val_texts, val_labels, tokenizer, max_length=128)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=100,
    save_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

eval_results = trainer.evaluate()
print(f"Before Training Evaluation results: {eval_results}")

# Train the model
trainer.train()

eval_results = trainer.evaluate()
print(f"Post Training Evaluation results: {eval_results}")

# Function to interpret new financial data
def interpret_financial_data(financial_text):
    encoding = tokenizer.encode_plus(
        financial_text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, dim=1)

    interpretations = [
        "The company is operating at a loss and needs immediate attention.",
        "The company is profitable but there's room for improvement.",
        "The company is performing exceptionally well with high profits."
    ]

    return interpretations[predicted.item()]

# Example usage
new_financial_data = "Revenue: $8500000, Expenses: $7000000, Profit: $1500000"
interpretation = interpret_financial_data(new_financial_data)
print(f"Financial Data: {new_financial_data}")
print(f"Interpretation: {interpretation}")

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Before Training Evaluation results: {'eval_loss': 1.070624589920044, 'eval_model_preparation_time': 0.004, 'eval_accuracy': 0.445, 'eval_runtime': 95.5818, 'eval_samples_per_second': 2.092, 'eval_steps_per_second': 0.042}


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.7453,0.594538,0.004,0.92
2,0.0948,0.052117,0.004,1.0
3,0.0089,0.00593,0.004,1.0


Post Training Evaluation results: {'eval_loss': 0.0059300134889781475, 'eval_model_preparation_time': 0.004, 'eval_accuracy': 1.0, 'eval_runtime': 81.2901, 'eval_samples_per_second': 2.46, 'eval_steps_per_second': 0.049, 'epoch': 3.0}
Financial Data: Revenue: $8500000, Expenses: $7000000, Profit: $1500000
Interpretation: The company is performing exceptionally well with high profits.
