In [49]:
# Install the below packages
# !pip install transformers datasets peft

In [50]:
# Load the Dataset from hugging face

from datasets import load_dataset, DatasetDict
dataset = load_dataset("PaulAdversarial/all_news_finance_sm_1h2023")
df = dataset['train'].to_pandas()

In [4]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords')
nltk.download('punkt')


# Preprocessing function to clean the text
def preprocess_text(text):
    """
    Preprocesses a given text by removing numbers, punctuation, converting to lowercase,
    and removing stopwords.
    Args:
        text (str): The input text to preprocess.
    Returns:
        str: The preprocessed text with numbers, punctuation, and stopwords removed,
             and all words converted to lowercase.
    Steps:
        1. Removes all digits from the text using regular expressions.
        2. Removes punctuation, keeping only alphanumeric characters and spaces.
        3. Converts the text to lowercase.
        4. Tokenizes the text into individual words.
        5. Removes stopwords from the tokenized words.
        6. Joins the filtered tokens back into a single string.
    """
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)


# Apply preprocessing to the text column
df['cleaned_text'] = df['title'].apply(preprocess_text)

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=3)
# Fit and transform the cleaned text data
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])
# Get the feature names (keywords)
keywords = tfidf.get_feature_names_out()
# Extract top keywords for each row
df['key_topics'] = [', '.join([keywords[i] for i in tfidf_matrix[row].indices]) for row in range(tfidf_matrix.shape[0])]
# shrinked df with necessary columns
df_new = df[['title', 'key_topics', 'description']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
df_without_null = df_new[df["key_topics"] != ""].reset_index(drop = True)

In [6]:
# creating the df back to format which training script accepts

from datasets import Dataset, DatasetDict
import pandas as pd

dataset_changed = Dataset.from_pandas(df_without_null)
dataset_changed_dict = DatasetDict({
    'train': dataset_changed
})

# Split dataset into train and test (90% train, 10% test)
dataset_split = dataset_changed_dict['train'].train_test_split(test_size=0.1)
dataset_dict = DatasetDict({
    'train': dataset_split['train'],
    'test': dataset_split['test']
})

In [7]:
import torch
# Step 1: Check if GPU is available and set device to CUDA or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [32]:
# Loading the Flan-T5 large model, which will serve as the base for further fine-tuning.

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM

# Load the Flan-T5 base model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
original_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large").to(device)

In [33]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"



In [34]:
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 783150080
all model parameters: 783150080
percentage of trainable model parameters: 100.00%


In [35]:
def preprocess_function(examples):
    """
    Preprocesses a batch of examples for input into a model, formatting the input and target
    text, and tokenizing them.

    Args:
        examples (dict): A dictionary containing the input and target text. The keys are:
            - 'key_topics': List of input prompts to generate context from.
            - 'description': List of target descriptions (labels) to train the model.

    Returns:
        dict: A dictionary containing tokenized inputs and labels for the model. The keys are:
            - 'input_ids' (list of int): Tokenized inputs.
            - 'labels' (list of int): Tokenized target descriptions.
    """
    inputs = examples['key_topics']
    targets = examples['description']
    inputs = [f"Generate financial context: {input_text}" for input_text in inputs]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [36]:
tokenized_datasaet = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1764 [00:00<?, ? examples/s]



Map:   0%|          | 0/196 [00:00<?, ? examples/s]

In [51]:
# Setting up the hyper-parameter combination for LoRA

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

peft_model = get_peft_model(original_model,
                            lora_config)

In [52]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 9437184
all model parameters: 792587264
percentage of trainable model parameters: 1.19%


In [39]:
## Training args

import time
from transformers import TrainingArguments, Trainer
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=100,
    logging_steps=1,
    max_steps=10
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasaet["train"],
)

max_steps is given, it will override any value given in num_train_epochs


In [40]:
peft_trainer.train()

Step,Training Loss
1,41.299
2,42.4925
3,37.5426
4,32.0858
5,37.1534
6,18.9648
7,28.2825
8,29.5216
9,19.8718
10,25.7145


TrainOutput(global_step=10, training_loss=31.292854690551756, metrics={'train_runtime': 6.1561, 'train_samples_per_second': 1.624, 'train_steps_per_second': 1.624, 'total_flos': 5834397450240.0, 'train_loss': 31.292854690551756, 'epoch': 0.005668934240362812})

### Save your model

In [41]:
peft_model_path="./peft-results-local_1"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-results-local_1/tokenizer_config.json',
 './peft-results-local_1/special_tokens_map.json',
 './peft-results-local_1/spiece.model',
 './peft-results-local_1/added_tokens.json',
 './peft-results-local_1/tokenizer.json')

### Loading base model and taking the fine tuned model using peft (1.19 percent were only trainable params)

In [42]:
from peft import PeftModel, PeftConfig

# peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load the Flan-T5 base model and tokenizer
# model_name = "google/flan-t5-large"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# peft_model_base = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large").to(device)

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       '/content/peft-results-local_1',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [43]:
def generate_text(model, input_text, tokenizer, max_length=512):
    """
    Generates text based on a given input prompt using a pre-trained language model and tokenizer.

    Args:
        model (transformers.PreTrainedModel): The pre-trained language model to generate text with.
        input_text (str): The input prompt text to generate context from.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to preprocess the input text for the model.
        max_length : The maximum length of the generated text. Defaults to 512.
    Returns:
        str: The generated text, decoded from the model's output without special tokens.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    inputs = tokenizer(f"Generate financial context: {input_text}", return_tensors="pt").to(device)
    output_sequences = model.generate(input_ids=inputs['input_ids'], max_length=max_length)
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return generated_text


## Sample Output tesing

In [44]:
input_text = "BTC price down"
# input_text = "bitcoin"

# Generate text from the base model
base_output = generate_text(original_model, input_text, tokenizer)
print("Base Model Output:\n", base_output)

# Generate text from the fine-tuned model
fine_tuned_output = generate_text(peft_model, input_text, tokenizer)
print("\nFine-Tuned Model Output:\n", fine_tuned_output)

Base Model Output:
 BTC price down

Fine-Tuned Model Output:
 Bitcoin price down on Monday, after a strong rally on the exchange.


In [45]:
input_text = "oil"

# Generate text from the base model
base_output = generate_text(original_model, input_text, tokenizer)
print("Base Model Output:\n", base_output)

# Generate text from the fine-tuned model
fine_tuned_output = generate_text(peft_model, input_text, tokenizer)
print("\nFine-Tuned Model Output:\n", fine_tuned_output)

Base Model Output:
 During the first half of the year, the oil and gas sector saw a net increase in production, averaging a total of 1.2 million barrels per day.

Fine-Tuned Model Output:
 Oil prices have been falling for the past several years, but the recent drop has been attributed to the continuing weakness of the global economy.


In [46]:
input_text = "bitcoin"

# Generate text from the base model
base_output = generate_text(original_model, input_text, tokenizer)
print("Base Model Output:\n", base_output)

# Generate text from the fine-tuned model
fine_tuned_output = generate_text(peft_model, input_text, tokenizer)
print("\nFine-Tuned Model Output:\n", fine_tuned_output)

Base Model Output:
 Bitcoin is a crypto-currency that was created in 2014 by the Bitcoin network.

Fine-Tuned Model Output:
 Bitcoin is a crypto currency that was created in 2009 by a group of hackers who hacked into the Bitcoin network.


In [47]:
input_text = "price"

# Generate text from the base model
base_output = generate_text(original_model, input_text, tokenizer)
print("Base Model Output:\n", base_output)

# Generate text from the fine-tuned model
fine_tuned_output = generate_text(peft_model, input_text, tokenizer)
print("\nFine-Tuned Model Output:\n", fine_tuned_output)

Base Model Output:
 price of oil is the price of the oil.

Fine-Tuned Model Output:
 During the first half of the year, the company reported a net profit of $2.2 billion, compared to a net loss of $2.2 billion in the first half of the year.


In [48]:
input_text = "inflation"

# Generate text from the base model
base_output = generate_text(original_model, input_text, tokenizer)
print("Base Model Output:\n", base_output)

# Generate text from the fine-tuned model
fine_tuned_output = generate_text(peft_model, input_text, tokenizer)
print("\nFine-Tuned Model Output:\n", fine_tuned_output)

Base Model Output:
 The ECB has lowered its monetary policy to a 2% to 2% rate.

Fine-Tuned Model Output:
 Inflation is the increase in the price of goods and services.
