<a href="https://colab.research.google.com/github/radve88/Learning-AI/blob/main/code_finetuning%20gpt4%20mini%20finance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

# Load the FinBERT tokenizer, pick the compute device (GPU when available,
# otherwise CPU), then load the classification model onto that device.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Module.to() moves the parameters in place and returns the same module,
# so the load and the device transfer can be chained.
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device)

In [None]:
def get_sentiment(text):
    """Return the sentiment label the loaded classifier predicts for ``text``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the index of the
    highest logit is mapped to a label.

    Args:
        text: The text to classify.

    Returns:
        One of ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    # Index -> label mapping is hard-coded here.
    # NOTE(review): assumes the checkpoint's class indices follow this order;
    # confirm against model.config.id2label.
    labels = ("Positive", "Negative", "Neutral")

    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_index = torch.argmax(logits, dim=1).item()
    return labels[predicted_index]

In [None]:
# NOTE: this cell is pseudocode outlining the preprocessing steps; it is not runnable Python.
# Load the compressed dataset from file
open compressed_file as file:
    # Read the contents of the file into memory
    data = read_file(file)

# Extract relevant sections of each document
for each document in data:
    extract document_id
    extract date
    extract main_text_content

# Define a function to clean and segment text content
function clean_and_segment_text(text):
    # Remove unwanted characters and whitespace
    cleaned_text = remove_special_characters(text)
    cleaned_text = standardize_whitespace(cleaned_text)

    # Split the cleaned text into sentences or text segments
    sentences = split_into_sentences(cleaned_text)

    return sentences

# Apply the cleaning and segmentation function to each document’s content
for each document in data:
    sentences = clean_and_segment_text(document['main_text_content'])
    save sentences to structured format

# Create a structured data storage for individual sentences
initialize empty list of structured_data

for each sentence in the sentences saved from every document:
    # Append sentence to structured data
    structured_data.append(sentence)

# Define a function to filter out unwanted sentences based on specific criteria
function filter_sentences(sentence):
    if sentence is too short:
        return False
    if sentence contains specific patterns (e.g., dates or excessive symbols):
        return False
    if sentence matches unwanted formatting characteristics:
        return False

    return True

# Apply the filter to structured data
filtered_data = [sentence for sentence in structured_data if filter_sentences(sentence)]

# Further filter the sentences based on minimum length or other criteria
final_data = [sentence for sentence in filtered_data if meets_minimum_length(sentence)]

# Save the final data structure for model training
# (the sampling cell below expects it as a pandas DataFrame named `df` with a 'sentence' column)
save final_data as structured_file

In [None]:
# Draw a reproducible random sample of up to 1,000,000 rows and renumber the
# index from 0. The sample size is capped at len(df) because DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling is done
# without replacement).
df_sampled = df.sample(n=min(1000000, len(df)), random_state=42).reset_index(drop=True)

In [None]:
import json

# Label every sampled sentence with get_sentiment() and wrap it as a
# three-message chat example (system prompt, user sentence, assistant label).
jsonl_data = []
for content in tqdm(df_sampled['sentence'], total=len(df_sampled)):
    jsonl_data.append({
        "messages": [
            {"role": "system", "content": "The assistant is a financial expert."},
            {"role": "user", "content": content},
            {"role": "assistant", "content": get_sentiment(content)}
        ]
    })

# Persist the examples as JSON Lines: one JSON object per line.
with open('finetuning_data.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(entry) + '\n' for entry in jsonl_data)

In [None]:
# Reload the generated examples, one JSON object per line.
with open('finetuning_data.jsonl', 'r') as jsonl_file:
    data = list(map(json.loads, jsonl_file))

# Lower-case the assistant reply (third message) of every example in place.
for record in data:
    assistant_turn = record["messages"][2]
    assistant_turn["content"] = assistant_turn["content"].lower()

# Write the normalised examples to a separate file.
with open('finetuning_data_lowercase.jsonl', 'w') as new_jsonl_file:
    new_jsonl_file.writelines(json.dumps(record) + '\n' for record in data)

In [None]:
import random
# Fix the seed so the in-place shuffle of `data` is reproducible.
random.seed(42)
random.shuffle(data)

# First 80% of the shuffled examples go to training, the remainder to validation.
split_ratio = 0.8
split_index = int(len(data) * split_ratio)
training_data, validation_data = data[:split_index], data[split_index:]

# Write each subset as JSON Lines, training file first.
for out_path, subset in (
    ('training_data.jsonl', training_data),
    ('validation_data.jsonl', validation_data),
):
    with open(out_path, 'w') as out_file:
        out_file.writelines(json.dumps(entry) + '\n' for entry in subset)



In [None]:
from sklearn.model_selection import train_test_split

# Flatten the chat examples into a two-column table: the user message text
# and the assistant label.
data_df = pd.DataFrame(
    [
        (entry["messages"][1]["content"], entry["messages"][2]["content"])
        for entry in data
    ],
    columns=['content', 'label'],
)

# Keep a label-stratified 10% of the data (test_size=0.9 discards the rest).
# Note that this rebinds the name df_sampled.
df_sampled, _ = train_test_split(
    data_df,
    test_size=0.9,
    stratify=data_df['label'],
    random_state=42,
)

# Split the reduced set 80/20 into training and validation, again stratified by label.
train_df, val_df = train_test_split(
    df_sampled,
    test_size=0.2,
    stratify=df_sampled['label'],
    random_state=42,
)

def df_to_jsonl(df, filename, system_prompt="The assistant is a financial expert."):
    """Write a labelled DataFrame to ``filename`` as chat-format JSON Lines.

    Each row becomes one JSON object on its own line holding a three-message
    conversation: the system prompt, the row's ``content`` as the user turn,
    and the row's ``label`` as the assistant turn.

    Args:
        df: DataFrame with ``content`` and ``label`` columns.
        filename: Path of the file to create or overwrite.
        system_prompt: Text of the system message attached to every example.
            The default is the prompt this function previously hard-coded.

    Raises:
        KeyError: If ``df`` lacks a ``content`` or ``label`` column.
    """
    # Resolve the columns before opening the file so a missing column does
    # not leave an empty output file behind.
    contents = df['content']
    labels = df['label']

    # Stream rows straight to disk: no intermediate list of every example,
    # and no per-row Series construction as with iterrows().
    with open(filename, 'w', encoding='utf-8') as jsonl_file:
        for content, label in zip(contents, labels):
            jsonl_entry = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": content},
                    {"role": "assistant", "content": label}
                ]
            }
            jsonl_file.write(json.dumps(jsonl_entry) + '\n')

# Export the reduced, stratified splits: training first, then validation.
for split_frame, split_path in (
    (train_df, 'reduced_training_data.jsonl'),
    (val_df, 'reduced_validation_data.jsonl'),
):
    df_to_jsonl(split_frame, split_path)

Step 8: Fine-Tune GPT-4o Mini Using OpenAI’s Fine-Tuning API
With your prepared JSONL files, follow OpenAI’s documentation to fine-tune GPT-4o mini on the prepared training and validation datasets.
Upload Data and Start Fine-Tuning: Upload the JSONL files to OpenAI’s platform and follow their API instructions to initiate the fine-tuning process.
OpenAI Finetuning Dashboard: Financial Sentiment Analysis