In [1]:
import re
from datasets import load_dataset
from pathlib import Path

## Random seed shared by every stochastic step in this notebook.
seed = 42

## Default Hugging Face hub cache: <home>/.cache/huggingface/hub.
## Built from the current user's home directory instead of a hard-coded user
## folder; change it if your cache lives elsewhere (e.g. HF_HOME is set).
hub_basepath = Path.home() / ".cache" / "huggingface" / "hub"

## Sentiment Analysis and Question/Answering
Sentiment Analysis from [FPB](https://huggingface.co/datasets/AdaptLLM/FPB)<br>
Question/Answering from [Headline](https://huggingface.co/datasets/AdaptLLM/Headline)<br>

In [None]:
## load_dataset raises an error after the download because these datasets are
## tab-separated. The call is made only to trigger the download, so the error
## is reported and ignored.
for repo_id in ("AdaptLLM/FPB", "AdaptLLM/Headline"):
    try:
        load_dataset(repo_id)
    except Exception as err:  # not a bare except: Ctrl-C / kernel interrupt must still work
        print(f"{repo_id}: ignoring load error after download ({type(err).__name__})")

## Topic Classification
From Hugging Face: [Sujet-Finance-Instruct-177k](https://huggingface.co/datasets/sujet-ai/Sujet-Finance-Instruct-177k)

In [None]:
dataset_id = "sujet-ai/Sujet-Finance-Instruct-177k"

## Topic names; a topic's position in this list is its integer class id.
topics = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]
## Reverse lookup: topic name -> integer class id.
topics_label2id = dict(zip(topics, range(len(topics))))

def topic_class_filter(example):
    """Return True only for rows whose task_type is "topic_classification"."""
    task = example["task_type"]
    return task == "topic_classification"

def topic_mapping(example):
    """Replace the textual topic in example["answer"] with its integer id.

    The id is looked up in topics_label2id; a topic that is not in that
    mapping raises KeyError. The row is updated in place and returned.
    """
    topic_name = example["answer"]
    example["answer"] = topics_label2id[topic_name]
    return example

def text_cleaning(example):
    """Strip t.co short links from example["user_prompt"].

    Only the link itself (plus the spaces directly in front of it) is removed;
    any text that follows a link is kept. Leading and trailing spaces of the
    result are trimmed. The row is updated in place and returned.
    """
    # "\." matches a literal dot (a bare "." would match any character), and
    # "\S+" stops at the first whitespace so the rest of the text survives.
    example["user_prompt"] = re.sub(r" *https://t\.co/\S+", "", example["user_prompt"]).strip(" ")
    return example

## Columns dropped from the final dataset.
rmv_cols = [
    "Unnamed: 0",
    "inputs",
    "system_prompt",
    "task_type",
    "dataset",
    "index_level",
    "conversation_id",
]
## New names for the columns that are kept.
rnm_cols = {"answer": "label", "user_prompt": "text"}

## Keep the topic-classification rows, encode the labels, clean the text,
## then drop and rename columns.
topic_rows = load_dataset(dataset_id, split="train").filter(topic_class_filter)
topic_rows = topic_rows.map(topic_mapping).map(text_cleaning)
topic_rows = topic_rows.remove_columns(rmv_cols).rename_columns(rnm_cols)

## Hold out 5% of the rows as the test split, seeded with `seed`.
dataset = topic_rows.train_test_split(test_size=0.05, seed=seed)
dataset

In [None]:
## Write both splits as header-less CSV files into a dedicated folder under the hub cache.
out_dir = hub_basepath / "datasets--Sujet--TopicClassification"
## Create the folder up front so the export also works on a machine where it does not exist yet.
out_dir.mkdir(parents=True, exist_ok=True)
for split_name in ("train", "test"):
    dataset[split_name].to_csv(out_dir / f"{split_name}.csv", header=False)