In [28]:
import datasets
from datasets import load_dataset
import pickle

# Sentiment Datasets - Test

### 1. FPB (Financial PhraseBank)

In [3]:
# FPB label ids -> sentiment names.
# NOTE: the name `dic` is rebound with a different ordering in the TFNS
# section of this notebook, so cells must be run top to bottom.
dic = {0: "negative", 1: "neutral", 2: "positive"}

In [11]:
# Build the Financial PhraseBank (FPB) test split in instruction format.
#
# The id -> name map is defined in this cell instead of reading the
# notebook-level `dic`: that name is rebound with a different ordering in the
# TFNS section, so re-running this cell after that one would silently swap the
# "neutral" and "positive" labels.
fpb_label_names = {0: "negative", 1: "neutral", 2: "positive"}

fpb_df = load_dataset("financial_phrasebank", "sentences_50agree")["train"].to_pandas()
# Positional rename -- assumes the frame's two columns are (sentence text, label id), in that order.
fpb_df.columns = ["input", "output"]
# Plain indexing keeps the KeyError on an unexpected label id instead of producing a missing value.
fpb_df["output"] = fpb_df["output"].apply(lambda label_id: fpb_label_names[label_id])
fpb_df["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."

# Seeded split so the held-out rows are reproducible; no test_size is passed,
# so the library's default fraction applies. Only the 'test' part is kept.
fpb_datasets = datasets.Dataset.from_pandas(fpb_df).train_test_split(seed=42)['test']

In [12]:
# Display the FPB test split to check its features and row count.
fpb_datasets

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 1212
})

### 2. FiQA SA (FiQA 2018 Sentiment Analysis)

In [14]:
def make_label(x):
    """Bucket a numeric sentiment score into a three-way label.

    Returns "negative" for scores below -0.1, "neutral" for scores in
    [-0.1, 0.1), and "positive" otherwise. A NaN score fails both
    comparisons and therefore also ends up as "positive".
    """
    if x < -0.1:
        return "negative"
    # Reaching here means x is not below -0.1, so only the upper bound is left to test.
    if x < 0.1:
        return "neutral"
    return "positive"

def add_instructions(x):
    """Choose the prompt text for a row from its format value.

    The value "post" selects the tweet prompt; any other value selects the
    news prompt.
    """
    tweet_prompt = "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."
    news_prompt = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    return tweet_prompt if x == "post" else news_prompt

In [17]:
# Build the FiQA SA test split: pool every published split, derive a label
# from the numeric score, pick a prompt per row, then hold out a seeded subset.
# Offline alternative left by the author: load_from_disk('../data/fiqa-2018/')
# (load_from_disk is not imported in the visible import cell).
fiqa_splits = load_dataset('pauri32/fiqa-2018')
fiqa_df = datasets.concatenate_datasets(
    [fiqa_splits[split_name] for split_name in ("train", "validation", "test")]
).to_pandas()

fiqa_df["output"] = fiqa_df["sentiment_score"].apply(make_label)
fiqa_df["instruction"] = fiqa_df["format"].apply(add_instructions)
# Keep only the three instruction-format columns, with the text column renamed to "input".
fiqa_df = fiqa_df[["sentence", "output", "instruction"]].rename(columns={"sentence": "input"})

dataset = datasets.Dataset.from_pandas(fiqa_df)
# 0.226 is the held-out fraction; the seed makes the selection reproducible.
fiqa_sa_dataset = dataset.train_test_split(test_size=0.226, seed=42)['test']
fiqa_sa_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 275
})

### 3. TFNS (Twitter Financial News Sentiment)

In [18]:
# TFNS label ids -> sentiment names.
# NOTE: this rebinds `dic`; the ordering differs from the FPB map defined
# earlier in the notebook (here 1 is "positive" and 2 is "neutral").
dic = {0: "negative", 1: "positive", 2: "neutral"}

In [25]:
# Build the Twitter Financial News Sentiment (TFNS) test set from the
# dataset's 'validation' split, in instruction format.
#
# The id -> name map is defined in this cell instead of reading the
# notebook-level `dic`: that name is first bound to the FPB ordering
# (1 = neutral, 2 = positive), so running this cell while the FPB map is the
# live binding would silently swap the "neutral" and "positive" labels.
tfns_label_names = {0: "negative", 1: "positive", 2: "neutral"}

tfns_df = load_dataset('zeroshot/twitter-financial-news-sentiment')['validation'].to_pandas()
# Plain indexing keeps the KeyError on an unexpected label id instead of producing a missing value.
tfns_df['label'] = tfns_df['label'].apply(lambda label_id: tfns_label_names[label_id])
tfns_df['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
# Positional rename -- assumes the columns are now (text, label, instruction), in that order.
tfns_df.columns = ['input', 'output', 'instruction']

social_media_dataset = datasets.Dataset.from_pandas(tfns_df)
social_media_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 2388
})

### 4. NWGI (News With GPT Instructions)

In [26]:
# Build the NWGI test set from the dataset's 'test' split: the news text
# becomes "input", the existing label becomes "output", and every row gets the
# same seven-way prompt.
nwgi_raw = load_dataset('oliverwang15/news_with_gpt_instructions')['test'].to_pandas()
nwgi_df = nwgi_raw[['news', 'label']].rename(columns={'news': 'input', 'label': 'output'})
nwgi_df['instruction'] = 'What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}.'

finance_dataset = datasets.Dataset.from_pandas(nwgi_df)
finance_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 4047
})

In [27]:
# Collect the four prepared test sets under short benchmark keys.
all_sentiment_test_data = {
    'fpb': fpb_datasets,
    'fiqa-sa': fiqa_sa_dataset,
    'tfns': social_media_dataset,
    'nwgi': finance_dataset,
}

In [29]:
# Persist the dict of test sets as one pickle file, using the highest
# pickle protocol this interpreter supports.
output_path = '../data/all_sentiment_test_data.pickle'
with open(output_path, 'wb') as pickle_file:
    pickle.dump(all_sentiment_test_data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)