# Combine Datasets

In [94]:
from huggingface_hub import hf_hub_download
import fasttext
from datasets import Dataset, DatasetDict, load_from_disk
import pandas as pd

import os 
import sys
sys.path.append('..')

from src.data_processing.utils import is_language

In [3]:
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification", filename="model.bin"
)
model = fasttext.load_model(model_path)

all_data = {}

def get_ds_and_check_deletion(name):
    try:
        old = load_from_disk(f'../data/FinGPT/{name}')
    except:
        old = load_from_disk(f'../data/preprocessed/{name}')
    ds = load_from_disk(f'../data/reformatted/{name}')

    for key in ds.keys():
        perc_deleted = 1 - ds[key].num_rows/old[key].num_rows
        print(key, 'deleted:', perc_deleted)

    return ds

def return_dutch_training_samples(df, cols, model):
    is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
    return df[is_dutch]

def filter_non_dutch(ds, cols):

    new_ds = DatasetDict()
    
    for key in ds.keys():
        df = ds[key].to_pandas()
        df = return_dutch_training_samples(df, cols, model)
        new_ds[key] = Dataset.from_pandas(df, preserve_index=False)
    
    return new_ds

def translate_sentence(sentence: str, translations: dict) -> str:
    for eng, dutch in translations.items():
        if eng in sentence:
            sentence = sentence.replace(eng, dutch)
    return sentence



## sentiment

In [49]:
# notes
# translations are being done again because a lot of the instructions were not correct so come back here if done
# OK

In [56]:
name = 'fingpt-sentiment'

sentiment = get_ds_and_check_deletion(name)

train deleted: 0.011983535664044198
test deleted: 0.009341075485988393


In [57]:
translations = {
    'neutral' : 'neutraal',             
    'mild positief' : 'matig positief',      
    'mild negatief' : 'matig negatief', 
    'positive' : 'positief',   
    'mildly positive' : 'matig positief', 
    'negative' : 'negatief',
    'strong positive' : 'sterk positief',
    'mildly negative' : 'matig negatief',
    'mildly negatief' : 'matig negatief',
    'strong positief' : 'sterk positief',
    'mildly positief' : 'matig positief',
}

train = sentiment['train'].to_pandas()
test = sentiment['test'].to_pandas()

train['output'] = train.output.apply(lambda x : translate_sentence(x,translations))
test['output'] = test.output.apply(lambda x : translate_sentence(x,translations))

sentiment = DatasetDict({
        'train': Dataset.from_pandas(train, preserve_index=False),
        'test': Dataset.from_pandas(test, preserve_index=False)
    })

In [59]:
_sentiment = filter_non_dutch(sentiment, ['input','instruction'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [62]:
sentiment.save_to_disk(f'../data/final_unfiltered/{name}')


Saving the dataset (1/1 shards): 100%|██████████| 75852/75852 [00:00<00:00, 2276539.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7848/7848 [00:00<00:00, 1390189.11 examples/s]


In [63]:
_sentiment.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 52781/52781 [00:00<00:00, 2240547.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5788/5788 [00:00<00:00, 1246489.61 examples/s]


## finred

In [38]:
# notes
# the outputs need to be translated. general translations are not good
# OK

In [26]:
name = 'fingpt-finred'

finred = get_ds_and_check_deletion(name)

train deleted: 0.009325785615792115
test deleted: 0.0013693270735524177


In [33]:
_finred = filter_non_dutch(finred, ['instruction', 'input'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [37]:
finred.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 27301/27301 [00:00<00:00, 1161027.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5105/5105 [00:00<00:00, 862583.97 examples/s]


In [38]:
_finred.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 9641/9641 [00:00<00:00, 957571.45 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 1790/1790 [00:00<00:00, 523410.78 examples/s]


## ner

In [None]:
# notes
# no filtering is done otherwise half of the samples is deleted
# OK 

In [116]:
name = 'fingpt-ner'

ner = get_ds_and_check_deletion(name)

train deleted: 0.1154598825831703
test deleted: 0.22448979591836737


In [117]:
_ner = filter_non_dutch(ner, ['output'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [118]:
ner.save_to_disk(f'../data/final_unfiltered/{name}')


Saving the dataset (1/1 shards): 100%|██████████| 452/452 [00:00<00:00, 144741.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 76/76 [00:00<00:00, 31322.31 examples/s]


In [119]:
_ner.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 406/406 [00:00<00:00, 142560.69 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 68/68 [00:00<00:00, 30813.82 examples/s]


## ner-cls

In [None]:
# notes
# OK

In [133]:
name = 'fingpt-ner-cls'

ner_cls = get_ds_and_check_deletion(name)

train deleted: 0.15956897187984354
test deleted: 0.24271844660194175


In [134]:
_ner_cls = filter_non_dutch(ner_cls, ['input','instruction'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [135]:
ner_cls.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 11387/11387 [00:00<00:00, 1303650.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2652/2652 [00:00<00:00, 697254.07 examples/s]


In [136]:
_ner_cls.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 9071/9071 [00:00<00:00, 1113709.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1959/1959 [00:00<00:00, 602084.09 examples/s]


## headline

In [61]:
# notes
# translations done
# OK

In [84]:
name = 'fingpt-headline'

temp = get_ds_and_check_deletion(name)

train deleted: 0.0006450749138885437
test deleted: 0.00048668905436322074


In [85]:
corrections = {
'Gaat de krantenkop over prijs? Kies alstublieft een antwoord uit {Ja/Nee}' : 'Gaat de krantenkop over de prijs? Kies alstublieft een antwoord uit {Ja/Nee}',    
'Gaat de krantenkop over prijs staying constant': "Gaat de krantenkop over de prijs die constant blijft",
'Gaat de krantenkop over prijs going down': "Gaat de krantenkop over de prijs die omlaag gaat",
'Gaat de krantenkop over prijs going up': "Gaat de krantenkop over de prijs die omhoog gaat",
'Gaat de krantenkop over prijs in het verleden': "Gaat de krantenkop over de prijs in het verleden",
'Gaat de krantenkop over prijs in de toekomst' : "Gaat de krantenkop over de prijs in de toekomst",
'Gaat de krantenkop over prijs in de future' : "Gaat de krantenkop over de prijs in de toekomst",
'Gaat de krantenkop over prijs in the past' :  "Gaat de krantenkop over de prijs in het verleden",
'Gaat de krantenkop over prijs constant blijven' : 'Gaat de krantenkop over de prijs die constant blijft',
'Yes' : 'Ja',
'No' : 'Nee'
}

train = temp['train'].to_pandas()
test = temp['test'].to_pandas()

train['instruction'] = train.instruction.apply(lambda x : translate_sentence(x,corrections))
test['instruction'] = test.instruction.apply(lambda x : translate_sentence(x,corrections))

train['output'] = train.output.apply(lambda x : translate_sentence(x,corrections))
test['output'] = test.output.apply(lambda x : translate_sentence(x,corrections))


headline = DatasetDict({
        'train': Dataset.from_pandas(train, preserve_index=False),
        'test': Dataset.from_pandas(test, preserve_index=False)
    })

In [86]:
_headline = filter_non_dutch(headline, ['input'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [89]:
headline.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 82108/82108 [00:00<00:00, 2544316.15 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 20537/20537 [00:00<00:00, 2176971.83 examples/s]


In [90]:
_headline.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 36751/36751 [00:00<00:00, 1836719.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9094/9094 [00:00<00:00, 1327082.34 examples/s]


## alpaca with input

In [None]:
# notes
# OK

In [8]:
name = 'finance-alpaca-with-input'

alapaca_input_temp = get_ds_and_check_deletion(name)

alpaca_with_input = DatasetDict()

for key in alapaca_input_temp.keys():
    df = alapaca_input_temp[key].to_pandas()

    df = df.drop(columns = ['text'])

    alpaca_with_input[key] = Dataset.from_pandas(df, preserve_index=False)


train deleted: 0.04541768045417682


In [10]:
_alpaca_with_input = filter_non_dutch(alpaca_with_input, ['input','instruction', 'output'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [13]:
alpaca_with_input.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 18832/18832 [00:00<00:00, 1515805.96 examples/s]


In [14]:
_alpaca_with_input.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 11118/11118 [00:00<00:00, 1395298.52 examples/s]


## alpaca without input

In [15]:
name = 'finance-alpaca-without-input'

alapaca_without_input_temp = get_ds_and_check_deletion(name)

alpaca_without_input = DatasetDict()

for key in alapaca_without_input_temp.keys():
    df = alapaca_without_input_temp[key].to_pandas()

    df = df.drop(columns = ['text'])

    alpaca_without_input[key] = Dataset.from_pandas(df, preserve_index=False)

train deleted: 0.01854261548471048


In [18]:
_alpaca_without_input = filter_non_dutch(alpaca_without_input, ['instruction', 'output'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [22]:
alpaca_without_input.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 48272/48272 [00:00<00:00, 1376243.53 examples/s]


In [23]:
_alpaca_without_input.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 32473/32473 [00:00<00:00, 1413642.56 examples/s]


## convfinqa

In [39]:
name = 'fingpt-convfinqa'

convfinqa = get_ds_and_check_deletion(name)

train deleted: 0.029989193083573507
test deleted: 0.024161073825503365


In [44]:
_convfinqa = filter_non_dutch(convfinqa, ['input','instruction'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [48]:
convfinqa.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 10771/10771 [00:00<00:00, 306200.68 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1454/1454 [00:00<00:00, 215731.65 examples/s]


In [49]:
_convfinqa.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 10713/10713 [00:00<00:00, 290523.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1453/1453 [00:00<00:00, 212323.58 examples/s]


## fiqa-qa

In [75]:
name = 'fingpt-fiqa_qa'

fiqa_qa_temp = get_ds_and_check_deletion(name)

train deleted: 0.007247223845704309


In [76]:
fiqa_qa = fiqa_qa_temp['train'].train_test_split(test_size=0.15, seed=42)

In [78]:
_fiqa_qa = filter_non_dutch(fiqa_qa, ['input','instruction', 'output'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [82]:
fiqa_qa.save_to_disk(f'../data/final_unfiltered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 14438/14438 [00:00<00:00, 194020.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2548/2548 [00:00<00:00, 176217.89 examples/s]


In [84]:
_fiqa_qa.save_to_disk(f'../data/final_filtered/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 12310/12310 [00:00<00:00, 795144.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2187/2187 [00:00<00:00, 496989.91 examples/s]


# Combine The Datasets

## filtered

In [110]:
path_to_data = '/home/sandernoels/fingeit/data/final_filtered'

frames = []

for folder_name in os.listdir(path_to_data):
    if os.path.isdir(os.path.join(path_to_data, folder_name)): 
        temp = load_from_disk(f'../data/final_filtered/{folder_name}')['train'].to_pandas()
        temp['origin']  = folder_name
        frames.append(temp)

In [119]:
temp = pd.concat(frames)
final_filtered = temp[~temp.duplicated()].reset_index(drop = True)

In [121]:
name = 'filtered_instruction_tuning_dataset'

filtered_it_ds = DatasetDict()
filtered_it_ds['train'] = Dataset.from_pandas(final_filtered, preserve_index=False)
filtered_it_ds.save_to_disk(f'../data/final/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 147788/147788 [00:00<00:00, 1270723.46 examples/s]


## unfiltered

In [125]:
path_to_data = '/home/sandernoels/fingeit/data/final_unfiltered'

frames = []

for folder_name in os.listdir(path_to_data):
    if os.path.isdir(os.path.join(path_to_data, folder_name)): 
        temp = load_from_disk(f'../data/final_unfiltered/{folder_name}')['train'].to_pandas()
        temp['origin']  = folder_name
        frames.append(temp)

In [126]:
temp = pd.concat(frames)
final_unfiltered = temp[~temp.duplicated()].reset_index(drop = True)

In [127]:
name = 'unfiltered_instruction_tuning_dataset'

unfiltered_it_ds = DatasetDict()
unfiltered_it_ds['train'] = Dataset.from_pandas(final_unfiltered, preserve_index=False)
unfiltered_it_ds.save_to_disk(f'../data/final/{name}')

Saving the dataset (1/1 shards): 100%|██████████| 246883/246883 [00:00<00:00, 1426268.56 examples/s]
