# Combine Datasets

In [1]:
from huggingface_hub import hf_hub_download
import fasttext
from datasets import Dataset, DatasetDict, load_from_disk
import pickle

import sys
sys.path.append('..')

from src.data_processing.utils import is_language

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification", filename="model.bin"
)
model = fasttext.load_model(model_path)

all_data = {}

def get_ds_and_check_deletion(name):
    try:
        old = load_from_disk(f'../data/FinGPT/{name}')
    except:
        old = load_from_disk(f'../data/preprocessed/{name}')
    ds = load_from_disk(f'../data/reformatted/{name}')

    for key in ds.keys():
        perc_deleted = 1 - ds[key].num_rows/old[key].num_rows
        print(key, 'deleted:', perc_deleted)

    return ds

def return_dutch_training_samples(df, cols, model):
    is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
    return df[is_dutch]

def filter_non_dutch(ds, cols):

    new_ds = DatasetDict()
    
    for key in ds.keys():
        df = ds[key].to_pandas()
        df = return_dutch_training_samples(df, cols, model)
        new_ds[key] = Dataset.from_pandas(df, preserve_index=False)
    
    return new_ds




## FINRED

In [34]:
name = 'fingpt-finred'

finred = get_ds_and_check_deletion(name)

train deleted: 0.009362072719355519
test deleted: 0.0017605633802817433


In [58]:
_finred = filter_non_dutch(finred, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['finred'] = _finred

## FINRED-RE

In [8]:
name = 'fingpt-finred-re'

finred_re = get_ds_and_check_deletion(name)

train deleted: 0.004298245614035134
test deleted: 0.000936329588014928


In [33]:
_finred_re = filter_non_dutch(finred_re, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['finred_re'] = _finred_re

## FINRED-CLS

In [27]:
name = 'fingpt-finred-cls'

finred_cls = get_ds_and_check_deletion(name)

train deleted: 0.07915583611833144
test deleted: 0.06530017921146958


In [30]:
_finred_cls = filter_non_dutch(finred_cls, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['finred_cls'] = _finred_cls

## HEADLINE

In [24]:
name = 'fingpt-headline'

headline = get_ds_and_check_deletion(name)

train deleted: 0.0006450749138885437
test deleted: 0.00048668905436322074


In [16]:
_headline = filter_non_dutch(headline, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['headline'] = _headline

## HEADLINE-CLS

In [18]:
name = 'fingpt-headline-cls'

headline_cls = get_ds_and_check_deletion(name)

train deleted: 0.029746473387617045
test deleted: 0.02900666764004478


In [23]:
_headline_cls = filter_non_dutch(headline_cls, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['headline_cls'] = _headline_cls

## NER

In [40]:
name = 'fingpt-ner'

ner = get_ds_and_check_deletion(name)

train deleted: 0.136986301369863
test deleted: 0.2857142857142857


In [41]:
_ner = filter_non_dutch(ner, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['ner'] = _ner

## NER-CLS

In [46]:
name = 'fingpt-ner-cls'

ner_cls = get_ds_and_check_deletion(name)

train deleted: 0.15956897187984354
test deleted: 0.24271844660194175


In [49]:
_ner_cls = filter_non_dutch(ner_cls, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['ner_cls'] = _ner_cls

## SENTIMENT-CLS

In [50]:
name = 'fingpt-sentiment-cls'

sentiment_cls = get_ds_and_check_deletion(name)

train deleted: 0.0009252055428222716


In [53]:
_sentiment_cls = filter_non_dutch(sentiment_cls, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['sentiment_cls'] = _sentiment_cls

## SENTIMENT-TRAIN

In [54]:
name = 'fingpt-sentiment-train'

sentiment_train = get_ds_and_check_deletion(name)

train deleted: 0.00045589537852341877


In [58]:
_sentiment_train = filter_non_dutch(sentiment_train, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['sentiment_train'] = _sentiment_train

## FIQA-QA

In [64]:
name = 'fingpt-fiqa_qa'

fiqa_qa = get_ds_and_check_deletion(name)

train deleted: 0.007247223845704309


In [65]:
_fiqa_qa = filter_non_dutch(fiqa_qa, ['input','instruction', 'output'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['fiqa_qa'] = _fiqa_qa

## CONVFINQA

In [69]:
name = 'fingpt-convfinqa'

convfinqa = get_ds_and_check_deletion(name)

train deleted: 0.029989193083573507
test deleted: 0.024161073825503365


In [70]:
_convfinqa = filter_non_dutch(convfinqa, ['input','instruction'])

  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)
  is_duch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['convfinqa'] = _convfinqa

## ALPACA-WITH-INPUT


In [74]:
name = 'finance-alpaca-with-input'

alpaca_with_input = get_ds_and_check_deletion(name)

train deleted: 0.04541768045417682


In [78]:
_alpaca_with_input = filter_non_dutch(alpaca_with_input, ['input','instruction', 'output'])

  is_dutch = df[cols].applymap(lambda x: is_language(x, model, 'nld')).all(axis = 1)


In [None]:
all_data['alpaca_with_input'] = _alpaca_with_input

## ALPACA-WITHOUT-INPUT

In [80]:
name = 'finance-alpaca-without-input'

alpaca_without_input = get_ds_and_check_deletion(name)

train deleted: 0.01854261548471048


In [None]:
_alpaca_without_input = filter_non_dutch(alpaca_without_input, ['instruction', 'output'])

In [None]:
all_data['alpaca_without_input'] = _alpaca_without_input

## Save The Dataset

In [None]:
with open('../data/all_filtered_data.pickle', 'wb') as output:
    pickle.dump(all_data, output)