#### TODO>:
- get data from kaggle try some processing techniques then tokenizer

In [1]:
import os; 
import torch
import zipfile
import numpy as np
import pandas as pd

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [2]:
zip_file_path = "..\data\FinancialPhraseBank-v1.0.zip"
extract_dir = "../data/"
base_path = '../data/FinancialPhraseBank-v1.0'
corpus_path = "../data/news_corpus.txt"
tokenizer_path = "../models/finbpe_tokenizer.json"

In [3]:
def extract_zipfile(zip_file_path, extract_dir):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_content = zip_ref.namelist()
        print(f"File inside the zip file: {zip_content}")
        zip_ref.extractall(extract_dir)
    print(f"File extracted at path: {extract_dir}")

# -------------------------------------------------------------------------------------------------------
def handle_data(file_paths: list, to_save: bool = True):
    content = {'news': [], 'sentiment': []}   
    for file_path in file_paths:
        print(f"Handling file: {file_path}")
        try:
            with open(file_path, 'r') as file:data = file.read()
            data_mid = [h.split("@") for h in data.split('\n') if '@' in h]
            for news, sentiment in data_mid:
                content['news'].append(news);content['sentiment'].append(sentiment)
            print(f"{file_path} handled successfully")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    if to_save:
        to_return = pd.DataFrame(content)
        to_return.to_csv(to_save)
        print(f"Data saved as {to_save}")
    else:
        to_return = content
        print("Data processed, not saved as DataFrame")
    return to_return

#----------------------------------------------------------------------------------------------------------
def reencode_file_to_utf8(filepath: str):
    with open(filepath, 'r', encoding='ISO-8859-1') as f:content = f.read()
    with open(filepath, 'w', encoding='utf-8') as f:f.write(content)
    print(f"File re-encoded to UTF-8 and saved at: {filepath}")

# ---------------------------------------------------------------------------------------------------------
def save_corpus(data: pd.DataFrame, filepath: str) -> None:
    # Ensure the directory exists
    directory = os.path.dirname(filepath)
    if not os.path.exists(directory):os.makedirs(directory)
    text = ' '.join(data['news'].tolist())
    with open(filepath, 'w') as cnt:cnt.write(text)
    print(f"News Corpus saved at: {filepath}")
    reencode_file_to_utf8(filepath)

#-----------------------------------------------------------------------------------------------------------
def get_tokenizer(train:bool=True, data_path:list=None, vocab_size:int=None, tokenizer_path:str="models/finbpe_tokenizer.json", special_tokens:list=None):
    if train:
        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]"] if special_tokens is None else special_tokens
        tokenizer = Tokenizer(BPE()); tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=2, special_tokens=special_tokens)
        tokenizer.train(data_path); tokenizer.save(tokenizer_path)
        print(f"Tokenizer saved at: {tokenizer_path}")
    else:
        tokenizer = Tokenizer.from_file(path=tokenizer_path)
        print(f"Tokenizer loaded from {tokenizer_path}")
    return tokenizer

In [4]:
extract_zipfile(zip_file_path, extract_dir)
file_paths = os.listdir(extract_dir + 'FinancialPhraseBank-v1.0')[2:]
full_paths = [os.path.join(base_path, file) for file in file_paths]; full_paths

File inside the zip file: ['FinancialPhraseBank-v1.0/', 'FinancialPhraseBank-v1.0/License.txt', '__MACOSX/', '__MACOSX/FinancialPhraseBank-v1.0/', '__MACOSX/FinancialPhraseBank-v1.0/._License.txt', 'FinancialPhraseBank-v1.0/README.txt', '__MACOSX/FinancialPhraseBank-v1.0/._README.txt', 'FinancialPhraseBank-v1.0/Sentences_50Agree.txt', 'FinancialPhraseBank-v1.0/Sentences_66Agree.txt', 'FinancialPhraseBank-v1.0/Sentences_75Agree.txt', 'FinancialPhraseBank-v1.0/Sentences_AllAgree.txt']
File extracted at path: ../data/


['../data/FinancialPhraseBank-v1.0\\Sentences_50Agree.txt',
 '../data/FinancialPhraseBank-v1.0\\Sentences_66Agree.txt',
 '../data/FinancialPhraseBank-v1.0\\Sentences_75Agree.txt',
 '../data/FinancialPhraseBank-v1.0\\Sentences_AllAgree.txt']

In [5]:
data = handle_data(file_paths=full_paths, to_save=extract_dir+"finnews.csv")

Handling file: ../data/FinancialPhraseBank-v1.0\Sentences_50Agree.txt
../data/FinancialPhraseBank-v1.0\Sentences_50Agree.txt handled successfully
Handling file: ../data/FinancialPhraseBank-v1.0\Sentences_66Agree.txt
../data/FinancialPhraseBank-v1.0\Sentences_66Agree.txt handled successfully
Handling file: ../data/FinancialPhraseBank-v1.0\Sentences_75Agree.txt
../data/FinancialPhraseBank-v1.0\Sentences_75Agree.txt handled successfully
Handling file: ../data/FinancialPhraseBank-v1.0\Sentences_AllAgree.txt
../data/FinancialPhraseBank-v1.0\Sentences_AllAgree.txt handled successfully
Data saved as ../data/finnews.csv


In [6]:
save_corpus(data, corpus_path)

News Corpus saved at: ../data/news_corpus.txt
File re-encoded to UTF-8 and saved at: ../data/news_corpus.txt


In [7]:
tokenizer = get_tokenizer(train=True, data_path=[corpus_path], vocab_size=2500, tokenizer_path=tokenizer_path)

Tokenizer saved at: ../models/finbpe_tokenizer.json


In [8]:
data['sentiment'].value_counts()

sentiment
neutral     8951
positive    3988
negative    1841
Name: count, dtype: int64