In [1]:
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pytesseract
from multiprocessing import Pool
from pathlib import Path
from PIL import Image
import pdfplumber
from openai import OpenAI
from pandarallel import pandarallel

tqdm.pandas(desc="progress")
pandarallel.initialize(progress_bar=True, nb_workers=20)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
import os
import base64
import pytesseract
from pdf2image import convert_from_path
from bs4 import BeautifulSoup

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    return convert_from_path(pdf_path, fmt='tiff')

# Function to convert image to hOCR using pytesseract
def image_to_hocr(image):
    return pytesseract.image_to_pdf_or_hocr(image, extension='hocr', lang='chi_tra')

# Function to convert hOCR to markdown
def hocr_to_markdown(hocr):
    soup = BeautifulSoup(hocr, 'html.parser')
    markdown_text = ""

    for line in soup.find_all('span', class_='ocr_line'):
        line_text = " ".join([word.get_text() for word in line.find_all('span', class_='ocrx_word')])
        markdown_text += f"{line_text}"

    return markdown_text

# Main function to convert PDF to Markdown
def pdf_to_markdown(pdf_path):
    images = pdf_to_images(pdf_path)
    markdown_text = ""

    for image in images:
        hocr = image_to_hocr(image)
        markdown_text += hocr_to_markdown(hocr)

    return markdown_text.replace(' ', '')

In [3]:
class DataLoader:
    def __init__(self, question_path, answer_path, source_path):
        self.question_path = question_path
        self.answer_path = answer_path
        self.source_path = source_path

        self.stock_dict = pd.read_csv(
            '/home/xunhaoz/PycharmProjects/RAGAndLLMInFinance/finance/stock_table.csv')[[
            '公司簡稱', '公司名稱']].set_index('公司簡稱').to_dict()['公司名稱']


    def convert_company_name_to_full(self, query_text):
        for key, value in self.stock_dict.items():
            if key in query_text and value not in query_text:
                query_text = query_text.replace(key, value)
        return query_text

    def convert_season_to_month(self, query_text):
        season_dict = {
            '第1季': '5月', '第2季': '8月', '第3季': '11月', '第4季': '3月',
            '第一季': '5月', '第二季': '8月', '第三季': '11月', '第四季': '3月'}
        
        for key, value in season_dict.items():
            key_pos = query_text.find(key)
            if key_pos != -1:
                query_text = query_text[:key_pos] + value + query_text[key_pos:]    
        return query_text

    def convert_AD_to_ROC_year(self, query_text):
        year_pattern = r'(\d{4})年'
        
        year_match = re.search(year_pattern, query_text)
        
        try:
            if year_match:
                year = year_match.group(1)
                year_pos = query_text.find(year)                
                roc_year = str(int(year) - 1911)
                query_text = query_text[:year_pos] + f"{roc_year}年" + query_text[year_pos:]
        except Exception as e:
            pass

        return query_text

    def get_question(self):
        pass

    def get_answer(self):
        pass

    def get_source(self):
        pass


class FinanceDataLoader(DataLoader):
    def __init__(self, question_path, answer_path, source_path, chunk_size, chunk_overlap):
        super().__init__(question_path, answer_path, source_path)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def get_question(self):
        questions_example = json.load(open(self.question_path))['questions']
        questions_example = pd.DataFrame(questions_example).set_index('qid')
        questions_example = questions_example[questions_example['category'] == 'finance']

        # questions_example['query'] = questions_example['query'].apply(self.convert_company_name_to_full) # 0.72
        # questions_example['query'] = questions_example['query'].apply(self.convert_season_to_month) # 0.66
        # questions_example['query'] = questions_example['query'].apply(self.convert_AD_to_ROC_year) # 0.66

        return questions_example

    def get_answer(self):
        ground_truths_example = json.load(open(self.answer_path))['ground_truths']
        ground_truths_example = pd.DataFrame(ground_truths_example).set_index('qid')
        ground_truths_example = ground_truths_example[ground_truths_example['category'] == 'finance']
        return ground_truths_example

    def process_pdf_by_pdfplumber(self, pdf_path):
        pdf_text = ''
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                pdf_text += ''.join(page.extract_text())
               
        pdf_text = pdf_text.replace(' ', '')
        
        if pdf_text == '':
            pdf_text = pdf_to_markdown(pdf_path=pdf_path)
        
        return self.text_splitter.split_text(pdf_text) 

    def get_source(self):
        pdf_files = pd.DataFrame({'path': Path(self.source_path).glob('*.pdf')})
        pdf_files['pid'] = pdf_files['path'].apply(lambda x: int(x.stem))
        pdf_files['content'] = pdf_files['path'].parallel_apply(self.process_pdf_by_pdfplumber)
        pdf_files = pdf_files.explode('content')

        return pdf_files[['pid', 'content']].set_index('pid')

In [4]:
finance_dataLoader = FinanceDataLoader(
    question_path='/home/xunhaoz/PycharmProjects/RAGAndLLMInFinance/questions_preliminary.json',
    answer_path='../contest_dataset/contest_dataset/dataset/preliminary/ground_truths_example.json',
    source_path='../contest_dataset/contest_dataset/reference/finance',
    chunk_size=512,
    chunk_overlap=16
)

answer = finance_dataLoader.get_answer()
question = finance_dataLoader.get_question()
source = finance_dataLoader.get_source()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=52), Label(value='0 / 52'))), HBox…

In [5]:
class DataPreprocessor:
    def __init__(self, model="albert-tiny"):
        self.ws_driver = CkipWordSegmenter(model=model, device=0)
        self.pos_driver = CkipPosTagger(model=model, device=0)
        self.ner_driver = CkipNerChunker(model=model, device=0)

    def preprocess(self, series):
        data_frame = series.to_frame()

        data_frame['content_ws'] = self.ws_driver(data_frame[series.name])
        data_frame['content_pos'] = self.pos_driver(data_frame['content_ws'])
        data_frame['content_ner'] = [
            [ner.word for ner in ner_list if len(ner.word) > 1] for ner_list in self.ner_driver(series)]

        clean_ws_list = []
        for content_ws, content_pos, content_ner in zip(
                data_frame['content_ws'], data_frame['content_pos'], data_frame['content_ner']):
            clean_ws = []
            for ws, pos in zip(content_ws, content_pos):
                if pos.startswith('V') or pos.startswith('N') or ws in content_ner:
                    if len(ws) > 1:
                        clean_ws.append(ws)
            clean_ws_list.append(clean_ws)
        data_frame['clean_ws'] = clean_ws_list

        return data_frame['clean_ws']

In [6]:
data_preprocessor = DataPreprocessor("albert-tiny")
source['clean_ws'] = data_preprocessor.preprocess(source['content'])
question['clean_ws'] = data_preprocessor.preprocess(question['query'])

Tokenization: 100%|██████████| 9253/9253 [00:03<00:00, 2993.38it/s]
Inference: 100%|██████████| 41/41 [00:24<00:00,  1.70it/s]
Tokenization: 100%|██████████| 9253/9253 [00:01<00:00, 5179.75it/s]
Inference: 100%|██████████| 234/234 [02:20<00:00,  1.67it/s]
Tokenization: 100%|██████████| 9253/9253 [00:03<00:00, 2984.08it/s]
Inference: 100%|██████████| 41/41 [00:24<00:00,  1.66it/s]
Tokenization: 100%|██████████| 300/300 [00:00<00:00, 49799.79it/s]
Inference: 100%|██████████| 2/2 [00:00<00:00, 21.34it/s]
Tokenization: 100%|██████████| 300/300 [00:00<00:00, 62832.88it/s]
Inference: 100%|██████████| 3/3 [00:00<00:00, 20.48it/s]
Tokenization: 100%|██████████| 300/300 [00:00<00:00, 48941.70it/s]
Inference: 100%|██████████| 2/2 [00:00<00:00, 21.34it/s]


In [7]:
class Retrieval:
    """
    Random retrieval class. Top 1 retrieval is 0.02 acc. 
    """

    def retrieval(self, question, source):
        question['retrieve'] = question['source'].apply(lambda x: np.random.choice(x))
        return question


class BM25Retrieval(Retrieval):
    def __init__(self):
        pass

    def retrieval(self, question, source):
        retrieve_list = []
        for record in question.to_dict(orient='records'):
            candidate_df = source.loc[record['source']].copy()
            bm25 = BM25Okapi(candidate_df['clean_ws'])
            candidate_df['score'] = bm25.get_scores(record['clean_ws'])
            retrieve_list.append(candidate_df.sort_values('score', ascending=False).index[0])

        question['retrieve'] = retrieve_list
        return question

In [8]:
retrieval = BM25Retrieval()
question = retrieval.retrieval(question, source)


In [9]:
question[['retrieve']].to_csv('../finance_512.csv')

In [10]:
(question['retrieve'] == answer['retrieve']).mean()

ValueError: Can only compare identically-labeled Series objects

In [None]:
"""
256: 0.78
512: 0.72
1024: 0.72
1536: 0.72 
1792: 0.74 
2048: 0.78
"""

'\n256: 0.\n512: 0.72\n1024: 0.72\n1536: 0.72 \n1792: 0.74 \n2048: 0.78\n'