In [306]:
import pandas as pd
import numpy as np
import os
from io import StringIO 
import json



from enum import Enum

from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import JsonOutputParser
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_pinecone import PineconeVectorStore


from utils import *
from main import *

%reload_ext autoreload
%autoreload 2

from openai import OpenAI


In [307]:
# Notebook-level configuration.
# NOTE(review): MODEL and RAW_DATA_FOLDER are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
MODEL = "gpt-3.5-turbo-0301"
BASE_FOLDER = "./test_data"
QUESTION_FILE = "document_questions.xlsx"
RAW_DATA_FOLDER = "raw_text"

# The API key is read from the environment rather than hard-coded.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Testing the retrieve_context function

In [308]:
# Load the evaluation spreadsheet at BASE_FOLDER/QUESTION_FILE into a DataFrame.
df = pd.read_excel(os.path.join(BASE_FOLDER, QUESTION_FILE))

In [309]:
# Preview the first rows of the questions table.
df.head()

Unnamed: 0,document,relevant questions
0,DR--185549702_INTRO,"What is meant by ""computational finance""?"
1,,What is meant by 'investor heterogeneity'?
2,,What was the revolution witnessed by finance i...
3,,Why do you think financial markets are viewed ...
4,,Why are the financial markets appealing applic...


In [310]:
# Load the source documents from BASE_FOLDER/pdfs.
# retrieve_pdf_docs is defined outside this notebook (presumably in utils or main — confirm).
docs = retrieve_pdf_docs(os.path.join(BASE_FOLDER,'pdfs'))

In [311]:
# Use the first entry of the 'relevant questions' column as the sample query.
question = df['relevant questions'].iloc[0]
question

'What is meant by "computational finance"?'

### Querying with a single splitting strategy

In [312]:
# One recursive splitter with chunk_size=1000 and chunk_overlap=200.
splitting_strategy = {
    "type": "recursive",
    "params": {"chunk_size": 1000, "chunk_overlap": 200},
}

# Retrieve context for the sample question from a FAISS-backed store.
# k is presumably the number of chunks returned — confirm in retrieve_context.
context = retrieve_context(docs, splitting_strategy, question, database="faiss", k=3)

In [313]:
# Display the context retrieved with the single splitting strategy.
context

[Document(page_content='to ﬁnancial economists as it is potentially unappealing to ﬁnancial practitioners.1It is interesting to note\nthat these foundations came with a very important computational dimension. The early availability of large\nmachine-readable data sets, and the computational power to analyze them, laid the critical foundation for\nthis new ﬁnancial rigor.2In agent-based computational models the computer is once again at the center\nof a change in thinking about ﬁnancial markets. This time it is helping to pursue a world view in which\nagents may diﬀer in many ways, not just in their information, but in their ability to process information,\ntheir attitudes toward risk, and in many other dimensions.\nModels in the realm of agent-based computational ﬁnance view ﬁnancial markets as interacting groups\nof learning, boundedly-rational agents. The computer may or may not be a necessary tool to understand\nthe dynamics of these markets. This survey will concentrate on the case

### Querying with multiple splitting strategies

In [314]:
# A list of strategies is passed instead of a single dict: one recursive
# splitter plus one semantic splitter (the latter is given no params here).
splitting_strategy = [
    {"type": "recursive", "params": {"chunk_size": 1000, "chunk_overlap": 200}},
    {"type": "semantic", "params": None},
]

# Same retrieval call as for the single strategy, now with the strategy list.
context = retrieve_context(docs, splitting_strategy, question, database="faiss", k=3)

In [221]:
# Display the context retrieved with the list of splitting strategies.
# NOTE(review): this cell's execution count (221) is lower than that of the
# preceding cell (314), so the recorded output may predate the query above —
# re-run to confirm.
context

[Document(page_content='to ﬁnancial economists as it is potentially unappealing to ﬁnancial practitioners.1It is interesting to note\nthat these foundations came with a very important computational dimension. The early availability of large\nmachine-readable data sets, and the computational power to analyze them, laid the critical foundation for\nthis new ﬁnancial rigor.2In agent-based computational models the computer is once again at the center\nof a change in thinking about ﬁnancial markets. This time it is helping to pursue a world view in which\nagents may diﬀer in many ways, not just in their information, but in their ability to process information,\ntheir attitudes toward risk, and in many other dimensions.\nModels in the realm of agent-based computational ﬁnance view ﬁnancial markets as interacting groups\nof learning, boundedly-rational agents. The computer may or may not be a necessary tool to understand\nthe dynamics of these markets. This survey will concentrate on the case

# Running LLM Self Eval Tests

In this section, you can run the LLM self-evaluation pipeline using the `Experiment` class. Given the source PDFs, a splitting strategy, and the evaluation questions, the pipeline retrieves the context for every question and then has the LLM answer the question using that context. Each question-answer pair is then evaluated by an LLM and classified as either Correct or Incorrect, based on the LLM's assessment.

## Running a single experiment

In [302]:
# Recursive splitting with chunk_size=500 and chunk_overlap=100 for this run.
splitting_strategy = {
    "type": "recursive",
    "params": {"chunk_size": 500, "chunk_overlap": 100},
}

# Configure one evaluation run over the questions table and the loaded documents.
exp = Experiment(
    df=df,
    docs=docs,
    splitting_strategy=splitting_strategy,
    database="faiss",
    k=3,
    num_evals=3,
)

In [303]:
# Run the configured experiment; the recorded output shows a progress bar and a final accuracy figure.
exp.run_test()

100%|███████████████████████████████████████████| 33/33 [02:33<00:00,  4.66s/it]

Accuracy : 0.85





## Running multiple experiments 

In [305]:
# Sweep over [chunk_size, chunk_overlap] pairs and k values, building and
# running one Experiment per combination.
for chunks in [[1000, 200], [2000, 200], [500, 200]]:
    chunk_size, chunk_overlap = chunks

    for k in [5]:
        splitting_strategy = {
            "type": "recursive",
            "params": {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap},
        }

        exp = Experiment(
            df=df,
            docs=docs,
            splitting_strategy=splitting_strategy,
            database="faiss",
            k=k,
            num_evals=3,
        )

        exp.run_test()
        

100%|███████████████████████████████████████████| 33/33 [02:45<00:00,  5.01s/it]


Accuracy : 0.85


100%|███████████████████████████████████████████| 33/33 [02:40<00:00,  4.87s/it]


Accuracy : 0.88


100%|███████████████████████████████████████████| 33/33 [02:41<00:00,  4.91s/it]

Accuracy : 0.85



