In [126]:
import ast
import json
import os
from collections import Counter
from enum import Enum
from io import StringIO

import numpy as np
import pandas as pd

from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import JsonOutputParser
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_pinecone import PineconeVectorStore
from time import perf_counter, process_time

from utils import *

%reload_ext autoreload
%autoreload 2

from openai import OpenAI


In [2]:
# OpenAI model name and the locations of the evaluation inputs.
MODEL = 'gpt-3.5-turbo-0301'
BASE_FOLDER = "./test_data"
QUESTION_FILE = "document_questions.xlsx"
RAW_DATA_FOLDER = "raw_text"

# The API key is taken from the environment so it never appears in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [4]:
# Questions come from the spreadsheet; source documents come from the 'pdfs' sub-folder.
questions_path = os.path.join(BASE_FOLDER, QUESTION_FILE)
pdf_folder = os.path.join(BASE_FOLDER, 'pdfs')

df = pd.read_excel(questions_path)
docs = retrieve_pdf_docs(pdf_folder)

# DB Speed Comparison

Compares the query (vector retrieval) times of the three vector stores tested: FAISS, Chroma, and Pinecone.

In [8]:
# Settings for the retrieval-speed experiment.
# CHUNK_SIZE / CHUNK_OVERLAP are the parameters of the character-based splitter
# (they only appear in the commented-out RecursiveCharacterTextSplitter line below).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Vector store backend identifier; 'faiss' is the value given to get_retriever in this notebook.
DATABASE  = 'faiss'
# Passed as the third argument to get_retriever — presumably the number of chunks
# returned per query; confirm against utils.get_retriever.
K = 3

In [51]:
# Semantic chunking is the active strategy; the character-based alternative is kept for reference:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
text_splitter = SemanticChunker(
    OpenAIEmbeddings()
)
splits = text_splitter.split_documents(docs)

In [54]:
# Use the DATABASE setting instead of a hard-coded backend name, so switching the
# store under test only requires editing the configuration cell.
retriever = get_retriever(splits, DATABASE, K)

In [55]:
# Number of questions used for the timing sample.
N_TIMING_QUERIES = 10

# Wall-clock time of each retrieval call, in seconds.
execution_times = []
# head() caps the sample at the number of available questions, so a short
# question file no longer raises IndexError.
for question in df['relevant questions'].head(N_TIMING_QUERIES):
    start_time = perf_counter()
    context = retriever.invoke(question)
    execution_times.append(perf_counter() - start_time)

print(f"Avg Query Time : {np.round(np.mean(execution_times),3)} +/- {np.round(np.std(execution_times),2)}")

Avg Query Time : 0.143 +/- 0.02


# Splitting Strategy Comparison

Performs a qualitative comparison of different chunking/splitting strategies.

#### Generating sets of different splits

In [74]:
# One splitter per strategy under comparison.
semantic_text_splitter = SemanticChunker(OpenAIEmbeddings())
recursive_text_splitter1 = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
recursive_text_splitter2 = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)

# Strategy label -> splitter, in the order the strategies are evaluated.
_splitters_by_label = {
    'semantic': semantic_text_splitter,
    'recursive_1000_200': recursive_text_splitter1,
    'recursive_2000_300': recursive_text_splitter2,
}

# Strategy label -> chunks produced from the same set of documents.
splits_map = {}
for _label, _splitter in _splitters_by_label.items():
    splits = _splitter.split_documents(docs)
    splits_map[_label] = splits





#### Evaluating responses to individual questions

In [67]:
question_id = 2

# Fetch the question by position, then print the top-ranked chunk for every strategy.
question = df['relevant questions'].iloc[question_id]
print("*" * 30)
print(f"QUESTION : {question}")
for key, strategy_splits in splits_map.items():
    retriever = get_retriever(strategy_splits, 'faiss', K)
    context = retriever.invoke(question)
    print("*" * 10)
    print(f"{key} : \n{context[0].page_content}")
    

******************************
QUESTION : What was the revolution witnessed by finance in the mid to later part of the 20th century?
**********
semantic : 
Second there is a large amount of ﬁnancial data available for testing. It will be interesting to see if,
sometime in the future, ﬁnancial economists eventually replace the stylized theories of equilibrium market
dynamics with a more realistic picture of the continuing struggle of learning and adapting agents who push
markets in the direction of eﬃciency, even though they never quite reach this goal. 42
**********
recursive_1000_200 : 
1 Introduction
In the mid to later part of the 20th century, ﬁnance witnessed a revolution. The advent of the eﬃcient
markets hypothesis, the capital asset pricing model, and the Black/Scholes options pricing formula put the
ﬁeld on a new, solid scientiﬁc foundation. This world was built on the assumption that asset markets were
powerful computational engines, and were able to aggregate and process the

In [75]:
question_id = 4

# Same inspection as above for another question: best chunk per splitting strategy.
question = df['relevant questions'].iloc[question_id]
print("*" * 30)
print(f"QUESTION : {question}")
for key, strategy_splits in splits_map.items():
    # A fresh retriever is built over each strategy's chunks.
    retriever = get_retriever(strategy_splits, 'faiss', K)
    context = retriever.invoke(question)
    top_chunk = context[0]
    print("*" * 10)
    print(f"{key} : \n{top_chunk.page_content}")
    

******************************
QUESTION : Why are the financial markets appealing applications for agent-based methods?
**********
semantic : 
Unfortunately, this may also make them diﬃcult to estimate
using traditional econometric tools. Agent-based modelers should be aware of these nonlinear issues, and
take them into account when evaluating market simulations. Financial markets are an important challenge for agent-based computational modelers. Financial markets
may be one of the important early areas where agent-based methods show their worth, for two basic reasons. 41
**********
recursive_1000_200 : 
behavior. Several of the models covered here have already done this, and more examples of using experiments
are given in Duﬀy (2005). Finance experiments are particularly appealing since they often can be done at
time scales that are reasonable for the real data. It is more credible that you can simulate a day of trading
in the laboratory, than to simulate someone’s entire life cycle.


In [85]:
question_id = 20

# Show the highest-ranked chunk each splitting strategy retrieves for this question.
question = df['relevant questions'].iloc[question_id]
print("*" * 30)
print(f"QUESTION : {question}")
for key, strategy_splits in splits_map.items():
    retriever = get_retriever(strategy_splits, 'faiss', K)
    context = retriever.invoke(question)
    best_text = context[0].page_content
    print("*" * 10)
    print(f"{key} : \n{best_text}")
    

******************************
QUESTION : The participating schools of the Traffic Bowl Competition were from what all states in the US?
**********
semantic : 
This year we had 52 students from six universities attending the event. Par ticipating schools were: 
/square4 Portland State University 
/square4 University of Idaho 
/square4 University of Portland 
/square4 Oregon State University 
/square4 University of Washington 
/square4 Oregon Institute of Technology 
 
University of Portland took home the grand prize of bragging rights , a trophy and a $400 scholarship 
award. University of Washington and Oregon Institute of Technology  both received $300 for tying 
for second place. The remaining three participating schools rec eived a $150 participation award. All 
of the student attendees received a free dinner at this ev ent. In order to offset the cost of the student 
meals and scholarships, several local companies donated funds to  support student attendance. Donations were receiv

# Self Eval Result Evaluation

A deep-dive analysis of the results of the self-evaluation experiments.

In [133]:
# Collect one results frame per (chunk_size, chunk_overlap, k) configuration.
df_list = []
for chunk_size, chunk_overlap in [[1000, 200], [2000, 200], [500, 200]]:
    for k in [1, 2, 3, 4, 5]:
        file_name = f'output_{chunk_size}_{chunk_overlap}_{k}_faiss.csv'

        # A dedicated name keeps the loop from overwriting the notebook-level `df`.
        run_results = pd.read_csv(os.path.join('./results/', file_name))

        # assign() returns a new frame, so no defensive copy is needed.
        # `id` is the row position within the file, used later to line up the
        # same question across configurations.
        df_list.append(
            run_results.assign(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                k=k,
                id=list(range(len(run_results))),
            )
        )
        

In [134]:
# Stack the per-configuration frames row-wise; each frame keeps its own row labels,
# so index values repeat across configurations.
df = pd.concat(df_list, axis='index')

In [135]:
# Total number of result rows across all configurations.
df.shape[0]

495

In [136]:
# The grade column holds a stringified Python list. literal_eval parses it without
# executing arbitrary code, which eval() would do on whatever the CSV contains.
ast.literal_eval(df.grade.iloc[0])

['Correct', 'Correct', 'Correct']

In [137]:
# Preview the first five combined result rows.
df.head(n=5)

Unnamed: 0,question,answer,context,grade,eval_description,chunk_size,chunk_overlap,k,id
0,"What is meant by ""computational finance""?",Computational finance involves using computati...,to ﬁnancial economists as it is potentially un...,"['Correct', 'Correct', 'Correct']",['The answer provides a clear and concise expl...,1000,200,1,0
1,What is meant by 'investor heterogeneity'?,'Investor heterogeneity' refers to the diversi...,from other more general heterogeneous agent mo...,"['Correct', 'Correct', 'Correct']","[""The answer provides a clear and concise expl...",1000,200,1,1
2,What was the revolution witnessed by finance i...,The revolution witnessed by finance in the mid...,1 Introduction\nIn the mid to later part of th...,"['Correct', 'Correct', 'Correct']",['The answer provides a detailed explanation o...,1000,200,1,2
3,Why do you think financial markets are viewed ...,Financial markets are viewed as interacting gr...,Financial markets are particularly appealing a...,"['Correct', 'Correct', 'Correct']",['The answer provides a clear explanation of w...,1000,200,1,3
4,Why are the financial markets appealing applic...,Financial markets are appealing applications f...,are given in Duﬀy (2005). Finance experiments ...,"['Correct', 'Correct', 'Correct']",['The answer provides a clear explanation of w...,1000,200,1,4


In [138]:
def _majority_is_correct(raw_grades):
    """Return 1 if the most common grade in a stringified grade list is 'Correct', else 0.

    `raw_grades` is the text form of a Python list of grade labels as stored in the
    results CSV. It is parsed with ast.literal_eval, so file contents are never
    executed as code. An empty list raises IndexError.
    """
    grades = ast.literal_eval(raw_grades)
    # most_common(1) yields the single most frequent label with its count.
    top_grade, _count = Counter(grades).most_common(1)[0]
    return 1 if top_grade == 'Correct' else 0


# Majority vote over the repeated gradings, encoded as 1 (correct) / 0 (anything else).
df['final_grade'] = df.grade.apply(_majority_is_correct)

In [228]:
# Mean of the 0/1 final grade per k, reported as accuracy.
out = (
    df.groupby('k')['final_grade']
      .mean()
      .rename('accuracy')
      .reset_index()
)
out

Unnamed: 0,k,accuracy
0,1,0.878788
1,2,0.848485
2,3,0.848485
3,4,0.828283
4,5,0.858586


In [230]:
# Mean of the 0/1 final grade per chunk size, reported as accuracy.
out = (
    df.groupby('chunk_size')['final_grade']
      .mean()
      .reset_index(name='accuracy')
)
out

Unnamed: 0,chunk_size,accuracy
0,500,0.848485
1,1000,0.842424
2,2000,0.866667


In [141]:
# Mean final grade for every (chunk_size, k) combination.
(
    df.groupby(['chunk_size', 'k'])[['final_grade']]
      .mean()
      .reset_index()
)

Unnamed: 0,chunk_size,k,final_grade
0,500,1,0.878788
1,500,2,0.848485
2,500,3,0.848485
3,500,4,0.818182
4,500,5,0.848485
5,1000,1,0.909091
6,1000,2,0.818182
7,1000,3,0.848485
8,1000,4,0.787879
9,1000,5,0.848485


In [143]:
# Mean final grade per question id across all configurations, lowest first.
(
    df.groupby('id')['final_grade']
      .mean()
      .sort_values()
)

id
7     0.000000
15    0.200000
24    0.200000
14    0.333333
20    0.400000
9     0.533333
27    0.600000
12    0.933333
11    0.933333
21    1.000000
22    1.000000
0     1.000000
25    1.000000
26    1.000000
28    1.000000
29    1.000000
30    1.000000
23    1.000000
19    1.000000
16    1.000000
17    1.000000
31    1.000000
13    1.000000
10    1.000000
8     1.000000
6     1.000000
5     1.000000
4     1.000000
3     1.000000
2     1.000000
1     1.000000
18    1.000000
32    1.000000
Name: final_grade, dtype: float64

In [183]:
question_id = 20
# All result rows for this question, one per configuration.
df.loc[df['id'] == question_id]

Unnamed: 0,question,answer,context,grade,eval_description,chunk_size,chunk_overlap,k,id,final_grade
20,The participating schools of the Traffic Bowl ...,The participating schools of the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Incorrect', 'Correct', 'Correct']",['The answer provided is partially correct as ...,1000,200,1,20,1
20,The participating schools of the Traffic Bowl ...,The participating schools in the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Incorrect', 'Correct', 'Incorrect']",['The answer provided is partially correct as ...,1000,200,2,20,0
20,The participating schools of the Traffic Bowl ...,The participating schools of the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Incorrect', 'Incorrect', 'Incorrect']",['The answer provided is partially correct as ...,1000,200,3,20,0
20,The participating schools of the Traffic Bowl ...,The participating schools of the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Incorrect', 'Incorrect', 'Incorrect']",['The answer provided is partially correct as ...,1000,200,4,20,0
20,The participating schools of the Traffic Bowl ...,The participating schools of the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Correct', 'Correct', 'Correct']",['The answer provides a clear and concise list...,1000,200,5,20,1
20,The participating schools of the Traffic Bowl ...,The participating schools of the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Incorrect', 'Incorrect', 'Incorrect']",['The answer provided is partially correct as ...,2000,200,1,20,0
20,The participating schools of the Traffic Bowl ...,The participating schools in the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Correct', 'Correct', 'Correct']",['The answer provides a detailed list of the s...,2000,200,2,20,1
20,The participating schools of the Traffic Bowl ...,The participating schools in the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Correct', 'Correct', 'Correct']",['The answer provides a detailed list of the p...,2000,200,3,20,1
20,The participating schools of the Traffic Bowl ...,The participating schools in the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Correct', 'Correct', 'Correct']",['The answer provides a comprehensive list of ...,2000,200,4,20,1
20,The participating schools of the Traffic Bowl ...,The participating schools in the Traffic Bowl ...,The Student Liaison Committee had another succ...,"['Correct', 'Correct', 'Correct']",['The answer provides a detailed list of the p...,2000,200,5,20,1


In [222]:
row_id = 9
# Question text of the row_id-th result row for the selected question.
df.loc[df['id'] == question_id, 'question'].iloc[row_id]

'The participating schools of the Traffic Bowl Competition were from what all states in the US?'

In [223]:
# Chunk size of the configuration that produced the selected row.
df.loc[df['id'] == question_id, 'chunk_size'].iloc[row_id]

2000

In [224]:
# Generated answer stored in the selected row.
df.loc[df['id'] == question_id, 'answer'].iloc[row_id]

"The participating schools in the Traffic Bowl Competition were from six states in the US: Oregon, Washington, Idaho, California, Arizona, and Nevada. The event was held at McMenamin's Edgefield in Portland, Oregon on November 15, 2007. The participating schools were Portland State University, University of Idaho, University of Portland, Oregon State University, University of Washington, and Oregon Institute of Technology."

In [225]:
# Raw (stringified) list of grades stored in the selected row.
df.loc[df['id'] == question_id, 'grade'].iloc[row_id]

"['Correct', 'Correct', 'Correct']"

In [226]:
# Context text stored in the selected row.
df.loc[df['id'] == question_id, 'context'].iloc[row_id]

"The Student Liaison Committee had another successful year in attracting schools from the Pacific \nNorthwest to compete in the Oregon Section's annual Traffic Bowl Competition. The Traffic Bowl is \na Jeopardy-based trivia contest where students must answer ques tions on a variety of traffic and \ntransportation trivia. This year the competition was held on Novem ber 15, 2007 at McMenamin's \nEdgefield just east of Portland, Oregon.  \nThis year we had 52 students from six universities attending the event. Par ticipating schools were: \n/square4 Portland State University \n/square4 University of Idaho \n/square4 University of Portland \n/square4 Oregon State University \n/square4 University of Washington \n/square4 Oregon Institute of Technology \n \nUniversity of Portland took home the grand prize of bragging rights , a trophy and a $400 scholarship \naward. University of Washington and Oregon Institute of Technology  both received $300 for tying \nfor second place.  The remaining th