In [None]:
!pip install -q langchain
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install pypdf
!pip install -q openai
!pip install -q tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.1/803.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.7/205.7 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import pandas as pd
import openai
import warnings
warnings.filterwarnings('ignore')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,TextSplitter,CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

In [None]:
clean_df = pd.read_json('path_to_cleaned_json')

In [None]:
list_of_documents=[]
 
for clean_exp in clean_df.head(1000).clean_explanation.values:
  text_splitter = CharacterTextSplitter(separator='\n',chunk_size=256,chunk_overlap=16)s
  list_of_documents.extend(text_splitter.split_documents([Document(page_content=clean_exp)]))

In [None]:
for pdf in ["pdf1_path","pdf2_path"]:
  loader = PyPDFLoader(pdf)
  pages = loader.load_and_split()
  list_of_documents.extend(text_splitter.split_documents(pages))

In [None]:
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [None]:
db = FAISS.from_documents(list_of_documents, embeddings)

In [None]:
os.environ["OPENAI_API_KEY"] = "api_key"

In [None]:
retriever = db.as_retriever(
    search_type="mmr", search_kwargs={"k": 10}
)
# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.5),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [None]:
qa("For the question"+ "Mountain waves should be expected..? choose the correct answer from the following answers 1 directly over the mountain range,2 on the downwind side of the mountain range.3 on the upwind side of the mountain range.4 when instability is high.")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'For the questionMountain waves should be expected..? choose the correct answer from the following answers 1 directly over the mountain range,2 on the downwind side of the mountain range.3 on the upwind side of the mountain range.4 when instability is high.',
 'result': ' 2. on the downwind side of the mountain range.',
 'source_documents': [Document(page_content='10.  A mountain range is aligned in an east/west direction. Select the conditions from \nthe table below that will give rise to mountain waves:\n 2000 ft  5000 ft  10 000 ft \na. 020/40  020/30  020/50 \nb. 170/20  190/40  210/60\nc. 270/15  270/20  270/40\nd. 090/20  090/40  090/60\n11.  For mountain waves to form, the wind direction must be near perpendicular to a \nridge or range of mountains and the speed must:\na. decrease with height within a stable layer above the hill', metadata={'source': '/content/vdoc.pub_cae-oxford-aviation-academy-atpl-book-9-meteorology.pdf', 'page': 126}),
  Document(page_content='sta

In [None]:
answers=[]
sources=[]
from tqdm import tqdm
for i in tqdm(range(1000,1452)):
  question=clean_df['question'].iloc[i]
  option1=clean_df['answers'].iloc[i][0]['answer']
  option2=clean_df['answers'].iloc[i][1]['answer']
  option3=clean_df['answers'].iloc[i][2]['answer']
  option4=clean_df['answers'].iloc[i][3]['answer']
  # question="For the question"+ question +"choose the correct answer from the following answers" + option1 +","+ option2 +","+ option3 + ","+option4
  question="You are an aspiring Pilot. To get the Pilot license you have to clear the exam which consists of meterology, physics and maths topics. For the question: "+ question +" ,return only the exact string of the correct answer from the following options: " + option1 +", "+ option2 +", "+ option3 + ", "+option4
  response = qa({"query": question},return_only_outputs=True)
  sources.append(response['source_documents'])
  #print(response['result'])
  answers.append(response['result'].lower().replace(clean_df['question'].iloc[i].lower(),'').strip())

In [None]:
columns = ['predicted_answer', 'groundtruth', 'matches?','options','source']  # Replace with your column names
result_df = pd.DataFrame(columns=columns)

In [None]:
result_df['groundtruth'] = result_df['groundtruth'].str.lower()

In [None]:
import string
cnt=0
j=0
for i in range(len(answers)):
  matches=0
  if answers[j].lower().strip().translate(str.maketrans('', '', string.punctuation)).replace('.','')==clean_df.tail(452)['correct_answer'].values[i].lower().strip().translate(str.maketrans('', '', string.punctuation)).replace('.',''):
    cnt+=1
    matches=1

  row_data = {'source': sources[j], 'predicted_answer': answers[j], 'groundtruth': clean_df.tail(452)['correct_answer'].values[i], 'matches?': matches, 'options':clean_df.tail(452)['answers'].values[i]}  # Replace with your values
  result_df = result_df.append(row_data, ignore_index=True)
  j+=1
print(str(cnt)+" out of " + str(len(answers))+" are correct")

229 out of 389 are correct


In [4]:

def rouge_l_score(reference, generated):
    smoothing_function = SmoothingFunction().method1  # Choose a smoothing function

    # Tokenize the strings into lists of words
    reference_tokens = reference.split()
    generated_tokens = generated.split()

    # Compute ROUGE-L score
    rouge_l = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing_function)

    return rouge_l


def map_to_option_id_gd(row):
  target_answer=row['groundtruth']
  answer_options=row['options']
  for option in answer_options:
      if target_answer.lower() in option['answer'].lower():
          return int(option['id'])
  return None

def map_to_option_id_pred(row):
  target_answer=row['predicted_answer']
  answer_options=row['options']
  maxrouge=0
  id=None
  for option in answer_options:
      rouge_lscore=rouge_l_score(option['answer'].lower(),target_answer.lower())
      if rouge_lscore > maxrouge:
          maxrouge=rouge_lscore
          id=int(option['id'])
  return id

In [5]:
result_df['result_id_groundtruth'] = result_df.apply(map_to_option_id_gd, axis=1)

In [6]:
result_df['result_id_prediction']=result_df.apply(map_to_option_id_pred, axis=1)

In [7]:
result_df['matches_ids']=result_df.result_id_groundtruth==result_df.result_id_prediction

In [11]:
result_df.matches_ids.value_counts()[True]/len(result_df)

0.6084070796460177