# Libraries and installation

In [1]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install -q pypdf
!pip install -q nltk

In [2]:
# Import libraries 
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import HuggingFacePipeline
from langchain.schema import Document
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


# **Load: Clean-data** 

In [3]:
DIR  = Path('/home/shsingh/knowledge_distillation/bliq/final_submission')

file_path  = DIR / 'cleanquest.json'
clean_df=pd.read_json(str(file_path))

## Split df into train and test:

> train = clean_df.head(1000)


> test = clean_df.tail(452)

In [None]:
train = clean_df.head(1000)
test = clean_df.tail(452)

# Creating **Knowledge db using FAISS**

> 1. **TextSplitter** 
> 2.  **chunk_size**

## 1. Use **Explanations** as context(knowledge db)

In [4]:
list_of_documents=[]

for clean_exp in train.clean_explanation.values:
  # text_splitter = RecursiveCharacterTextSplitter(chunk_size=128,chunk_overlap=16)#CharacterTextSplitter(sep = '\n')
  text_splitter = CharacterTextSplitter(separator="\n", chunk_size=512 ,chunk_overlap = 16)
  list_of_documents.extend(text_splitter.split_documents([Document(page_content=clean_exp)]))

Created a chunk of size 567, which is longer than the specified 512
Created a chunk of size 557, which is longer than the specified 512
Created a chunk of size 762, which is longer than the specified 512
Created a chunk of size 557, which is longer than the specified 512
Created a chunk of size 762, which is longer than the specified 512
Created a chunk of size 559, which is longer than the specified 512
Created a chunk of size 524, which is longer than the specified 512
Created a chunk of size 514, which is longer than the specified 512
Created a chunk of size 700, which is longer than the specified 512
Created a chunk of size 564, which is longer than the specified 512
Created a chunk of size 561, which is longer than the specified 512
Created a chunk of size 603, which is longer than the specified 512
Created a chunk of size 762, which is longer than the specified 512
Created a chunk of size 697, which is longer than the specified 512
Created a chunk of size 650, which is longer tha

## 2. Use **Meteorology books** as context(knowledge db)

In [5]:
# list_of_documents=[]
for pdf in ["/path_to/book1.pdf",'/path_to/book2.pdf"']:
  loader = PyPDFLoader(pdf)
  pages = loader.load_and_split()
  list_of_documents.extend(text_splitter.split_documents(pages))

## 3. Embedding Model

In [6]:
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

## 4. Create External-Knowledge db

In [7]:
db = FAISS.from_documents(list_of_documents, embeddings)

## 5. Preventing data leakage:

Filter redundant information present in the external knowledge db: 

remove based on --> 'Metadata': Page no.

In [8]:
pages_toremove=[15,16,17,18,19, 28, 29, 30, 31, 32, 33,40,41, 54, 55, 56, 57, 58,59,76,77,78,79,88,89, 90,91,102,103,104,105, 106,107,124,125,126,127,128,129,143,144,145,146,147,148,149,150,151,152,153,174,175,176,177,178,179,180,181,191,192,193,194,195,196,211,212,213,214,215,216,217,218,219,230,231,232,234,235,251,252,253,254,270,271,283,284,285,286,287,288,289,301,302,303,304,305,323,324,325,326,327,328,329,330,331,339,340,341,342,343,344,345,346,347,348,366,367,368,369,370,371,397,398,399,400,401,402,403,404,405,406,407,408,409,418,419,486,487,488,489,490,496,497,498,499,500,512,513,514,515,516,517,518,519,520,528,529,530,531,532,533,534,535,536,537,538,539,540,541]+[num for num in range(550,664)]
keys_to_remove=[]
for key in db.docstore._dict.keys():
  if 'page' in db.docstore._dict[key].metadata.keys() and db.docstore._dict[key].metadata['source']=='/home/shsingh/knowledge_distillation/bliq/ref_books/vdoc.pub_cae-oxford-aviation-academy-atpl-book-9-meteorology.pdf':
    if db.docstore._dict[key].metadata['page'] in pages_toremove:
      keys_to_remove.append(key)

In [9]:
db.delete(keys_to_remove)

True

# 6. **LLM** model

In [10]:
model_name = "google/flan-t5-xxl"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=1024)

# Define a question-answering pipeline using the model and tokenizer
question_answerer = pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=tokenizer
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0, "max_length": 1024},
)

Loading checkpoint shards: 100%|██████████| 5/5 [00:06<00:00,  1.23s/it]


## 7. **RAG**: Retriever

In [11]:
retriever = db.as_retriever(
    search_type="similarity", search_kwargs={"k": 5}
)
# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

# 8. **RAG system**: INFERENCE

In [None]:
answers=[]
sources=[]
from tqdm import tqdm
for i in tqdm(range(1000,1452)):
  question=clean_df['question'].iloc[i]
  option1=clean_df['answers'].iloc[i][0]['answer']
  option2=clean_df['answers'].iloc[i][1]['answer']
  option3=clean_df['answers'].iloc[i][2]['answer']
  option4=clean_df['answers'].iloc[i][3]['answer']
  question="For the question"+ question +"choose the correct answer from the following answers" + option1 +","+ option2 +","+ option3 + ","+option4
  response = qa({"query": question},return_only_outputs=True)
  #print(response['result'])
  sources.append(response['source_documents'])
  answers.append(response['result'].lower().replace(clean_df['question'].iloc[i].lower(),'').strip())

# Post processing and Evaluation metric

In [None]:
import pandas as pd

# Step 1: Create an empty DataFrame
columns = ['predicted_answer', 'groundtruth', 'matches?','options','sources','questions']  # Replace with your column names
result_df = pd.DataFrame(columns=columns)

### String-based matching:

> correct_answer and predicted_answer

In [None]:
import string

cnt=0
j=0
for i in range(1000,1452):
  matches=0
  if answers[j].lower().strip().translate(str.maketrans('', '', string.punctuation)).replace('.','')==clean_df['correct_answer'].values[i].lower().strip().translate(str.maketrans('', '', string.punctuation)).replace('.',''):
    cnt+=1
    matches=1
  else:
    print('wrong answer index is ')
    print(i)
  row_data = {'sources':sources[j],'predicted_answer': answers[j], 'groundtruth': clean_df['correct_answer'].values[i], 'matches?': matches, 'options':clean_df['answers'].values[i]}
  result_df = result_df.append(row_data, ignore_index=True)
  j+=1
print(str(cnt)+" out of " + str(len(answers))+" are correct")

wrong answer index is 
1004
wrong answer index is 
1005
wrong answer index is 
1006
wrong answer index is 
1008
wrong answer index is 
1011
wrong answer index is 
1012
wrong answer index is 
1013
wrong answer index is 
1017
wrong answer index is 
1018
wrong answer index is 
1019
wrong answer index is 
1024
wrong answer index is 
1026
wrong answer index is 
1027
wrong answer index is 
1028
wrong answer index is 
1030
wrong answer index is 
1031
wrong answer index is 
1037
wrong answer index is 
1041
wrong answer index is 
1042
wrong answer index is 
1044
wrong answer index is 
1046
wrong answer index is 
1048
wrong answer index is 
1049
wrong answer index is 
1050
wrong answer index is 
1051
wrong answer index is 
1052
wrong answer index is 
1055
wrong answer index is 
1056
wrong answer index is 
1058
wrong answer index is 
1059
wrong answer index is 
1060
wrong answer index is 
1061
wrong answer index is 
1062
wrong answer index is 
1063
wrong answer index is 
1064
wrong answer index i

  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data,

wrong answer index is 
1095
wrong answer index is 
1098
wrong answer index is 
1099
wrong answer index is 
1100
wrong answer index is 
1105
wrong answer index is 
1106
wrong answer index is 
1108
wrong answer index is 
1111
wrong answer index is 
1113
wrong answer index is 
1117
wrong answer index is 
1118
wrong answer index is 
1119
wrong answer index is 
1120
wrong answer index is 
1121
wrong answer index is 
1122
wrong answer index is 
1123
wrong answer index is 
1124
wrong answer index is 
1125
wrong answer index is 
1126
wrong answer index is 
1127
wrong answer index is 
1128
wrong answer index is 
1129
wrong answer index is 
1130
wrong answer index is 
1131
wrong answer index is 
1133
wrong answer index is 
1134
wrong answer index is 
1135
wrong answer index is 
1136
wrong answer index is 
1137
wrong answer index is 
1140
wrong answer index is 
1141
wrong answer index is 
1142
wrong answer index is 
1144
wrong answer index is 
1146
wrong answer index is 
1147
wrong answer index i

  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data,

wrong answer index is 
1260
wrong answer index is 
1262
wrong answer index is 
1269
wrong answer index is 
1272
wrong answer index is 
1278
wrong answer index is 
1279
wrong answer index is 
1280
wrong answer index is 
1281
wrong answer index is 
1282
wrong answer index is 
1284
wrong answer index is 
1285
wrong answer index is 
1287
wrong answer index is 
1290
wrong answer index is 
1291
wrong answer index is 
1295
wrong answer index is 
1296
wrong answer index is 
1297
wrong answer index is 
1299
wrong answer index is 
1301
wrong answer index is 
1302
wrong answer index is 
1303
wrong answer index is 
1308
wrong answer index is 
1311
wrong answer index is 
1312
wrong answer index is 
1315
wrong answer index is 
1318
wrong answer index is 
1319
wrong answer index is 
1320
wrong answer index is 
1321
wrong answer index is 
1322
wrong answer index is 
1324
wrong answer index is 
1326
wrong answer index is 
1329
wrong answer index is 
1330
wrong answer index is 
1331
wrong answer index i

  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data,

wrong answer index is 
1372
wrong answer index is 
1373
wrong answer index is 
1374
wrong answer index is 
1375
wrong answer index is 
1376
wrong answer index is 
1378
wrong answer index is 
1379
wrong answer index is 
1380
wrong answer index is 
1381
wrong answer index is 
1387
wrong answer index is 
1389
wrong answer index is 
1390
wrong answer index is 
1393
wrong answer index is 
1395
wrong answer index is 
1396
wrong answer index is 
1398
wrong answer index is 
1400
wrong answer index is 
1401
wrong answer index is 
1402
wrong answer index is 
1404
wrong answer index is 
1409
wrong answer index is 
1411
wrong answer index is 
1413
wrong answer index is 
1420
wrong answer index is 
1422
wrong answer index is 
1424
wrong answer index is 
1427
wrong answer index is 
1429
wrong answer index is 
1438
wrong answer index is 
1440
wrong answer index is 
1444
wrong answer index is 
1445
wrong answer index is 
1446
wrong answer index is 
1447
wrong answer index is 
1448
wrong answer index i

  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)
  result_df = result_df.append(row_data, ignore_index=True)


### Evaluation:

Using groundthruth and predictions after mapping predicted_answers and correct_answers to their respective option_ids 

In [None]:


def rouge_l_score(reference, generated):
    smoothing_function = SmoothingFunction().method1  # Choose a smoothing function
    # Tokenize the strings into lists of words
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    # Compute ROUGE-L score
    rouge_l = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing_function)
    return rouge_l


# mapping the groundtruth string to option id using string match
def map_to_option_id_gd(row):
  target_answer=row['groundtruth']
  answer_options=row['options']
  for option in answer_options:
      if target_answer.lower() in option['answer'].lower():
          return int(option['id'])
  return None

#matching the predicted answer to option id by calculating rougel score 
#between the predicted answer and the options and the option with the
# maximum rougel score is returned as the predicted option
def map_to_option_id_pred(row):
  target_answer=row['predicted_answer']
  answer_options=row['options']
  maxrouge=0
  id=None
  for option in answer_options:
      rouge_lscore=rouge_l_score(option['answer'].lower(),target_answer.lower())
      if rouge_lscore > maxrouge:
          maxrouge=rouge_lscore
          id=int(option['id'])
  return id

In [None]:
result_df['result_id_groundtruth'] = result_df.apply(map_to_option_id_gd, axis=1)
result_df['result_id_prediction']=result_df.apply(map_to_option_id_pred, axis=1)
result_df['matches_ids']=result_df.result_id_groundtruth==result_df.result_id_prediction

# Accuracy

In [None]:
result_df.matches_ids.value_counts()[True]/len(result_df)

0.6084070796460177

In [None]:
result_df.matches_ids.value_counts()

True     275
False    177
Name: matches_ids, dtype: int64

In [None]:
result_df.to_csv('/home/shsingh/knowledge_distillation/bliq/phsae0/result_xxl_cz512_k5.csv')

In [None]:
result_df

Unnamed: 0,predicted_answer,groundtruth,matches?,options,sources,questions,result_id_groundtruth,result_id_prediction,matches_ids
0,valley inversion.,Valley inversion.,1,"[{'id': 1, 'answer': 'Valley inversion.'}, {'i...","[page_content='In valleys, inversions are also...",,1,1.0,True
1,increases exponentially.,Increases exponentially.,1,"[{'id': 1, 'answer': 'Increases linearly betwe...",[page_content='INCORRECT\n. Raising the air te...,,2,2.0,True
2,surface friction.,Horizontal pressure difference.,0,"[{'id': 1, 'answer': 'Earth rotation.'}, {'id'...",[page_content='sloping surface.\nWhilst fronts...,,2,3.0,False
3,11 to 50 km.,11 to 50 km.,1,"[{'id': 1, 'answer': '50 to 85 km.'}, {'id': 2...",[page_content=': The altitude where the strato...,,3,3.0,True
4,snow,Hail,0,"[{'id': 1, 'answer': 'Hail'}, {'id': 2, 'answe...",[page_content='It is difficult to see how the ...,,1,2.0,False
...,...,...,...,...,...,...,...,...,...
447,cloud base lower than 2 000 ft and colder air ...,Cloud base higher than 2 000 ft and colder air...,0,"[{'id': 1, 'answer': 'Cloud base higher than 2...",[page_content='Consequently the temperature at...,,1,2.0,False
448,altocumulus,Altocumulus,1,"[{'id': 1, 'answer': 'Altocumulus'}, {'id': 2,...",[page_content='Cloud Formation and Precipitati...,,1,1.0,True
449,maritime cold air flows over a warmer surface ...,a mild moist airstream flows over snow covered...,0,"[{'id': 1, 'answer': 'maritime warm air flows ...",[page_content='Wind speeds over 5 kt are suffi...,,2,3.0,False
450,frost forming around the fuel tanks due to col...,Frost forming around the fuel tanks due to col...,1,"[{'id': 1, 'answer': 'Mixed ice on the leading...","[page_content='frost\n, similar to hoar frost ...",,3,3.0,True


# LLM Inference results

| Embedding model | TextSplitter | chunk_size | list_of_documents | LLM model | retriever(top-k) | Accuracy | correct_answers |
|----------|----------|----------|----------|----------|----------|----------|----------|
| all-MiniLM-l6-v2  | RecursiveCharacter | 128   | explanation    | google/flan-t5-small   |  10  | 0.21   | 98 |
| all-MiniLM-l6-v2  | RecursiveCharacter | 128  | explanation + book1 + book2   | google/flan-t5-small  | 10 | 0.23  | 102  |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation  | google/flan-t5-base  | 10  | 0.27  | 124 |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation + book1 + book2  | google/flan-t5-base  |  10 | 0.30  | 138  |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation  | google/flan-t5-large  |  5  | 0.42 | 194|
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation  | google/flan-t5-large  | 10  | 0.40 | 183 |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation + book1 + book2 | google/flan-t5-large  | 10 | 0.451  | 204 |
| all-MiniLM-l6-v2  | Character  | 128  | explanation + book1 + book2 | google/flan-t5-large  | 10  | 0.45 | 208  |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 128  | explanation  | google/flan-t5-xl  | 10  | 0.45  | 207 |
| all-MiniLM-l6-v2  | Character  | 128  | explanation + book1 + book2 | google/flan-t5-xl  | 10  | 0.53  | 214 |
| all-MiniLM-l6-v2  | Character  | 512  | explanation + book1 + book2 | google/flan-t5-xl  | 5  | 0.50  | 226 |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 512  | explanation + book1 + book2 | google/flan-t5-xl  | 5  | 0.556  | 251 |
| all-MiniLM-l6-v2  | RecursiveCharacter  | 512  | explanation + book1 + book2 | **google/flan-t5-xxl**  | 5 | **0.60**  | 275 |
| all-MiniLM-l6-v2  | Character  | 256  | explanation + book1 + book2 | **gpt3.5**  | 10 | **0.62**  | 278 |

# List of documents:

1. explanation = explanations from the question.json
2. book1 = "vdoc.pub_cae-oxford-aviation-academy-atpl-book-9-meteorology"(https://vdoc.pub/documents/cae-oxford-aviation-academy-atpl-book-9-meteorology-479j0jlutkr0)
3. book2 = "aero_introductory_physics"(https://www.nasa.gov/wp-content/uploads/2015/09/aero_introductory_physics.pdf)