## PDF Query Using LangChain

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.1.13-py3-none-any.whl (810 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.29 (from langchain)
  Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.33 (from langchain)
  Downloading langchain_core-0.1.36-py3-none-any.whl (273 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.9/273.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downl

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
import os
from google.colab import userdata

# Read the key from Colab's secret store so it never appears in the notebook itself,
# and expose it as an environment variable.
# NOTE(review): presumably the OpenAI clients created later read OPENAI_API_KEY from the environment — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [None]:
# Path of the PDF file to query (change this to point at your own document).
pdfreader = PdfReader('/content/Idioms-LLM.pdf')

In [None]:
from typing_extensions import Concatenate
# Pull the text out of every page and stitch the pieces together, skipping pages
# for which the extractor returns nothing.
page_texts = []
for page in pdfreader.pages:
    extracted = page.extract_text()
    if extracted:
        page_texts.append(extracted)
raw_text = ''.join(page_texts)

In [None]:
# Preview only the first 1,000 characters; echoing the whole document floods the cell output.
raw_text[:1000]



In [None]:
# Split the raw text on newlines into chunks of roughly 800 characters (with a
# 30-character overlap) so that each chunk stays small in terms of token count.
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 30,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

# Every chunk is kept. The earlier `texts[:n * (len(texts) // n)]` truncation (n = 50)
# silently dropped up to 49 trailing chunks, and emptied the list altogether whenever
# the document produced fewer than 50 chunks.

In [None]:
# Spot-check a single chunk to see what the splitter produced
# (index 20 appears to be an arbitrary sample).
texts[20]

'hand. see under ABOVE SUSPICION. \nabsence \nhand. In addition to the idiom beginning with ABSENCE, also see \nCONSPICUOUS BY ITS ABSENCE. \nabsence makes the heart grow fonder \nSeparation intensifies love, as in After a year in another country she accepted his \nproposal, so I guess absence makes the heart grow fonder,  or, used ironically, The \nboss leaves earlier every day; oh well, absence makes the heart grow fonder.\nAlthough versions of this saying date from Roman times, it only became popular \nafter Thomas Haynes Bayly used it as the last line of a song in The Isle of Beauty \n(1850). The opposite sentiment is expressed by FAMILIARITY BREEDS \nCONTEMPT. \nabsent without leave \nAway without permission or explanation, as in Her daughter went to the mall but'

In [None]:
# Create the OpenAI embeddings object that is handed to the vector store below.
# NOTE(review): presumably it authenticates via the OPENAI_API_KEY environment variable — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Build the FAISS vector store from the text chunks; the similarity searches
# further down all run against this object.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Display the store object as a quick check that it was created.
document_search


<langchain_community.vectorstores.faiss.FAISS at 0x7bf6e36b1720>

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Question-answering chain driven by the OpenAI LLM.
# NOTE(review): chain_type="stuff" presumably places all supplied documents into a
# single prompt — confirm against the LangChain documentation.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

  warn_deprecated(


In [None]:
# Smoke test: look up the chunks most similar to the query in the vector store,
# then pass those chunks together with the question to the QA chain.
query = "absent without leave"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Away without permission or explanation, as in Her daughter went to the mall but got in trouble for being absent without leave.'

In [None]:
def _parse_idiom_response(response):
    """Turn a '<Alphabet>: idiom, idiom, ...' reply into a clean list of idioms."""
    # Split on the first colon only, so an idiom that itself contains ':' stays intact.
    _, colon, listing = response.partition(":")
    if not colon:
        return []
    # Split on bare commas and trim each piece: this tolerates "a,b", "a , b" and a
    # trailing comma, none of which a strict ", " split handles.
    return [piece.strip() for piece in listing.split(",") if piece.strip()]


def fetch_and_process_idioms(alphabet):
    """
    Ask the QA chain for five idioms for the given letter and parse its reply.

    The prompt is used both as the similarity-search query against
    ``document_search`` and as the question passed to ``chain``.

    Args:
        alphabet: The letter to request idioms for.

    Returns:
        A list of idiom strings with surrounding whitespace removed and empty
        entries dropped; an empty list when the reply contains no ':' separator.
    """
    query = f"Give me 5 idioms for the letter {alphabet}. I need response in this format: <Alphabet>: <list of idioms separated by comma>"
    docs = document_search.similarity_search(query)
    response = chain.run(input_documents=docs, question=query)

    return _parse_idiom_response(response)

def append_idioms_to_file(idioms_collection, filename):
    """
    Append idioms to ``filename``, one per line, skipping any already present.

    Args:
        idioms_collection: Mapping whose values are lists of idiom strings.
        filename: Path of the text file to extend; it is created if missing.

    An idiom is written at most once: it is skipped when it already appears as a
    line in the file or earlier in the same call. Empty strings are ignored.
    """
    with open(filename, 'a+') as f:
        # 'a+' opens positioned at the end of the file; rewind to read what is there.
        f.seek(0)
        existing_text = f.read()
        seen = set(existing_text.splitlines())

        # If the file does not end with a newline, the first appended idiom would be
        # glued onto the last existing line; terminate that line first.
        needs_newline = bool(existing_text) and not existing_text.endswith('\n')

        for idioms in idioms_collection.values():
            for idiom in idioms:
                if not idiom or idiom in seen:
                    continue
                if needs_newline:
                    f.write('\n')
                    needs_newline = False
                f.write(idiom + '\n')
                # Remember it so a repeat later in this same call is not written again.
                seen.add(idiom)


def main():
    """
    Run one A-Z pass: fetch idioms for every uppercase letter, append the new
    ones to 'idioms.txt', and print what was found for each letter.
    """
    alphabets = [chr(code) for code in range(ord("A"), ord("Z") + 1)]  # A-Z in uppercase
    for alphabet in alphabets:
        idioms = fetch_and_process_idioms(alphabet)

        # Persist after every letter so an interrupted run keeps its progress.
        # Only this letter's idioms are handed over: those of earlier letters were
        # already written on previous iterations, so re-sending them is wasted work.
        append_idioms_to_file({alphabet: idioms}, 'idioms.txt')
        if idioms:
            print(f"{alphabet}: {', '.join(idioms)}")
        else:
            print(f"No idioms found for {alphabet}")

# Run the full A-Z collection pass 30 times. Presumably the repeats are there
# because each pass can return different idioms — confirm with the author.
if __name__ == "__main__":
    for pass_number in range(30):
        main()


A: all the best, apple of one's eye, as easy as pie, at the drop of a hat, a dime a dozen
B: down in the dumps, down with the new group, busy as a beaver, where's the beef, beef up, bee in one's bonnet
C: call it a day, catch red-handed, cry over spilled milk, cut corners, chip on one's shoulder
D: dig up dirt, dilemma, daily dozen, drag in, daggers drawn
E: every cloud has a silver lining, easy as pie, elephant in the room, early bird catches the worm, eat humble pie
F: fly off the handle, face the music, feather in one's cap, fall off the wagon, fight tooth and nail
G: give the shirt off one's back, give the slip, give the time of day, give the word, go against the grain
H: hay hand, haystack hand, haywire hand, hazard hand, haze hand
I: in a nutshell, in the limelight, in the loop, in the red, in the same boat
J: jumping the gun, joined at the hip, jump ship, jump on the bandwagon, jumping for joy
K: keep an eye out, kick the bucket, kill two birds with one stone, keep your chin up,

KeyboardInterrupt: 

## **Idioms to Context Generation**

In [None]:
import openpyxl
from openpyxl import Workbook

def fetch_response(idiom):
    """
    Ask the QA chain for a short context and an example sentence for ``idiom``.

    The prompt doubles as the similarity-search query against ``document_search``
    and as the question passed to ``chain``; the chain's reply is returned as-is.
    """
    prompt = (
        f"Please provide a direct and succinct context for the idiom '{idiom}', "
        "followed by a concise sentence that uses the idiom. "
        "Format your answer as 'Context: [Brief scenario], Example: [Sentence using the idiom].'"
    )
    matches = document_search.similarity_search(prompt)
    return chain.run(input_documents=matches, question=prompt)

def process_response(response):
    """
    Split a 'Context: ..., Example: ...' reply into its two parts.

    Args:
        response: Raw text returned by the QA chain.

    Returns:
        A ``(context, example_sentence)`` tuple of stripped strings.
        ``example_sentence`` is "" when the reply contains no 'Example:' marker.
    """
    # Split on the marker itself rather than on the exact ", Example: " separator, so
    # replies that put the example on a new line or after a full stop are still split.
    context_part, _, example_part = response.partition("Example:")

    # Drop the comma that separates the two parts in the requested format.
    context = context_part.strip().rstrip(",").strip()
    # Remove the label only where it leads the text; the word "Context: " appearing
    # inside the scenario itself is left alone.
    if context.startswith("Context:"):
        context = context[len("Context:"):].strip()

    example_sentence = example_part.strip()
    return context, example_sentence

def main():
    """
    Generate a context and an example sentence for every idiom in 'idioms.txt'
    and store the results in an Excel workbook.

    The workbook has the columns Idiom / Context / Example. It is saved every
    ``save_interval`` idioms and once more at the end.
    """
    idioms_file = "idioms.txt"
    # One output path for the periodic and the final save. Previously the final save
    # went to "Idioms_Context_Examples.xlsx", which left "Idioms_Context.xlsx" (the
    # file the post-processing cell reads) without the last few rows.
    output_file = "Idioms_Context.xlsx"
    save_interval = 5  # Save after every 5 responses

    wb = Workbook()
    ws = wb.active
    ws.append(["Idiom", "Context", "Example"])

    with open(idioms_file, "r") as file:
        # Skip blank lines so they do not trigger a request for an empty idiom.
        idioms = [line.strip() for line in file if line.strip()]

    for idioms_processed, idiom in enumerate(idioms, start=1):
        response = fetch_response(idiom)
        context, example_sentence = process_response(response)
        ws.append([idiom, context, example_sentence])

        if idioms_processed % save_interval == 0:
            wb.save(output_file)
            print(f"Saved after {idioms_processed} idioms.")

    # Save any remaining idioms that didn't hit the save interval
    wb.save(output_file)
    print("Final save completed.")

# Run the context-generation pass. Note that the `main` defined in this cell
# replaces the idiom-collection `main` defined earlier in the notebook.
if __name__ == "__main__":
    main()


### Processing and Extracting Example

In [None]:
import pandas as pd

# Load the dataset from an Excel file
file_name = '/content/Idioms_Context.xlsx'
df = pd.read_excel(file_name, engine='openpyxl')

# Column that may still hold an un-split "... Example: ..." response
column = 'Context'
marker = 'Example: '

# Rows whose context text still embeds the example sentence.
has_marker = df[column].str.contains(marker, regex=False, na=False)

# For those rows take the text after the last marker; every other row keeps the
# example it already has (previously it was overwritten with the whole context).
extracted = df[column].str.split(marker).str[-1]
df['Example'] = extracted.where(has_marker, df['Example'])

# Strip wrapping whitespace and quotes only after the extraction; doing it before
# had no effect because the column was overwritten on the next line.
df['Example'] = df['Example'].str.strip().str.strip('"')

# Save the DataFrame back to the same Excel file
df.to_excel(file_name, index=False)


In [None]:
from langchain.indexes import VectorstoreIndexCreator
# NOTE(review): `loader` is not defined anywhere in the visible notebook, so on a
# fresh kernel this line raises NameError. Presumably a document loader (for the
# paper queried in the next cell) was created in a cell that has since been
# removed — TODO restore it before this cell.
index = VectorstoreIndexCreator().from_loaders([loader])

In [None]:
# Ask a question directly against `index`. This rebinds the module-level `query`
# variable that earlier cells also assign.
query = "Explain me about Attention is all you need"
index.query(query)

' Attention is All You Need is a paper published in 2017 by researchers from Google Brain. The paper introduces the Transformer, a model architecture that relies entirely on an attention mechanism to draw global dependencies between input and output, instead of using recurrence. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. Additionally, self-attention could yield more interpretable models.'