In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader


def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text



def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks



def get_conversational_chain():
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """

    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.4)

    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain


def get_vector_store(text_chunks):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")


def user_input(user_question):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    new_db = FAISS.load_local("faiss_index", embeddings)
    docs = new_db.similarity_search(user_question)
    chain = get_conversational_chain()

    response = chain(
        {"input_documents": docs, "question": user_question}, return_only_outputs=True
    )

    print(docs)

    return response["output_text"]



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import Dataset

In [2]:
PDF_DOCS = [
"../input_data/2112_10668.pdf",
]

text = get_pdf_text(PDF_DOCS)
text_chunks = get_text_chunks(text)
get_vector_store(text_chunks)


In [3]:
user_input("What are the two tasks used to examine XGLM's behavior in the context of responsible use of large scale language models?")

  warn_deprecated(


[Document(page_content='considerably when the training set class distribution is uniform. These results highlight the severeness of\nthemajority label bias issue in the multilingual in-context learning framework.\nD.5 Knowledge Probing\nWe evaluate to what extent our multilingual language model can effectively store factual knowledge in\ndifferent languages. To this end, we evaluate knowledge triplet completion using the mLAMA dataset\n(Kassner et al., 2021), which was translated from the English benchmark LAMA (Petroni et al., 2019)\nusing Google Translate. The data is from TREx (Elsahar et al., 2018) with triples of the format ⟨object,\nrelation, subject⟩. Following the convention of LAMA, triples are converted to templates for querying the\nlanguage model. For example, a triple like ⟨Paris, capital-of, France ⟩is converted to template “Paris is the\ncapital of [MASK] ". While each query in the original mLAMA dataset contains hundreds of candidates\non average, we restrict it to thre

'Hate speech detection and Occupation Identification'

In [None]:
text_chunks = get_text_chunks(text)

In [None]:
questions = [
    "What are the two tasks used to examine XGLM's behavior in the context of responsible use of large scale language models?",
    "What is the purpose of the occupation identification task in the study of XGLM's behavior?",
    "What is the scope of the multilingual dataset CC100-XL used for training language models in terms of time coverage and language diversity?"
]

ground_truths = [
    "The two tasks used to examine XGLM's behavior are hate speech detection, which tests the model's ability to identify hateful and offensive text, and occupation identification, which studies the model's performance disparity between different gender groups in identifying occupations.",
    "The purpose of the occupation identification task is to study gender bias in language models by analyzing their performance disparity between different gender groups on the task of identifying a person's occupation from their bios.",
    "The CC100-XL dataset covers 68 Common Crawl snapshots from Summer 2013 to March/April 2020 and includes 134 languages. It is a significantly larger multilingual dataset with a corpus of 8.4 TB and 1.9 trillion tokens, designed to balance language distribution by sampling data from languages with more than 15 billion tokens and 20 million documents."
]


answers = []
contexts = []

# Inference

for query in questions:
    answer = user_input(query)
    answers.append(answer)

    