# **Installing Requirements**

In [None]:
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# Load the "facebook/mbart-large-50-many-to-many-mmt" checkpoint and its matching
# fast tokenizer; both are fetched from the Hugging Face Hub if not already cached.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [None]:
# Placeholder value; the name is rebound to a loaded Keras model later in the notebook.
bi_rnn_model = ""


def translate_text(new_tamil_sentence):
    """Translate Tamil text to English with the mBART model loaded above.

    Sets the shared ``tokenizer``'s source language to ``ta_IN`` (this mutates
    the module-level tokenizer), generates with ``en_XX`` forced as the first
    target token, and decodes the result without special tokens.

    Args:
        new_tamil_sentence: Text handed to the tokenizer.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    tokenizer.src_lang = "ta_IN"
    model_inputs = tokenizer(new_tamil_sentence, return_tensors="pt")
    english_bos_id = tokenizer.lang_code_to_id["en_XX"]
    output_ids = model.generate(**model_inputs, forced_bos_token_id=english_bos_id)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
import os

def load_data_file(path):
    """Read a UTF-8 text file and return its contents split into lines.

    Args:
        path: Location of the file to read.

    Returns:
        list[str]: The file's text split on newline characters. A file that
        ends with a newline therefore yields a trailing empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: the corpus contains Tamil text and the
    # platform's default encoding is not guaranteed to be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return f.read().split('\n')

# Load the Tamil and English sentence files from the Colab filesystem.
# NOTE(review): preprocess() later consumes these two lists together, so the
# files are presumably line-aligned sentence pairs — confirm.
tamil_sentences = load_data_file('/content/data.ta1')
english_sentences = load_data_file('/content/data.en1')

In [None]:
!pip install transformers
!pip install sentence_transformers
!pip install qdrant-client
!pip -q install --upgrade together
!pip -q install langchain
!pip install fitz
!pip install PyMuPDF

Collecting PyMuPDF
  Using cached PyMuPDF-1.24.0-cp310-none-manylinux2014_x86_64.whl (3.9 MB)
Collecting PyMuPDFb==1.24.0 (from PyMuPDF)
  Using cached PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.0 PyMuPDFb-1.24.0


In [None]:
# langchain_together supplies the `Together` LLM wrapper imported further down;
# PyPDF2 supplies PdfReader/PdfWriter used by slice_pdf.
!pip install langchain_together
!pip install PyPDF2

Collecting langchain_together
  Downloading langchain_together-0.0.2.post1-py3-none-any.whl (6.1 kB)
Installing collected packages: langchain_together
Successfully installed langchain_together-0.0.2.post1
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


# ***Constitution Bot***

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from jinja2 import Template
import pandas as pd
from qdrant_client.http import models
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client import QdrantClient
import os
import together
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown, Latex
import fitz
import grpc

In [None]:
# Never store API keys in the notebook. Reuse a key that is already present in
# the environment; otherwise prompt for it without echoing the input.
if not os.environ.get("TOGETHER_API_KEY"):
    import getpass

    os.environ["TOGETHER_API_KEY"] = getpass.getpass("Together API key: ")

In [None]:
# Connect to the hosted Qdrant cluster over gRPC.
# The endpoint may be overridden with QDRANT_URL. The API key is a secret and is
# never stored in the notebook: it is read from QDRANT_API_KEY, or prompted for
# (without echo) when that variable is unset or empty.
import getpass

qdrant_client = QdrantClient(
    os.environ.get(
        "QDRANT_URL",
        "https://86f914bf-2bc6-4b65-88f8-f6814e7e8432.us-east4-0.gcp.cloud.qdrant.io:6333",
    ),
    prefer_grpc=True,
    api_key=os.environ.get("QDRANT_API_KEY") or getpass.getpass("Qdrant API key: "),
)

In [None]:
def get_embeddings(text_batch):
  """Embed text with the multi-qa-mpnet-base-dot-v1 sentence-transformer.

  A new ``SentenceTransformer`` instance is constructed on every call.

  Args:
    text_batch: Input passed straight to ``SentenceTransformer.encode``.

  Returns:
    Whatever ``encode`` returns for ``text_batch``.
  """
  encoder = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')
  return encoder.encode(text_batch)

In [None]:
def create_QDrant_collection(collection_name="constitution_docs", model_name='thenlper/gte-base'):
  """(Re)create the Qdrant collection that stores the document chunks.

  The vector size is taken from the sentence-transformer named by
  ``model_name``. The default is the same checkpoint the notebook uses to
  encode the PDF chunks ('thenlper/gte-base'), so the collection's vector size
  always matches the vectors that are uploaded into it.

  Args:
    collection_name: Name of the collection to recreate.
    model_name: Sentence-transformer whose embedding dimension sizes the
      collection's vectors.
  """
  vector_size = SentenceTransformer(model_name).get_sentence_embedding_dimension()
  # recreate_collection is called unconditionally, so this cell can be re-run.
  qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
      size=vector_size,
      distance=models.Distance.COSINE,
    ),
  )

create_QDrant_collection()

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Encoder used to embed the PDF chunks before upload and the ad-hoc search query.
model1 = SentenceTransformer('thenlper/gte-base')

In [None]:
# Show the size of the vectors model1 produces.
model1.get_sentence_embedding_dimension()

768

In [None]:
def process_pdf_and_upload_to_qdrant(pdf_path, collection_name="constitution_docs", chunk_size=2000):
    """Split a PDF's text into fixed-size chunks, embed them and upload to Qdrant.

    Each page's text is cut into consecutive slices of at most ``chunk_size``
    characters (chunks never span two pages). Every chunk is embedded with the
    module-level ``model1`` and uploaded through the module-level
    ``qdrant_client`` as a record whose id is the chunk's running index and
    whose payload is ``{"page_content": chunk}``.

    Args:
        pdf_path: Path of the PDF to read.
        collection_name: Target Qdrant collection.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_document.page_count):
            text = pdf_document.load_page(page_num).get_text("text")
            chunks.extend(text[i:i + chunk_size] for i in range(0, len(text), chunk_size))
    finally:
        # Release the file handle even if text extraction fails, and before
        # the (potentially slow) embedding and network upload start.
        pdf_document.close()

    if not chunks:
        # No extractable text: nothing to embed or upload.
        return

    # Encode all chunks in one call instead of one model invocation per chunk.
    vectors = model1.encode(chunks)

    # Ids restart at 0 on every call, so uploading again into the same
    # collection reuses the same record ids.
    records_to_upload = [
        models.Record(
            id=idx,
            vector=vector.tolist(),
            payload={"page_content": chunk},
        )
        for idx, (chunk, vector) in enumerate(zip(chunks, vectors))
    ]

    qdrant_client.upload_records(
        collection_name=collection_name,
        records=records_to_upload
    )


In [None]:
from PyPDF2 import PdfReader, PdfWriter

def slice_pdf(input_pdf_path, output_pdf_path, start_page, end_page):
    """Copy a contiguous page range from one PDF into a new PDF file.

    Args:
        input_pdf_path: PDF to read pages from.
        output_pdf_path: Destination file, opened for binary writing.
        start_page: 1-based number of the first page to copy.
        end_page: 1-based number of the last page to copy.

    Both bounds are clamped to the document's page count; when the clamped
    range is empty, a PDF without pages is written.
    """
    with open(input_pdf_path, "rb") as source:
        pdf_in = PdfReader(source)
        pdf_out = PdfWriter()

        # Translate the 1-based inclusive range into a clamped 0-based
        # half-open range [first_index, stop_index).
        first_index = max(0, min(start_page - 1, len(pdf_in.pages)))
        stop_index = max(first_index, min(end_page, len(pdf_in.pages)))

        for index in range(first_index, stop_index):
            pdf_out.add_page(pdf_in.pages[index])

        # The output is written while the input file is still open.
        with open(output_pdf_path, "wb") as sink:
            pdf_out.write(sink)

# Extract pages 35 through 55 (1-based, as interpreted by slice_pdf) of the full
# PDF into a smaller file; that smaller file is what gets indexed later.
input_pdf_path = "/content/IndianConstitution.pdf"
output_pdf_path = "/content/IndianConstitutionSliced.pdf"
start_page = 35
end_page = 55
slice_pdf(input_pdf_path, output_pdf_path, start_page, end_page)


In [None]:
def upload_pdf(pdf_path="/content/IndianConstitutionSliced.pdf"):
  """Index a PDF in Qdrant, reporting gRPC failures instead of raising them.

  Args:
    pdf_path: PDF handed to ``process_pdf_and_upload_to_qdrant``. Defaults to
      the sliced constitution file produced earlier in the notebook.

  Only ``grpc.RpcError`` is caught (and printed); any other exception
  propagates to the caller.
  """
  try:
      process_pdf_and_upload_to_qdrant(pdf_path)
  except grpc.RpcError as e:
      print(f"Error communicating with Qdrant: {e}")

upload_pdf()

  qdrant_client.upload_records(


In [None]:
# Retrieval sanity check: embed a sample query with model1, fetch the three
# closest records from the collection and print each payload with its score.
hits = qdrant_client.search(
	collection_name="constitution_docs",
	query_vector=model1.encode("Equality of opportunity").tolist(),
	limit=3
)
for hit in hits:
	print(hit.payload, "score:", hit.score)

{'page_content': '39 \n \nExplanation.—For the purposes of this article and article 16, \n"economically weaker sections" shall be such as may be notified by the State \nfrom time to time on the basis of family income and other indicators of \neconomic disadvantage.]  \n16. Equality of opportunity in matters of public employment.—(1) \nThere shall be equality of opportunity for all citizens in matters relating to \nemployment or appointment to any office under the State. \n(2) No citizen shall, on grounds only of religion, race, caste, sex, descent, \nplace of birth, residence or any of them, be ineligible for, or discriminated against \nin respect of, any employment or office under the State. \n(3) Nothing in this article shall prevent Parliament from making any law \nprescribing, in regard to a class or classes of employment or appointment to an \noffice 1[under the Government of, or any local or other authority within, a State \nor Union territory, any requirement as to residence wit

In [None]:
import together

import logging
from typing import Any, Dict, List, Mapping, Optional

from pydantic import Extra, Field, field_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env

class TogetherLLM(LLM):
    """Together large language models.

    Minimal LangChain ``LLM`` wrapper: ``_call`` sends the prompt to
    ``together.Complete.create`` and returns the text of the first choice.
    """

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    # Evaluated once, when the class body executes: TOGETHER_API_KEY must
    # already be present in the environment or a KeyError is raised here.
    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = 'forbid'

    # NOTE(review): this method carries no validator decorator, so nothing in
    # this file invokes it; it looks like it was meant to be a pydantic root
    # validator — confirm before relying on it.
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set.

        Looks up ``together_api_key`` in ``values`` or ``TOGETHER_API_KEY`` in
        the environment (via ``get_from_dict_or_env``), stores the result back
        into ``values`` and returns ``values``.
        """
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint.

        Args:
            prompt: Text sent to the completion endpoint.
            stop: Optional stop sequences. When given, the returned text is
                cut at the first occurrence of any of them.
            run_manager: Accepted for compatibility with LangChain's calling
                convention; not used.
            **kwargs: Accepted and ignored.

        Returns:
            The ``text`` field of the first choice in the response.
        """
        # Sets the key on the `together` module itself (process-wide).
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        if stop:
            # Honour caller-supplied stop sequences instead of silently
            # dropping them.
            text = enforce_stop_tokens(text, stop)
        return text


In [None]:
from langchain_together import Together

# The Together key is read from the environment everywhere in this cell; it
# must never be written into the notebook itself.
together.api_key = os.environ["TOGETHER_API_KEY"]

llm = Together(
    model="togethercomputer/RedPajama-INCITE-7B-Base",
    temperature=0.7,
    max_tokens=128,
    top_k=1,
    together_api_key=os.environ["TOGETHER_API_KEY"],
)

# Queries must be embedded with the same model that produced the vectors stored
# in the collection. The PDF chunks are encoded with 'thenlper/gte-base', so the
# retriever uses that checkpoint too; a different model would place queries in
# an unrelated vector space and retrieval would return arbitrary chunks.
embeddings = SentenceTransformerEmbeddings(model_name="thenlper/gte-base")

qdrant = Qdrant(
    client=qdrant_client,
    collection_name="constitution_docs",
    embeddings=embeddings,
)

retriever = qdrant.as_retriever()

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Give Precise Answers. Dont Repeat the Answer.

{context}

Question: {question}
Answer in English:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Build the retrieval chain once, with the custom prompt ("stuff" chain type:
# the retrieved chunks are inserted into {context}).
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs)

# **Machine Translation from Tamil Query to English**

In [None]:
# Load a saved Keras model from an HDF5 file on the Colab filesystem.
# NOTE(review): the file itself is not produced anywhere in this notebook —
# presumably trained and uploaded separately; confirm.
bi_rnn_model = tf.keras.models.load_model('/content/rnn_model.h5')

In [None]:
def tokenize(x):
    """Fit a new Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Texts to fit on and to convert.

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` is the result of
        ``texts_to_sequences(x)`` and ``tokenizer`` is the fitted instance.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    sequences = fitted_tokenizer.texts_to_sequences(x)
    return sequences, fitted_tokenizer

In [None]:
from keras.preprocessing.sequence import pad_sequences
def pad(x, length=None):
    """Pad sequences at the end so that they all share one length.

    Args:
        x: Sequences to pad.
        length: Target length. When ``None``, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences(x, maxlen=length, padding='post')``.
    """
    # Identity comparison is the correct test for "argument not supplied".
    if length is None:
        length = max(len(sentence) for sentence in x)
    return pad_sequences(x, maxlen=length, padding='post')

In [None]:
import numpy as np
def logits_to_text(logits, tokenizer):
    """Convert per-position scores into a space-separated string of words.

    Args:
        logits: Array whose axis 1 is reduced with ``np.argmax`` to pick one
            index per position.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        str: The words for the selected indices joined by single spaces;
        index 0 is rendered as ``'<PAD>'``.
    """
    # Invert word -> index, then reserve index 0 for the padding marker.
    lookup = dict((index, word) for word, index in tokenizer.word_index.items())
    lookup[0] = '<PAD>'

    words = []
    for best_index in np.argmax(logits, 1):
        words.append(lookup[best_index])
    return ' '.join(words)

In [None]:
def preprocess(x, y):
    """Tokenize and pad two collections of texts.

    Each collection is tokenized with its own freshly fitted tokenizer and
    padded to the length of its own longest sequence.

    Args:
        x: First collection of texts.
        y: Second collection of texts.

    Returns:
        tuple: ``(padded_x, padded_y, x_tk, y_tk)``. ``padded_y`` carries one
        extra trailing axis of size 1; ``x_tk`` and ``y_tk`` are the fitted
        tokenizers.
    """
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize(y)

    padded_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # Append a trailing axis of size 1 to the padded ids of ``y``.
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

# Tokenize and pad both sides of the corpus; keep the fitted tokenizers so that
# later cells can map between words and indices.
preproc_tamil_sentences, preproc_english_sentences, tamil_tokenizer, english_tokenizer =\
    preprocess(tamil_sentences, english_sentences)

# Padded sequence length (axis 1) of each side.
max_tamil_sequence_length = preproc_tamil_sentences.shape[1]
max_english_sequence_length = preproc_english_sentences.shape[1]
# Number of distinct words each tokenizer indexed. This count does not include
# index 0, which logits_to_text renders as '<PAD>'.
tamil_vocab_size = len(tamil_tokenizer.word_index)
english_vocab_size = len(english_tokenizer.word_index)

In [None]:
def translate_text(new_tamil_sentence):
  """Translate one Tamil sentence to English with the loaded RNN model.

  This definition replaces the earlier ``translate_text`` defined in this
  notebook once its cell has run.

  Args:
    new_tamil_sentence: Tamil sentence to translate.

  Returns:
    str: Predicted English words joined by spaces (padding positions appear
    as ``'<PAD>'``).
  """
  # Encode with the tokenizer fitted on the training corpus. Fitting a fresh
  # tokenizer on just this sentence would assign word ids that have nothing to
  # do with the vocabulary the model was trained on.
  token_ids = tamil_tokenizer.texts_to_sequences([new_tamil_sentence])
  # NOTE(review): the input is padded to the English maximum length and given a
  # trailing axis of size 1 — presumably the shape the model was trained on;
  # confirm against the training code.
  model_input = pad(token_ids, max_english_sequence_length).reshape(1, -1, 1)
  predicted_logits = bi_rnn_model.predict(model_input)
  return logits_to_text(predicted_logits[0], english_tokenizer)

In [None]:
def print_result(english_query, result, query=None):
  """Format the translated query, the original question and the answer.

  Args:
    english_query: English translation of the question.
    result: Answer text.
    query: Original question. When omitted, the module-level variable
      ``query`` is used, which keeps existing two-argument calls working.

  Returns:
    str: Markdown text with three ``###`` sections.

  Raises:
    NameError: If ``query`` is omitted and no module-level ``query`` exists.
  """
  if query is None:
    # Backward-compatible fallback: earlier callers relied on a global `query`.
    if "query" not in globals():
      raise NameError("name 'query' is not defined")
    query = globals()["query"]
  output_text = f""" ### Translated English Query:
  {english_query}
  ### Question:
  {query}
  ### Answer:
  {result}"""
  return output_text

In [None]:
def get_translated_answer(query):
  """Translate a question, answer it with the QA chain and display the result.

  The question is translated with ``translate_text``, the translation is run
  through ``qa``, and the formatted text from ``print_result`` is rendered as
  Markdown. Nothing is returned.

  Note that ``print_result`` is called with the translation and the answer
  only; it takes the question text shown in the output from the module-level
  ``query`` variable, not from this function's argument.

  Args:
    query: Question to translate and answer.
  """
  translated_query = translate_text(query)
  answer = qa.run(translated_query)
  rendered = print_result(translated_query, answer)
  display(Markdown(rendered))

In [None]:
# Tell me about Equality of opportunity
# The question is bound to the module-level name `query` on purpose:
# print_result reads that name when it formats the output.
query = "வாய்ப்பு சமத்துவம் பற்றி சொல்லுங்கள்"
get_translated_answer(query)

 ### Translated English Query:
  Citizenship rights
  ### Question:
  குடியுரிமை உரிமைகள்
  ### Answer:
  

1. Citizenship rights are the rights of the citizens of India.

2. Citizenship rights are the rights of the citizens of India.

3. Citizenship rights are the rights of the citizens of India.

4. Citizenship rights are the rights of the citizens of India.

5. Citizenship rights are the rights of the citizens of India.

6. Citizenship rights are the rights of the citizens of India.

7. Citizenship rights are the rights of the citizens of India.

8. Citizenship rights are the rights of the citizens of India.