In [None]:
%pip install --upgrade pip
%pip install boto3 --force-reinstall --quiet
%pip install botocore --force-reinstall --quiet
%pip install langchain --force-reinstall --quiet

In [None]:
# ! pip install langchain_community
# ! pip install chromadb
# ! pip install langchain_aws
# !pip install rouge_score

In [None]:
# boto3_bedrock = boto3.client('bedrock')
# [models['modelId'] for models in boto3_bedrock.list_foundation_models()['modelSummaries']]


In [None]:
#Importing the libraries
import json
import pprint
import re

import boto3
from botocore.client import Config
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_aws import ChatBedrock
from langchain_community.embeddings import BedrockEmbeddings

Initialising the LLM and embedding models

In [None]:
# Pretty-printer for inspecting nested objects.
pp = pprint.PrettyPrinter(indent=2)

# Take the AWS region from the default boto3 session.
session = boto3.session.Session()
region = session.region_name

# Client settings: connect/read timeouts of 120 and no automatic retries.
bedrock_config = Config(connect_timeout=120, read_timeout=120, retries={'max_attempts': 0})
# Apply the config to the client; previously it was built but never passed in,
# so the timeouts and retry setting had no effect.
bedrock_client = boto3.client('bedrock-runtime', region_name=region, config=bedrock_config)

# Embedding model used for indexing and retrieval.
bedrock_embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1')

# Chat models compared in the retrieval cells below.
langchain_llm_claude_sonnet = ChatBedrock(model_id="anthropic.claude-3-sonnet-20240229-v1:0")
langchain_llm_meta_llama3_70b = ChatBedrock(model_id="meta.llama3-70b-instruct-v1:0")


#print(region)

In [None]:
def get_embeddings(text, bedrock_runtime, model_id='amazon.titan-embed-text-v1'):
    """Return the embedding for ``text`` produced by a Bedrock embedding model.

    Args:
        text: Input string, sent to the model as ``inputText``.
        bedrock_runtime: Object exposing ``invoke_model(body=, modelId=,
            accept=, contentType=)``, e.g. a boto3 ``bedrock-runtime`` client.
        model_id: Identifier of the model to invoke. Defaults to the Titan
            text-embedding model that used to be hard-coded here.

    Returns:
        The value stored under ``"embedding"`` in the JSON response body, or
        ``None`` if the response has no such key.
    """
    # Request payload: a JSON object carrying the raw text.
    body = json.dumps({"inputText": text})

    response = bedrock_runtime.invoke_model(
        body=body,
        modelId=model_id,
        accept='application/json',
        contentType='application/json',
    )

    # The response body is a readable stream holding a JSON document.
    response_body = json.loads(response['body'].read())
    return response_body.get('embedding')


---

# Indexing: Embedding the documents and storing them, with their embeddings, in the vector DB

In [None]:
# input_file_dir: directory scanned for the text files to index.
# index_path: root directory under which the vector index is persisted.
input_file_dir, index_path = "/data/text_files", "/data/index"

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.indexes.vectorstore import VectorStoreIndexWrapper


from langchain_community.document_loaders import TextLoader
from tqdm.notebook import tqdm
import os


# Build the Chroma index: every file in `input_file_dir` is loaded and added
# to a single store embedded with `bedrock_embeddings`.
count = 0  # number of files indexed
vectorstore_chromaDB = None
for file in tqdm(os.listdir(input_file_dir)):
    file_path = f"{input_file_dir}/{file}"
    # Skip Jupyter's checkpoint folder and any other entry that is not a
    # regular file (e.g. sub-directories), which TextLoader cannot open.
    if file == ".ipynb_checkpoints" or not os.path.isfile(file_path):
        continue
    count = count + 1
    doc = TextLoader(file_path).load()
    # embeddings = get_embeddings("hello",bedrock_client)

    if vectorstore_chromaDB is not None:
        vectorstore_chromaDB.add_documents(doc)
    else:
        # First file: create the store, persisted under `index_path`.
        vectorstore_chromaDB = Chroma.from_documents(doc,
                              bedrock_embeddings,
                              persist_directory = f"{index_path}/index_rag")




---

# Retrieval: Performing basic RAG

In [None]:


def remove_time_stamps(text):
    """Return ``text`` with every ``DD:DD:DD`` time stamp (e.g. "00:24:40") removed.

    Only the stamp itself is deleted; surrounding whitespace is left untouched.
    """
    stamp_re = re.compile(r'\d{2}:\d{2}:\d{2}')
    return stamp_re.sub('', text)

def post_process(text):
    """Split a tagged model response into its answer text and metadata.

    Args:
        text: Model output expected to contain ``<start_time>``,
            ``<end_time>`` and ``<file_name>`` tags.

    Returns:
        A tuple ``(answer, start_time, end_time, file_name)``. ``answer`` is
        the full text of the response with all markup removed and time stamps
        stripped by ``remove_time_stamps``. Each of the other three items is
        the text of the matching tag, or ``None`` when the response does not
        contain that tag.
    """
    # Parse once with an explicitly named parser so the result does not depend
    # on which parser bs4 would pick when none is given.
    soup = BeautifulSoup(text, "lxml")

    def _tag_text(tag_name):
        # A missing tag yields None instead of failing on `None.text`.
        tag = soup.find(tag_name)
        return tag.text if tag is not None else None

    start_time = _tag_text('start_time')
    end_time = _tag_text('end_time')
    file_name = _tag_text('file_name')

    answer = remove_time_stamps(soup.text)
    return answer,start_time,end_time,file_name




In [None]:
# Question passed to the retrieval chains below; replace this placeholder with a real question.
query = "Can be defined accordingly for testing"

In [None]:


prompt_template = """

Human: You are a educational tutorial AI system, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Also provide the start time, end time and the file name in the start_time, end_time and file_name tags respectively.

<context>
{context}
</context>

<question>
{question}
</question>

Don't Add any additional text like 'but no specific details about what a Decision Tree is are provided in the given context'
Assistant: Answer in answer tag and start time, end time and file name in the start_time, end_time and file_name tags are as follows:"""




PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


# Open the Chroma index persisted under `index_path`.
vectorstore_chromaDB = Chroma(persist_directory=f"{index_path}/index_rag", embedding_function=bedrock_embeddings)

# Retrieval QA with Claude 3 Sonnet: the 3 most similar documents are fetched
# and passed to the model through PROMPT.
qa = RetrievalQA.from_chain_type(
    llm=langchain_llm_claude_sonnet,
    chain_type="stuff",
    retriever=vectorstore_chromaDB.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
# Use `invoke`; calling the chain object directly is deprecated in LangChain.
answer = qa.invoke({"query": query})
# print(answer["result"])

print(post_process(answer["result"]))

from rouge_score import rouge_scorer

# ROUGE-1 / ROUGE-L scorer (with stemming) for comparing answers to references.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)


# if query == question_answer["question"]:

#     scores = scorer.score(question_answer["answer"],answer["result"])
#     st.write(f"Rouge score - {scores['rouge1']}")


In [None]:
# Same retrieval QA setup, this time answered by Llama 3 70B.
qa = RetrievalQA.from_chain_type(
    llm=langchain_llm_meta_llama3_70b,
    chain_type="stuff",
    retriever=vectorstore_chromaDB.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
# Use `invoke`; calling the chain object directly is deprecated in LangChain.
answer = qa.invoke({"query": query})
print(answer["result"])

print(post_process(answer["result"]))

In [None]:
# Raw result text from the most recently executed chain call.
print(answer["result"])

In [None]:
# qa = RetrievalQA.from_chain_type(
#     llm=langchain_llm_mistral_large_2,
#     chain_type="stuff",
#     retriever=vectorstore_chromaDB.as_retriever(
#         search_type="similarity", search_kwargs={"k": 3}
#     ),
#     return_source_documents=True,
#     chain_type_kwargs={"prompt": MISTRAL_PROMPT}
# )
# answer = qa({"query": query})
# print(answer["result"])



In [None]:
# qa = RetrievalQA.from_chain_type(
#     llm=langchain_llm_cohere_command_r_plus,
#     chain_type="stuff",
#     retriever=vectorstore_chromaDB.as_retriever(
#         search_type="similarity", search_kwargs={"k": 3}
#     ),
#     return_source_documents=True,
#     chain_type_kwargs={"prompt": PROMPT}
# )
# answer = qa({"query": query})
# print(answer["result"])




In [None]:
# Sample tagged output used in the next cells to try the tag extraction by hand.
s = "<start_time>00:00:40</start_time> <end_time>00:01:20</end_time> <file_name>Data_structures-Introduction_to_graphs-_YouTube_audio_chunk_40000.wav</file_name>"

In [None]:
# Parse the sample with an explicitly named parser (the one post_process
# also uses) instead of letting bs4 guess, then read the <file_name> tag.
soup = BeautifulSoup(s, "lxml")

file_name = soup.find('file_name').text

In [None]:
# Display the file name extracted from the sample.
file_name

In [None]:
def post_process(text):
    """Split a tagged model response into its answer text and metadata.

    Args:
        text: Model output expected to contain ``<start_time>``,
            ``<end_time>`` and ``<file_name>`` tags.

    Returns:
        A tuple ``(answer, start_time, end_time, file_name)``. ``answer`` is
        the full text of the response with all markup removed and time stamps
        stripped by ``remove_time_stamps``. Each of the other three items is
        the text of the matching tag, or ``None`` when the response does not
        contain that tag.
    """
    # Parse once with an explicitly named parser so the result does not depend
    # on which parser bs4 would pick when none is given.
    soup = BeautifulSoup(text, "lxml")

    def _tag_text(tag_name):
        # A missing tag yields None instead of failing on `None.text`.
        tag = soup.find(tag_name)
        return tag.text if tag is not None else None

    start_time = _tag_text('start_time')
    end_time = _tag_text('end_time')
    file_name = _tag_text('file_name')

    answer = remove_time_stamps(soup.text)
    return answer,start_time,end_time,file_name

In [None]:
# Try post_process on a hand-written response containing all four tags.
post_process(""" <answer> A data structure is a way to store and organize data. The given context mentions that "data structures are ways to store and organise data and for different kinds of data, we use different kinds of data structures." </answer>

<start_time>00:00:40</start_time> <end_time>00:01:20</end_time> <file_name>Data_structures-Introduction_to_graphs-_YouTube_audio_chunk_40000.wav</file_name>

""")