In [27]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, CSVLoader, Docx2txtLoader
from pathlib import Path
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from itertools import combinations
import numpy as np
from langchain.memory import ConversationSummaryBufferMemory,ConversationBufferMemory, ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain, RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
import google.generativeai as genai
from langchain_community.llms import HuggingFaceHub
import gradio as gr

In [28]:
LOCAL_VECTOR_STORE_DIR = Path('./data')
GOOGLE_API_KEY = "AIzaSyDzH-2ryBkzhrWtKJ6NmBRLREpmLsf8FqE"

In [29]:
def langchain_document_loader(TMP_DIR):
    """
    Load documents from the temporary directory (TMP_DIR). 
    Files can be in txt, pdf, CSV or docx format.
    """

    documents = []

    # txt_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.txt", loader_cls=TextLoader, show_progress=True
    # )
    # documents.extend(txt_loader.load())

    # pdf_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.pdf", loader_cls=PyPDFLoader, show_progress=True
    # )
    # documents.extend(pdf_loader.load())

    # csv_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.csv", loader_cls=CSVLoader, show_progress=True,
    #     loader_kwargs={"encoding":"utf8"}
    # )
    # documents.extend(csv_loader.load())

    doc_loader = DirectoryLoader(
        TMP_DIR.as_posix(),
        glob="**/*.docx",
        loader_cls=Docx2txtLoader,
        show_progress=True,
    )
    documents.extend(doc_loader.load())
    return documents

In [30]:
directory_path = 'course reviews'
TMP_DIR = Path(directory_path)
documents = langchain_document_loader(TMP_DIR)


100%|██████████| 67/67 [00:00<00:00, 1786.48it/s]


In [31]:
print(documents[0])
print(len(documents))

page_content='Year of study: Senior\n\nReview for 100 level course:\n\nCS100 - Computational Problem Solving\nThis is a very slow and repetitive course. It takes too long to get to the meat of the content and there is a huge lag in between the labs and the lecture content. The labs and assignments were laughably easy which meant the grading ended up being terrible, with ridiculously high means. Would only recommend if someone plans to pursue a CS minor.\n\nGpa: 3.60-4.00' metadata={'source': 'course reviews/Student_10_Course_100.docx'}
67


In [32]:
def select_embedding_model():
    embedding = OllamaEmbeddings(model='nomic-embed-text')
    return embedding

embeddings_nomic = select_embedding_model()

In [33]:
def create_vectorstore(embeddings,documents,vectorstore_name):
    """Create a Chroma vector database."""
    persist_directory = (LOCAL_VECTOR_STORE_DIR.as_posix() + "/" + vectorstore_name)
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    return vector_store

In [34]:

sentences = ["I like pets.",
             "Pets bring joy to our lives.",
             "Langchain is a framework for developing applications powered by LLMs."]
# 1. Calculate embedding vectors
embedding_vectors = [embeddings_nomic.embed_query(sentence) for sentence in sentences]

for combination in list(combinations(range(len(sentences)),2)):
    # 2. Calculate similarity using dot product from numpy:
    dot_prodduct = round(np.dot(embedding_vectors[combination[0]], embedding_vectors[combination[1]]),3)
    print(f"Similarty of sentences {combination}: {dot_prodduct}")

Similarty of sentences (0, 1): 331.183
Similarty of sentences (0, 2): 120.802
Similarty of sentences (1, 2): 170.24


In [35]:
create_vectorstores = False # change to True to create vectorstores

if create_vectorstores:
    vector_store_nomic = create_vectorstore(embeddings_nomic,documents,"vector_store_nomic")
    print("Vector store created")
    print("")

In [36]:
vector_store_nomic = Chroma(persist_directory = LOCAL_VECTOR_STORE_DIR.as_posix() + "/vector_store_nomic", 
                            embedding_function=embeddings_nomic)
print("vector_store_Ollama:",vector_store_nomic._collection.count(),"chunks.")

vector_store_Ollama: 67 chunks.


In [37]:
def print_documents(docs,search_with_score=False):
    """helper function to print documents."""
    if search_with_score:
        # used for similarity_search_with_score
        print(
            f"\n{'-' * 100}\n".join(
                [f"Document {i+1}:\n\n" + doc[0].page_content +"\n\nscore:"+str(round(doc[-1],3))+"\n" 
                 for i, doc in enumerate(docs)]
            )
        )
    else:
        # used for similarity_search or max_marginal_relevance_search
        print(
            f"\n{'-' * 100}\n".join(
                [f"Document {i+1}:\n\n" + doc.page_content 
                 for i, doc in enumerate(docs)]
            )
        )  

In [38]:
# Get most similar documents - with scores 
# Here, we use Cosine Similarity. So a lower score is better.

query = 'What are some difficult CS courses?'
docs_withScores = vector_store_nomic.similarity_search_with_score(query,k=4)

print_documents(docs_withScores,search_with_score=True)

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00

score:375.421

----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Senior

Review for 100 level course:

CS100 - Computational Problem Solving
This is a very slow and repetitive course. It takes too long to get to the meat of the content and there is a huge lag in between the labs and the lecture content. The labs and assignments were laughably easy which m

In [39]:
query_embeddings = embeddings_nomic.embed_query(query)
docs_embeddings = embeddings_nomic.embed_documents(
    [docs_withScores[i][0].page_content 
     for i in range(len(docs_withScores))
    ]
)

for i in range(len(docs_embeddings)):
    dot_product = round(np.dot(query_embeddings, docs_embeddings[i]),4)
    print(f"Similarty of document_{i} to the query: {dot_product}")

Similarty of document_0 to the query: 190.1073
Similarty of document_1 to the query: 212.1687
Similarty of document_2 to the query: 208.5819
Similarty of document_3 to the query: 204.6471


# Retriever

In [40]:
def Vectorstore_backed_retriever(vectorstore,search_type="similarity",k=4,score_threshold=None):
    """create a vectorsore-backed retriever
    Parameters: 
        search_type: Defines the type of search that the Retriever should perform.
            Can be "similarity" (default), "mmr", or "similarity_score_threshold"
        k: number of documents to return (Default: 4) 
        score_threshold: Minimum relevance threshold for similarity_score_threshold (default=None)
    """
    search_kwargs={}
    if k is not None:
        search_kwargs['k'] = k
    if score_threshold is not None:
        search_kwargs['score_threshold'] = score_threshold

    retriever = vectorstore.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs
    )
    return retriever

In [41]:
# Similarity search
retriever = Vectorstore_backed_retriever(vector_store_nomic,search_type="similarity",k=4)

# Get relevant documents

query = 'What are some intellectually challenging and stimulating courses?'
relevant_docs = retriever.get_relevant_documents(query)

print_documents(relevant_docs)

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00
----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Junior

Review for 400 level course:

CA 437 - Deep Learrning. Intellectually challenging and stimulating. Dl introduces the many used practical applications of different algorithms. The course, up until now, has thoroughly tested our understanding of different concepts and its application in coding. Very g

# Instantiating LLM

In [42]:
def instantiate_LLM(api_key,temperature=0.5,top_p=0.95,model_name=None):
    """Instantiate LLM in Langchain.
    Parameters:
        LLM_provider (str): the LLM provider; in ["OpenAI","Google","HuggingFace"]
        model_name (str): in ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-4-turbo-preview", 
            "gemini-pro", "mistralai/Mistral-7B-Instruct-v0.2"].            
        api_key (str): google_api_key or openai_api_key or huggingfacehub_api_token 
        temperature (float): Range: 0.0 - 1.0; default = 0.5
        top_p (float): : Range: 0.0 - 1.0; default = 1.
    """
    
  
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2", 
        # repo_id=model_name,
        huggingfacehub_api_token=api_key,
        model_kwargs={
            "temperature":temperature,
            "top_p": top_p,
            "do_sample": True,
            "max_new_tokens":1024
        },
    )
    return llm

llm = instantiate_LLM(api_key="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")

# Memory Initialization

In [43]:
def create_memory():
    """Creates a ConversationSummaryBufferMemory for our model
    Creates a ConversationBufferWindowMemory for our models."""
    
    memory = ConversationBufferWindowMemory(
        memory_key="history",
        input_key="question",
        return_messages=True,
        k=3
    )

    return memory

memory = create_memory()

# Creating the Context

In [44]:
memory.save_context(
    {"question": "What can you do?"},
    {"output": "I can answer queries based on the past reviews and course outlines of various courses offered at LUMS."}
)

# Prompt Template

In [45]:
context_qa = """
You are a professional chatbot assistant for helping students at LUMS regarding course selection.

Please follow the following rules:

1. Answer the question in your own words from the context given to you.
2. If you don't know the answer, don't try to make up an answer.
3. If you don't have a course's review or outline, just say that you do not know about this course.
4. If a user enters a course code (e.g. ECON100 or CS370), match it with reviews with that course code. If the user enters a course name (e.g. Introduction to Economics or Database Systems), match it with reviews with that course name.

Context: {context}

You are having a converation with a student at LUMS.

Chat History: {history}

Human: {question}

Assistant123:
"""

prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=context_qa
)

# Putting it all together

In [46]:
genai.configure(api_key=GOOGLE_API_KEY)

In [47]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [48]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=False,
    return_source_documents=False,
    chain_type_kwargs={
        "prompt": prompt,
        "memory": memory
    },
)


# Interface

In [49]:


def rag_model(query):
    # Your RAG model code here
    result = qa({'query': query})

    # Extract the answer from the result
    answer = result['result']

    # Extract the response from the answer (if needed)
    response = answer.split('Assistant123:')[-1]

    return response

iface = gr.Interface(fn=rag_model, inputs="text", outputs="text", title="RAG Model")
iface.launch()


Running on local URL:  http://127.0.0.1:7864





To create a public link, set `share=True` in `launch()`.


