In [49]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, CSVLoader, Docx2txtLoader
from pathlib import Path
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from itertools import combinations
import numpy as np
from langchain.memory import ConversationSummaryBufferMemory,ConversationBufferMemory

In [25]:
LOCAL_VECTOR_STORE_DIR = Path('./data')

In [11]:
def langchain_document_loader(TMP_DIR):
    """
    Load documents from the temporary directory (TMP_DIR). 
    Files can be in txt, pdf, CSV or docx format.
    """

    documents = []

    # txt_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.txt", loader_cls=TextLoader, show_progress=True
    # )
    # documents.extend(txt_loader.load())

    # pdf_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.pdf", loader_cls=PyPDFLoader, show_progress=True
    # )
    # documents.extend(pdf_loader.load())

    # csv_loader = DirectoryLoader(
    #     TMP_DIR.as_posix(), glob="**/*.csv", loader_cls=CSVLoader, show_progress=True,
    #     loader_kwargs={"encoding":"utf8"}
    # )
    # documents.extend(csv_loader.load())

    doc_loader = DirectoryLoader(
        TMP_DIR.as_posix(),
        glob="**/*.docx",
        loader_cls=Docx2txtLoader,
        show_progress=True,
    )
    documents.extend(doc_loader.load())
    return documents

In [12]:
directory_path = 'course reviews'
TMP_DIR = Path(directory_path)
documents = langchain_document_loader(TMP_DIR)


100%|██████████| 67/67 [00:00<00:00, 1201.92it/s]


In [20]:
print(documents[0])
print(len(documents))

page_content='Year of study: Senior\n\nReview for 100 level course:\n\nCS100 - Computational Problem Solving\nThis is a very slow and repetitive course. It takes too long to get to the meat of the content and there is a huge lag in between the labs and the lecture content. The labs and assignments were laughably easy which meant the grading ended up being terrible, with ridiculously high means. Would only recommend if someone plans to pursue a CS minor.\n\nGpa: 3.60-4.00' metadata={'source': 'course reviews/Student_10_Course_100.docx'}
67


In [29]:
def select_embedding_model():
    embedding = OllamaEmbeddings(model='nomic-embed-text' , show_progress = True)
    return embedding

embeddings_nomic = select_embedding_model()

In [28]:
def create_vectorstore(embeddings,documents,vectorstore_name):
    """Create a Chroma vector database."""
    persist_directory = (LOCAL_VECTOR_STORE_DIR.as_posix() + "/" + vectorstore_name)
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    return vector_store

In [32]:

sentences = ["I like pets.",
             "Pets bring joy to our lives.",
             "Langchain is a framework for developing applications powered by LLMs."]
# 1. Calculate embedding vectors
embedding_vectors = [embeddings_nomic.embed_query(sentence) for sentence in sentences]

for combination in list(combinations(range(len(sentences)),2)):
    # 2. Calculate similarity using dot product from numpy:
    dot_prodduct = round(np.dot(embedding_vectors[combination[0]], embedding_vectors[combination[1]]),3)
    print(f"Similarty of sentences {combination}: {dot_prodduct}")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 54.39it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 45.88it/s]

Similarty of sentences (0, 1): 331.183
Similarty of sentences (0, 2): 120.802
Similarty of sentences (1, 2): 170.24





In [34]:
create_vectorstores = False # change to True to create vectorstores

if create_vectorstores:
    vector_store_nomic = create_vectorstore(embeddings_nomic,documents,"vector_store_nomic")
    print("Vector store created")
    print("")

In [37]:
vector_store_nomic = Chroma(persist_directory = LOCAL_VECTOR_STORE_DIR.as_posix() + "/vector_store_nomic", 
                            embedding_function=embeddings_nomic)
print("vector_store_Ollama:",vector_store_nomic._collection.count(),"chunks.")

vector_store_Ollama: 67 chunks.


In [38]:
def print_documents(docs,search_with_score=False):
    """helper function to print documents."""
    if search_with_score:
        # used for similarity_search_with_score
        print(
            f"\n{'-' * 100}\n".join(
                [f"Document {i+1}:\n\n" + doc[0].page_content +"\n\nscore:"+str(round(doc[-1],3))+"\n" 
                 for i, doc in enumerate(docs)]
            )
        )
    else:
        # used for similarity_search or max_marginal_relevance_search
        print(
            f"\n{'-' * 100}\n".join(
                [f"Document {i+1}:\n\n" + doc.page_content 
                 for i, doc in enumerate(docs)]
            )
        )  

In [43]:
# Get most similar documents - with scores 
# Here, we use Cosine Similarity. So a lower score is better.

query = 'What are some difficult CS courses?'
docs_withScores = vector_store_nomic.similarity_search_with_score(query,k=4)

print_documents(docs_withScores,search_with_score=True)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 19.52it/s]

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00

score:375.421

----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Senior

Review for 100 level course:

CS100 - Computational Problem Solving
This is a very slow and repetitive course. It takes too long to get to the meat of the content and there is a huge lag in between the labs and the lecture content. The labs and assignments were laughably easy which m




In [44]:
query_embeddings = embeddings_nomic.embed_query(query)
docs_embeddings = embeddings_nomic.embed_documents(
    [docs_withScores[i][0].page_content 
     for i in range(len(docs_withScores))
    ]
)

for i in range(len(docs_embeddings)):
    dot_product = round(np.dot(query_embeddings, docs_embeddings[i]),4)
    print(f"Similarty of document_{i} to the query: {dot_product}")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 14.59it/s]
OllamaEmbeddings: 100%|██████████| 4/4 [00:00<00:00, 24.95it/s]

Similarty of document_0 to the query: 190.1073
Similarty of document_1 to the query: 212.1687
Similarty of document_2 to the query: 208.5819
Similarty of document_3 to the query: 204.6471





# Retriever

In [45]:
def Vectorstore_backed_retriever(vectorstore,search_type="similarity",k=4,score_threshold=None):
    """create a vectorsore-backed retriever
    Parameters: 
        search_type: Defines the type of search that the Retriever should perform.
            Can be "similarity" (default), "mmr", or "similarity_score_threshold"
        k: number of documents to return (Default: 4) 
        score_threshold: Minimum relevance threshold for similarity_score_threshold (default=None)
    """
    search_kwargs={}
    if k is not None:
        search_kwargs['k'] = k
    if score_threshold is not None:
        search_kwargs['score_threshold'] = score_threshold

    retriever = vectorstore.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs
    )
    return retriever

In [48]:
# Similarity search
retriever = Vectorstore_backed_retriever(vector_store_nomic,search_type="similarity",k=4)

# Get relevant documents

query = 'What are some intellectually challenging and stimulating courses?'
relevant_docs = retriever.get_relevant_documents(query)

print_documents(relevant_docs)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 16.48it/s]

Document 1:

Year of study: Junior

Review for 200 level course:

CS 202
Data Structures with Sir Ihsan was a challenging yet fun course. I learned a lot and the course challenges your ability to think and rationalize. Learning wise, this course is great. The outline is well defined and you already know the quiz schedule before the semester so that helps you set your schedule before hand. There is no midterm either which helps during the midweek by lessening the burden. The assignments are comparatively easier but still challenging enough. 

Gpa: 3.60-4.00
----------------------------------------------------------------------------------------------------
Document 2:

Year of study: Junior

Review for 400 level course:

CA 437 - Deep Learrning. Intellectually challenging and stimulating. Dl introduces the many used practical applications of different algorithms. The course, up until now, has thoroughly tested our understanding of different concepts and its application in coding. Very g




# Memory Initialization

In [None]:
def create_memory(model_name='gpt-3.5-turbo',memory_max_token=None):
    """Creates a ConversationSummaryBufferMemory for gpt-3.5-turbo
    Creates a ConversationBufferMemory for the other models."""
    
    if model_name=="gpt-3.5-turbo":
        if memory_max_token is None:
            memory_max_token = 1024 # max_tokens for 'gpt-3.5-turbo' = 4096
        memory = ConversationSummaryBufferMemory(
            max_token_limit=memory_max_token,
            llm=ChatOpenAI(model_name="gpt-3.5-turbo",openai_api_key=openai_api_key,temperature=0.1),
            return_messages=True,
            memory_key='chat_history',
            output_key="answer",
            input_key="question"
        )
    else:
        memory = ConversationBufferMemory(
            return_messages=True,
            memory_key='chat_history',
            output_key="answer",
            input_key="question",
        )  
    return memory