In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
os.environ['COHERE_APi_KEY'] = os.getenv('COHERE_API_KEY')

In [3]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from langchain_chroma import Chroma

### Loading Files

In [8]:
def load_documents(docs_path="docs"):
    """Load all text files from doc directory"""
    print(f"Loading documents from {docs_path}...")

    # Check if docs directory exists
    if not os.path.exists(docs_path):
        raise FileNotFoundError(f"The directory {docs_path} does not exist. Please create it and add company files.")
    
    loader = DirectoryLoader(
        path=docs_path,
        glob="*.txt",
        loader_cls=TextLoader
    )

    documents = loader.load()

    if len(documents) == 0:
        raise FileNotFoundError(f"No .txt file found in {docs_path}. Please add your company documents.")
    
    for i,doc in enumerate(documents[:2]): # For showing first two documents
        print(f"\n Document {i+1}:")
        print(f"Source: {doc.metadata['source']}")
        print(f"Content Length: {len(doc.page_content)} characters.")
        print(f"Content Preview: {doc.page_content[:100]}...")
        print(f"Metadata: {doc.metadata}")
    return documents

In [7]:
documents = load_documents()
documents

Loading documents from docs...

 Document 1:
Source: docs/Microsoft.txt
Content Length: 201014 characters.
Content Preview: ﻿Microsoft
Microsoft Corporation is an American multinational Microsoft Corporation
corporation and ...
Metadata: {'source': 'docs/Microsoft.txt'}

 Document 2:
Source: docs/Google.txt
Content Length: 232201 characters.
Content Preview: ﻿Google
Google LLC (/ˈɡuːɡəl/ ⓘ , GOO-gəl) is an Google LLC
American multinational corporation and t...
Metadata: {'source': 'docs/Google.txt'}


 Document(metadata={'source': 'docs/Google.txt'}, page_content='\ufeffGoogle\nGoogle LLC (/ˈɡuːɡəl/ ⓘ , GOO-gəl) is an Google LLC\nAmerican multinational corporation and technology\ncompany focusing on online advertising, search engine\ntechnology, cloud computing, computer software,\nquantum computing, e-commerce, consumer\nelectronics, and artificial intelligence (AI).[9] It has\nbeen referred to as "the most powerful company in the The Google logo used since 2015\nworld" by the BBC[10] and is one of the world\'s most\nvaluable brands.[11][12][13] Google\'s parent company,\nAlphabet Inc., is one of the five Big Tech companies\nalongside Amazon, Apple, Meta, and Microsoft.\n\nGoogle was founded on September 4, 1998, by\nAmerican computer scientists Larry Page and Sergey\nBrin. Together, they own about 14% of its publicly\nlisted shares and control 56% of its stockholder voting\npower through super-voting stock. The company went\npublic via an initial public offering (IPO) in 2004. In 

In [9]:
documents

 Document(metadata={'source': 'docs/Google.txt'}, page_content='\ufeffGoogle\nGoogle LLC (/ˈɡuːɡəl/ ⓘ , GOO-gəl) is an Google LLC\nAmerican multinational corporation and technology\ncompany focusing on online advertising, search engine\ntechnology, cloud computing, computer software,\nquantum computing, e-commerce, consumer\nelectronics, and artificial intelligence (AI).[9] It has\nbeen referred to as "the most powerful company in the The Google logo used since 2015\nworld" by the BBC[10] and is one of the world\'s most\nvaluable brands.[11][12][13] Google\'s parent company,\nAlphabet Inc., is one of the five Big Tech companies\nalongside Amazon, Apple, Meta, and Microsoft.\n\nGoogle was founded on September 4, 1998, by\nAmerican computer scientists Larry Page and Sergey\nBrin. Together, they own about 14% of its publicly\nlisted shares and control 56% of its stockholder voting\npower through super-voting stock. The company went\npublic via an initial public offering (IPO) in 2004. In 

### Text Splitting (Chunks)

In [10]:
def split_documents(documents, chunk_size=800, chunk_overlap = 0):
    """Split documents into smaller chunks with overlap"""
    print("Splitting documents into chunks...")

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunks = text_splitter.split_documents(documents)

    if chunks:
        for i, chunk in enumerate(chunks[:5]):
            print(f"\n --- Chunk {i+1} ---")
            print(f"Source: {chunk.metadata['source']}")
            print(f"Length: {len(chunk.page_content)} characters.")
            print(f"Content:\n{chunk.page_content}")
            print("-" * 50)
        if len(chunks)>5:
            print(f"\n... and {len(chunks)-5} more chunks.")
    return chunks

In [12]:
chunks = split_documents(documents)

Created a chunk of size 1436, which is longer than the specified 800
Created a chunk of size 924, which is longer than the specified 800
Created a chunk of size 815, which is longer than the specified 800
Created a chunk of size 1039, which is longer than the specified 800
Created a chunk of size 1078, which is longer than the specified 800
Created a chunk of size 1043, which is longer than the specified 800
Created a chunk of size 880, which is longer than the specified 800
Created a chunk of size 1019, which is longer than the specified 800
Created a chunk of size 1068, which is longer than the specified 800
Created a chunk of size 1211, which is longer than the specified 800
Created a chunk of size 959, which is longer than the specified 800
Created a chunk of size 888, which is longer than the specified 800
Created a chunk of size 864, which is longer than the specified 800
Created a chunk of size 820, which is longer than the specified 800
Created a chunk of size 1450, which is lo

Splitting documents into chunks...

 --- Chunk 1 ---
Source: docs/Microsoft.txt
Length: 541 characters.
Content:
﻿Microsoft
Microsoft Corporation is an American multinational Microsoft Corporation
corporation and technology conglomerate
headquartered in Redmond, Washington.[2] Founded
in 1975, the company became influential in the rise of
personal computers through software like Windows,
and the company has since expanded to Internet Logo used since 2012
services, cloud computing, video gaming and other
fields. Microsoft is the largest software maker, one of
the most valuable public U.S. companies,[a] and one
of the most valuable brands globally.
--------------------------------------------------

 --- Chunk 2 ---
Source: docs/Microsoft.txt
Length: 778 characters.
Content:
Microsoft was founded by Bill Gates and Paul Allen
to develop and sell BASIC interpreters for the Altair
8800. It rose to dominate the personal computer
operating system market with MS-DOS in the mid-
1980s, followed