In [1]:
import docx

def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    fulltext_str = ' '.join(fullText)
    return fulltext_str

In [2]:
file_path = r'E:\Others\document_assistant\sample_ML_DOC.docx'

fulltext_str = getText(file_path)
print(fulltext_str)

Supplementary	Material	–	Machine Learning Machine Learning Overview   Machine learning involves using computers and algorithms to process large amounts of data (observations, patient characteristics, and measurements) and identify patterns without explicit human programming.1 The strength of machine learning is its ability to sieve through massive amounts of data to find new information and insights by iteratively improving its model without assumed relationships. Since the methods perform without explicit programming, the results require an inspection from a human expert to determine whether the algorithms are performing as expected. Interpretable machine learning algorithms can simplify this task. Machine learning algorithms can model and provide insights into a very wide range of data, including genomics,2–4 images,5–7 sound recordings,8,9 vital signs,10 and electronic health records data collected in primary,11,12 secondary,13 and tertiary care.14  Machine learning is an umbrella t

##### Cleaning text

In [3]:
import re

def remove_headers_footers(text, header_patterns=None, footer_patterns=None):
    if header_patterns is None:
        header_patterns = [r'^.*Header.*$']
    if footer_patterns is None:
        footer_patterns = [r'^.*Footer.*$']
    for pattern in header_patterns + footer_patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text.strip()

def remove_special_characters(text, special_chars=None):
    if special_chars is None:
        special_chars = r'[^A-Za-z0-9\s\.,;:\'\"\?\!\-]'
    text = re.sub(special_chars, '', text)
    return text.strip()

def remove_repeated_substrings(text, pattern=r'\.{2,}'):
    text = re.sub(pattern, '.', text)
    return text.strip()

def remove_extra_spaces(text):
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_text(text):
    # Remove headers and footers
    text = remove_headers_footers(text)
    # Remove special characters
    text = remove_special_characters(text)
    # Remove repeated substrings like dots
    text = remove_repeated_substrings(text)
    # Remove extra spaces between lines and within lines
    text = remove_extra_spaces(text)
    # Additional cleaning steps can be added here
    return text.strip()


cleaned_text = preprocess_text(fulltext_str)
print(cleaned_text)

Supplementary Material Machine Learning Machine Learning Overview Machine learning involves using computers and algorithms to process large amounts of data observations, patient characteristics, and measurements and identify patterns without explicit human programming.1 The strength of machine learning is its ability to sieve through massive amounts of data to find new information and insights by iteratively improving its model without assumed relationships. Since the methods perform without explicit programming, the results require an inspection from a human expert to determine whether the algorithms are performing as expected. Interpretable machine learning algorithms can simplify this task. Machine learning algorithms can model and provide insights into a very wide range of data, including genomics,24 images,57 sound recordings,8,9 vital signs,10 and electronic health records data collected in primary,11,12 secondary,13 and tertiary care.14 Machine learning is an umbrella term, cons

#### Text chunking


In [4]:
# def chunk_document(document, chunk_size):
#     return [document[i:i+chunk_size] for i in range(0, len(document), chunk_size)]

# text = "This is a sample document for chunking demonstration."
# chunks = chunk_document(text, 10)
# print(chunks)

In [5]:
# import nltk
# nltk.download('punkt_tab')
# nltk.download('punkt')

# def fixed_size_chunking(text, chunk_size):
#     sentences = nltk.sent_tokenize(text)
#     chunks = []
#     current_chunk = ""
    
#     for sentence in sentences:
#         if len(current_chunk) + len(sentence) <= chunk_size:
#             current_chunk += sentence + " "
#         else:
#             chunks.append(current_chunk.strip())
#             current_chunk = sentence + " "
    
#     if current_chunk:
#         chunks.append(current_chunk.strip())
    
#     return chunks

# text = "This is a sample document. It contains multiple sentences. We will chunk it using fixed-size method."
# chunks = fixed_size_chunking(text, 50)
# print(chunks)

In [6]:
# def recursive_chunking(text, max_chunk_size, min_chunk_size):
#     if len(text) <= max_chunk_size:
#         return [text]
    
#     mid = len(text) // 2
#     left_chunk = text[:mid]
#     right_chunk = text[mid:]
    
#     if len(left_chunk) < min_chunk_size or len(right_chunk) < min_chunk_size:
#         return [text]
    
#     return recursive_chunking(left_chunk, max_chunk_size, min_chunk_size) + \
#            recursive_chunking(right_chunk, max_chunk_size, min_chunk_size)

# text = "This is a longer document for recursive chunking demonstration. It contains multiple sentences and paragraphs."
# chunks = recursive_chunking(text, 50, 20)
# print(chunks)


In [7]:
# import re

# def document_based_chunking(text):
#     paragraphs = re.split(r'\n\s*\n', text)
#     chunks = []
    
#     for paragraph in paragraphs:
#         sentences = nltk.sent_tokenize(paragraph)
#         current_chunk = ""
        
#         for sentence in sentences:
#             if len(current_chunk) + len(sentence) <= 100:
#                 current_chunk += sentence + " "
#             else:
#                 chunks.append(current_chunk.strip())
#                 current_chunk = sentence + " "
        
#         if current_chunk:
#             chunks.append(current_chunk.strip())
    
#     return chunks

# text = """This is the first paragraph of the document.
# It contains multiple sentences.

# This is the second paragraph.
# It also has multiple sentences for demonstration."""

# chunks = document_based_chunking(fulltext_str)
# print(chunks)

In [8]:
# def sentence_based_chunking(text, max_sentences):
#     sentences = nltk.sent_tokenize(text)
#     chunks = []
#     current_chunk = []
    
#     for sentence in sentences:
#         if len(current_chunk) < max_sentences:
#             current_chunk.append(sentence)
#         else:
#             chunks.append(' '.join(current_chunk))
#             current_chunk = [sentence]
    
#     if current_chunk:
#         chunks.append(' '.join(current_chunk))
    
#     return chunks

# text = "This is the first sentence. This is the second one. Here's the third. And a fourth. Let's add a fifth."
# chunks = sentence_based_chunking(fulltext_str, 2)
# print(chunks)

##### Text chunking with langchain

In [9]:
# from dotenv import load_dotenv
# import os

# load_dotenv()  # Load variables from .env file
# api_key = os.getenv("OPENAI_API_KEY")
# print(api_key)

In [10]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [11]:
# text_splitter = RecursiveCharacterTextSplitter(
#     # Set a really small chunk size, just to show.
#     chunk_size=100,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
# )

In [12]:
# text_splitter = SemanticChunker(OpenAIEmbeddings())

In [13]:
# import spacy
# from langchain.text_splitter import SpacyTextSplitter
# spacy.load('en_core_web_sm')

# text = "Your long document text here. It can be in various languages. SpaCy will handle the linguistic nuances."

# splitter = SpacyTextSplitter(
#     chunk_size=150,
#     chunk_overlap=20
# )

# chunks = splitter.split_text(cleaned_text)

In [14]:
# from langchain.text_splitter import SentenceTransformersTextSplitter
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter

text = "Your long document text here. This splitter will attempt to create semantically coherent chunks."

splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=20,
    tokens_per_chunk=100
)

chunks = splitter.split_text(cleaned_text)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
doc_splits = splitter.create_documents(chunks)

In [16]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
vectorstore = FAISS.from_documents(doc_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))

  vectorstore = FAISS.from_documents(doc_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))


In [18]:
from dotenv import load_dotenv
import os

load_dotenv()  # Load variables from .env file
api_key = os.getenv("GROQ_API_KEY")
print(api_key)

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


In [19]:
from langchain_groq import ChatGroq
llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

In [26]:
from langchain.chains import ConversationalRetrievalChain

# Query against your own data
chain = ConversationalRetrievalChain.from_llm(llm,
                                              vectorstore.as_retriever(),
                                              return_source_documents=True)

# no chat history passed
result = chain({"question": "What is relationship between Russia and USA", "chat_history": []})
result['answer']

"I don't have information about the relationship between Russia and USA in the context of the provided text. The text appears to be discussing machine learning and artificial intelligence concepts, such as confusion matrices, naive Bayes, and reinforcement learning. It does not mention Russia or the USA. If you're looking for information on the relationship between Russia and the USA, I'd be happy to try and help you with that, but it would require a different context or topic."

In [2]:
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage, SystemMessage

load_dotenv()

model = ChatGroq(model_name="llama3-8b-8192")
messages = [SystemMessage("Translate the following sentence into French"),
            HumanMessage("Hi! How are you doing today?")]

model.invoke(messages)

AIMessage(content="Bonjour ! Comment allez-vous aujourd'hui ?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 29, 'total_tokens': 39, 'completion_time': 0.008333333, 'prompt_time': 0.011049525, 'queue_time': 0.03741416, 'total_time': 0.019382858}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_6a6771ae9c', 'finish_reason': 'stop', 'logprobs': None}, id='run-98e5f5e6-095a-472c-8790-dd136ac24b65-0', usage_metadata={'input_tokens': 29, 'output_tokens': 10, 'total_tokens': 39})

In [3]:
from langchain_core.prompts import ChatPromptTemplate

system_message = "Translate the following sentence into {language}"

prompt_template = ChatPromptTemplate.from_messages([("system",system_message),("user",{text})])

In [6]:
prompt = prompt_template.invoke({"language":"Spanish",'text':"shoot the ball!"})
prompt

ChatPromptValue(messages=[SystemMessage(content='Translate the following sentence into Spanish', additional_kwargs={}, response_metadata={}), HumanMessage(content='Shoot the ball', additional_kwargs={}, response_metadata={})])

In [7]:
prompt.to_messages()

[SystemMessage(content='Translate the following sentence into Spanish', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Shoot the ball', additional_kwargs={}, response_metadata={})]

In [8]:
response = model.invoke(prompt)
print(response.content)

Dispara el balón.


#### RAG with langchain

In [14]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document as LangchainDocument
from langchain import hub
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_groq import ChatGroq

In [15]:
from dotenv import load_dotenv
import os

load_dotenv()  # Load variables from .env file
api_key = os.getenv("GROQ_API_KEY")

In [16]:
llm = ChatGroq(model="llama3-8b-8192")

In [3]:
pdf_file_path = r'E:\Others\document_assistant\nke-10k-2023.pdf'

loader = PyPDFLoader(pdf_file_path)

documents = loader.load()

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

In [6]:
len(chunks)

460

In [8]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
document_chunk = [LangchainDocument(page_content=chunk.page_content) for chunk in chunks]

vectorstore = FAISS.from_documents(document_chunk, embeddings)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Index chunks
_ = vectorstore.add_documents(documents=chunks)

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vectorstore.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}



In [12]:
# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [18]:
response = graph.invoke({"question": "What do Nike sell?"})
print(response["answer"])

Nike sells athletic footwear, apparel, equipment, accessories, and services.
