In [1]:
# Load web page
import argparse

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embed and store
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings, OpenAIEmbeddings
from langchain.embeddings import OllamaEmbeddings # We can also try Ollama embeddings

from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.indexes import VectorstoreIndexCreator

from langchain_qdrant import QdrantVectorStore




loader = PyPDFLoader('../..//data/raw/motor_neuron_disease.pdf')
data = loader.load()

# Split into chunks 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")
# vectorstore = Chroma.from_documents(documents=all_splits[:5], embedding=OllamaEmbeddings(model='phi3'))

# qdrant = QdrantVectorStore.from_documents(
#     all_splits[:5],
#     OllamaEmbeddings(model='phi3'),
#     path="local_qdrant",
#     collection_name="my_documents",
# )



Split into 225 chunks


In [3]:
from langchain_community.llms import Ollama
llm = Ollama()

In [5]:
llm.invoke("hi")

OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull llama2`.

In [2]:
qdrant = QdrantVectorStore.from_existing_collection(
    embedding=OllamaEmbeddings(model='phi3'),
    collection_name="my_documents",
    path='local_qdrant'

)


In [3]:
retriever = qdrant.as_retriever()

In [4]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

prompt = hub.pull("rlm/rag-prompt")

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="phi3",
    temperature=0,
    # other params...
)

# llm = ChatOllama(base_url= 'http://localhost:11434', model="phi3")

In [8]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is the name of the book?")


' The name of the book is "Motor neurone disease: the use of non-invasive ventilation in the management of motor neurone disease - NICE clinical guideline CG105."'

In [1]:
# Load web page
import argparse

from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embed and store
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings, OpenAIEmbeddings
from langchain.embeddings import OllamaEmbeddings # We can also try Ollama embeddings

from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.indexes import VectorstoreIndexCreator

from langchain_qdrant import QdrantVectorStore




loader = Docx2txtLoader('../..//data/raw/podcast_transcript.docx')
data = loader.load()

# Split into chunks 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")
# vectorstore = Chroma.from_documents(documents=all_splits[:5], embedding=OllamaEmbeddings(model='phi3'))

qdrant = QdrantVectorStore.from_documents(
    all_splits,
    OllamaEmbeddings(model='llama3'),
    path="local_qdrant_podcast",
    collection_name="podcast2",
)



Split into 34 chunks


In [1]:
from langchain_qdrant import QdrantVectorStore
from langchain.embeddings import OllamaEmbeddings

qdrant = QdrantVectorStore.from_existing_collection(collection_name="podcast2", embedding=OllamaEmbeddings( model='llama3'), path="local_qdrant_podcast" )
retriever = qdrant.as_retriever()

In [20]:
# retriever = qdrant.as_retriever()
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

prompt = hub.pull("rlm/rag-prompt")
print(prompt)
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3",
    temperature=0,
    
    # other params...
)
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

llm.invoke("hello")
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("Give me a title for the podcast episode you just received?")



input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


AIMessage(content="Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?", response_metadata={'model': 'llama3', 'created_at': '2024-07-28T19:46:41.66811425Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 6722614127, 'load_duration': 43087583, 'prompt_eval_count': 11, 'prompt_eval_duration': 2849224000, 'eval_count': 26, 'eval_duration': 3827461000}, id='run-b7920a1f-6d74-4d37-8d34-4271bead54de-0', usage_metadata={'input_tokens': 11, 'output_tokens': 26, 'total_tokens': 37})

In [5]:
rag_chain.invoke("from the podcast, create five key takeaway that i can add to a linkeedIn podcast.")

'Based on the retrieved context, it seems that the conversation is about data governance, quality, and analysis. Here are some key points:\n\n* Data stewards are responsible for ensuring data quality and improving it.\n* The value derived from data is important to consider.\n* SQL is necessary for data analysis, but not everyone needs to be an expert in it.\n* The team has a standardized set of analyses that they run for all countries using Python scripts.\n\nSome potential questions that could be answered based on this context include:\n\n* What are the responsibilities of a data steward?\n* How can data quality be improved?\n* What is the value of data analysis in a business or organization?\n\nPlease let me know if you have any specific question you would like me to answer!'

In [None]:
prompt = """
I want to know the title 
I want to have a linkedin post 
i want to have a key takeaway for people who don't have time to watch the episode 
I want to summarize chapters based on time

"""

In [6]:
from langchain_community.document_loaders import UnstructuredFileIOLoader
import io

In [13]:
with open("../../data/raw/podcast_transcript.docx", "rb") as file:
    file_content = file.read()
    loader = UnstructuredFileIOLoader(io.BytesIO(file_content))
    docs = loader.lazy_load()
    doc = next(docs)



ImportError: failed to find libmagic.  Check your installation

In [11]:
loader.lazy_load()

<generator object UnstructuredBaseLoader.lazy_load at 0x31ad9d6d0>

In [None]:
# trying 

# trying brute force chunking

In [22]:
from typing import List, Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at identifying important keynotes in a podcast conversation. "
            "Only extract important and relevant keypoints that might be interesting for business people and aspiring data scientists. Extract nothing if no important information can be found in the text.",
        ),
        ("human", "{text}"),
    ]
)


class Keynotes(BaseModel):
    """Information about a the important keynotes from the podcast"""

    description: str = Field(
        ..., description="what is the important takeaway ?"
    )
    evidence: str = Field(
        ...,
        description="Repeat in verbatim the sentence(s) from which the time of the conversation and actual text was taken",
    )

class ExtractionData(BaseModel):
    """Extracted information about key notes from the podcast."""

    key_developments: List[Keynotes]



In [24]:
from langchain_community.llms import Ollama

llm = Ollama(
    model='llama3'
    # other params...
)

extractor = prompt | llm

In [31]:

from langchain.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


loader = Docx2txtLoader('../..//data/raw/podcast_transcript.docx')
data = loader.load()

# Split into chunks 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100000, chunk_overlap=20)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")

Split into 1 chunks


In [32]:
all_splits

[Document(metadata={'source': '../..//data/raw/podcast_transcript.docx'}, page_content="🎧 Data Tales Recording Session 🎙️-20240617_154331-Meeting Recording\n\nJune 17, 2024, 1:43PM\n\n1h 9m 41s\n\n\nBaghdadlian, Serop started transcription\n\n\nBaghdadlian, Serop   0:03\nAlright, so I will just start recording.\n\n\nIllig, Selma   0:03\nIt's gonna be.\n\n\nBaghdadlian, Serop   0:06\nBy the way, and at some point we would just start, so don't worry, it doesn't start immediately.\nSo from my experience, I know I sent you a structure where we say like introduction in the beginning.\n\n\nIllig, Selma   0:12\nOK.\n\n\nBaghdadlian, Serop   0:19\nOh, I lost you again.\nWait, wait.\n\n\nIllig, Selma   0:21\nNo.\n\n\nBaghdadlian, Serop   0:22\nNo, you're here.\nPerfect.\n\n\nIllig, Selma   0:23\nOK, I'm here.\n\n\nBaghdadlian, Serop   0:23\nUh, how is?\nHow is the background?\nBy the way, how's your background?\n\n\nIllig, Selma   0:27\nI mean, it's just a should I unblurred I can do it?\n\n\nB

In [33]:
# Limit just to the first 3 chunks
# so the code can be re-run quickly
first_few = all_splits[:1]

extractions = extractor.batch(
    [{"text": text} for text in first_few],
    {"max_concurrency": 5},  # limit the concurrency by passing max concurrency!
)

In [34]:
extractions[0]

"I've extracted the important keynotes from this podcast conversation:\n\n**Keynote 1:** The field of data science is becoming increasingly fluid, and it's not just for people with traditional backgrounds. Anyone can learn and transition into a career in data science.\n\n**Keynote 2:** Soft skills are crucial in data science. You need to be able to understand what others want or push back if something doesn't make sense. It's not just about hacking code or developing software.\n\n**Keynote 3:** Not all engineers need to be experts in complex algorithms. Many can focus on collaborating with teams, developing new software features, and optimizing existing ones.\n\n**Keynote 4:** You don't need to be an expert in everything to achieve your goals. Collaboration is key, and it's okay to focus on specific areas or build up skills over time.\n\nThese keynotes highlight the importance of soft skills, collaboration, and adaptability in a career in data science."

In [17]:
key_developments = []

for extraction in extractions:
    key_developments.extend(extraction.key_developments)

key_developments[:10]

AttributeError: 'str' object has no attribute 'key_developments'