This version uses Milvus through Docker Compose, so you must have Docker installed to run this notebook. Milvus is started with `docker compose up -d`, as shown in the cell below.

In [51]:
# One-time setup: uncomment and run the commands below to install the Python
# dependencies and start Milvus with Docker Compose. When run from a notebook
# cell, the docker command also needs a leading `!`.
# NOTE(review): `dotenv` (python-dotenv) is imported later in this notebook but
# is not in the install list below -- confirm whether it should be added.
# ! pip install pymilvus milvus langchain sentence-transformers tiktoken octoai-sdk
# docker compose up -d

In [52]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint

In [53]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Read the token once and fail fast with an actionable message. Previously a
# missing variable made `os.environ[...] = None` raise an opaque
# "TypeError: str expected, not NoneType".
OCTOAI_API_TOKEN = os.environ.get("OCTOAI_API_TOKEN")
if not OCTOAI_API_TOKEN:
    raise RuntimeError(
        "OCTOAI_API_TOKEN is not set; export it or add it to a .env file "
        "before running this notebook."
    )

# Keep the token available through os.environ, as the original cell did.
os.environ["OCTOAI_API_TOKEN"] = OCTOAI_API_TOKEN

In [54]:
# Instruction-style prompt with a single placeholder, {question}.
template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    " Instruction:\n"
    "{question}\n"
    " Response: "
)
prompt = PromptTemplate.from_template(template)

In [55]:
# LLM client targeting the OctoAI chat-completions endpoint.
llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    model_kwargs=dict(
        model="mixtral-8x7b-instruct-fp16",
        max_tokens=128,
        presence_penalty=0,
        temperature=0.01,
        top_p=0.9,
        # System message supplied alongside the generation settings.
        messages=[
            dict(
                role="system",
                content="You are a helpful twitter analyzer assistant. Keep your responses limited to one short paragraph if possible.",
            ),
        ],
    ),
)

In [56]:
question = "Give me a tweet about elon musk"

# Wire the instruction prompt to the LLM.
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Name the prompt's input variable explicitly and print the generated text.
print(llm_chain.invoke({"question": question})["text"])

 "Exciting to see @elonmusk pushing boundaries yet again with his latest innovation. The future of technology is looking brighter than ever! #ElonMusk #Innovation #FutureTechnology"


In [57]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus

In [58]:
# Embedding client pointed at the OctoAI embeddings endpoint.
embeddings = OctoAIEmbeddings(
    endpoint_url="https://text.octoai.run/v1/embeddings",
)

In [59]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [60]:
# Names of the entries in ./data, resolved relative to the current working directory.
files = os.listdir("./data")

In [61]:
# Accumulates the Document chunks built from the files in ./data by the loop
# in the following cell.
file_texts = []

In [62]:
# Chunks with at least this many characters are left out of the index.
MAX_CHUNK_CHARS = 10000

# Build the splitter once: its settings do not depend on the file being read,
# so there is no need to recreate it on every loop iteration.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=1024, chunk_overlap=256,
)

for file in files:
    # Skip the macOS Finder metadata file.
    if file == ".DS_Store":
        continue

    # Decode explicitly as UTF-8: the tweets contain non-ASCII characters
    # (curly quotes, emoji), and the platform default encoding is not UTF-8
    # everywhere.
    with open(f"./data/{file}", encoding="utf-8") as f:
        file_text = f.read()

    twitter_user = file.split(".")[0]
    # Speaker prefix added to every chunk. Any file other than JoeBiden.txt is
    # attributed to Donald Trump, so ./data is expected to hold only those two
    # accounts.
    name = "Joe Biden tweeted: " if file == "JoeBiden.txt" else "Donald Trump tweeted: "

    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        # chunk_num is the chunk's position among all chunks of the file,
        # including any that are skipped for being too long.
        if len(chunked_text) < MAX_CHUNK_CHARS:
            file_texts.append(
                Document(
                    page_content=name + chunked_text,
                    metadata={"twitter_user": twitter_user, "chunk_num": i},
                )
            )

Created a chunk of size 1248, which is longer than the specified 1024
Created a chunk of size 1251, which is longer than the specified 1024
Created a chunk of size 1094, which is longer than the specified 1024
Created a chunk of size 2223, which is longer than the specified 1024
Created a chunk of size 1212, which is longer than the specified 1024
Created a chunk of size 1068, which is longer than the specified 1024
Created a chunk of size 2034, which is longer than the specified 1024
Created a chunk of size 1256, which is longer than the specified 1024
Created a chunk of size 1089, which is longer than the specified 1024
Created a chunk of size 1101, which is longer than the specified 1024
Created a chunk of size 1539, which is longer than the specified 1024
Created a chunk of size 1096, which is longer than the specified 1024
Created a chunk of size 5866, which is longer than the specified 1024
Created a chunk of size 1600, which is longer than the specified 1024


In [63]:
# Embed every chunk and store it in the "twitter_user" collection of the Milvus
# instance listening on localhost:19530.
# drop_old=True recreates the collection on each run; without it, re-running
# this cell appends the same documents again and retrieval returns duplicates.
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="twitter_user",
    drop_old=True,
)

In [64]:
# Display the first Document to check the speaker prefix and the chunk text.
file_texts[0]

Document(page_content="Joe Biden tweeted: content\nRepresentatives of the people took a vote on the floor of the United States Congress to say loudly and clearly:\n\nLove is love. \nRight is right.\nJustice is justice.\nThanks to the Inflation Reduction Act, out-of-pocket expenses for prescription drug prices will be capped at no more than $2,000 a year for Medicare recipients.\nWhen hospitals and community centers are threatened and intimidated because they support LGBTQI+ children and families, we have to speak out. We must stop hate and violence.\nThe American people won.\nBig Pharma lost.\n\nMedicare will have the power to negotiate for lower prescription drug prices.\nWe are making the largest investment ever in combating the climate crisis through the Inflation Reduction Act.\nWe’re building an economy from the bottom up and the middle out. \n\nAn economy with good jobs and good wages for the long run.\nI had some MAJOR ANNOUNCEMENTS the last couple of weeks, too…\n\n✔️ Inflation

In [65]:
# Expose the vector store as a retriever; no search options are overridden here.
retriever = vector_store.as_retriever()

In [66]:
# RAG prompt: the model is told to answer using only the supplied context.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)

In [67]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, such as
            the Documents built earlier in this notebook.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve documents for the question, render them as plain text,
# fill the prompt, call the LLM, and return the answer as a string.
# Passing the retriever output straight into the prompt would interpolate the
# repr of a list of Document objects (metadata and escaped newlines included),
# so the documents are formatted first.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [68]:
# Run the retrieval chain on a sample question; the returned string is shown as the cell output.
chain.invoke("What kind of twitter post does donald trump post")

" Based on the provided context, Donald Trump's Twitter posts often contain attacks on his opponents, claims of his success, and criticisms of policies he disagrees with. He is also criticized in some of Joe Biden's tweets for not releasing his tax returns, having no moral compass, and undermining registered apprenticeships. Trump is also accused of representing an existential threat to the future of the country and being a threat to national security."