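This version uses Milvus through Docker Compose, so you must have Docker installed to run this notebook. Milvus is started with `docker compose up -d`, as shown in the setup cell below. Note that the vector-store cell further down connects through `Zilliz` and reads `ZILLIZ_ENDPOINT` and `ZILLIZ_API_TOKEN` from the environment, so those variables must be set as well (along with `OCTOAI_API_TOKEN`).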

In [None]:
# One-time setup: uncomment and run the two lines below to install the Python
# dependencies and start the Milvus containers.
# NOTE(review): python-dotenv, langchain-community and langchain-text-splitters are
# imported further down but are not in this install list — confirm they get installed.
# ! pip install pymilvus milvus langchain sentence-transformers tiktoken octoai-sdk snscrape streamlit
# ! docker compose up -d

In [158]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint

In [159]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the token once and fail fast with an actionable message.
# The previous `os.environ[...] = os.getenv(...)` round-trip did nothing when the
# variable was set and raised an opaque `TypeError: str expected, not NoneType`
# when it was missing.
OCTOAI_API_TOKEN = os.environ.get("OCTOAI_API_TOKEN")
if not OCTOAI_API_TOKEN:
    raise RuntimeError(
        "OCTOAI_API_TOKEN is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [160]:
# Generic instruction-style prompt with a single `{question}` placeholder.
# Built from adjacent string literals so each line of the rendered prompt is
# visible on its own; the resulting text is unchanged.
template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    " Instruction:\n"
    "{question}\n"
    " Response: "
)
prompt = PromptTemplate.from_template(template)

In [161]:
# LLM client for the OctoAI chat-completions endpoint. The model name, sampling
# settings and the system message are all supplied through `model_kwargs`.
llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    octoai_api_token=os.getenv("OCTOAI_API_TOKEN"),
    model_kwargs=dict(
        model="nous-hermes-2-mixtral-8x7b-dpo-fp16",
        max_tokens=4000,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.92,
        messages=[
            dict(
                role="system",
                content="You are a helpful twitter analyzer assistant. You will be given 2 twitter accounts to analyze based on user prompt. Keep your responses limited to one short paragraph if possible.",
            ),
        ],
    ),
)

In [162]:
# Smoke test: send one question through the plain prompt -> LLM chain
# (no retrieval yet) and print the generated text.
question = "Give me a tweet about elon musk"
llm_chain = LLMChain(llm=llm, prompt=prompt)

response = llm_chain.invoke(question)
print(response["text"])

"Just witnessed Elon Musk's latest SpaceX launch - the future of space exploration is truly inspiring! #SpaceX #ElonMusk"


In [163]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus, Zilliz

In [164]:
# Embedding client pointed at the OctoAI embeddings endpoint; it is handed to the
# vector store below to embed the tweet chunks.
# NOTE(review): no token is passed here — presumably it is read from the
# OCTOAI_API_TOKEN environment variable; confirm.
embeddings = OctoAIEmbeddings(endpoint_url="https://text.octoai.run/v1/embeddings")

In [165]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [166]:
# Names (not full paths) of every entry in ./data, relative to the current working
# directory. The loop below reads each entry as a text file of tweets.
files = os.listdir("./data")

In [167]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Directory the tweet files are read from. This matches the directory that was
# listed into `files`, replacing a machine-specific absolute path that only
# worked on the original author's computer.
DATA_DIR = "./data"

# The splitter uses the same settings for every file, so build it once instead of
# on each loop iteration. Chunk sizes are measured in cl100k_base tokens.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=2000,
    chunk_overlap=200,
)

# One Document per chunk, across all accounts.
file_texts = []

for file in files:
    file_path = os.path.join(DATA_DIR, file)

    # Skip hidden entries (e.g. .DS_Store) and anything that is not a regular
    # file, since open() would fail on a directory.
    if file.startswith(".") or not os.path.isfile(file_path):
        continue

    # Decode explicitly as UTF-8 so non-ASCII tweet text (emoji, accents) reads
    # the same on every platform.
    with open(file_path, encoding="utf-8") as f:
        file_text = f.read()

    texts = text_splitter.split_text(file_text)

    # NOTE(review): every file other than JoeBiden.txt is attributed to
    # Donald Trump. Adding a third account requires extending this mapping.
    name = "Joe Biden" if file == "JoeBiden.txt" else "Donald Trump"

    # Prefix each chunk with its author so the attribution stays attached to
    # the text, and keep the author and chunk index as metadata.
    for i, chunked_text in enumerate(texts):
        file_texts.append(
            Document(
                page_content=name + " tweeted: " + chunked_text,
                metadata={"tweet_author": name, "chunk_num": i},
            )
        )
        

In [None]:
# Debug aid: dump the string form of the whole `file_texts` list to disk so the
# chunking result can be inspected by hand. For a more readable dump, write each
# doc.page_content followed by a blank line instead.
with open('output_file.txt', mode='w', encoding='utf-8') as dump_file:
    print(file_texts, end='', file=dump_file)

In [170]:
# Connection settings for Zilliz are read from the environment.
zilliz_endpoint = os.getenv("ZILLIZ_ENDPOINT")
zilliz_api_token = os.getenv("ZILLIZ_API_TOKEN")

# Fail fast with a clear message. Concatenating a missing (None) value into a
# string would otherwise raise an unhelpful TypeError.
missing_vars = [
    var_name
    for var_name, value in (
        ("ZILLIZ_ENDPOINT", zilliz_endpoint),
        ("ZILLIZ_API_TOKEN", zilliz_api_token),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# Show which endpoint is in use, but never echo the token: cell outputs are saved
# inside the notebook file and would leak the secret when it is shared or committed.
print("ZILLIZ_ENDPOINT = " + zilliz_endpoint)
print("ZILLIZ_API_TOKEN = <redacted>")

# Embed every chunk and insert it into the `user_tweets` collection.
vector_store = Zilliz.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"uri": zilliz_endpoint, "token": zilliz_api_token},
    collection_name="user_tweets",
    auto_id=True,
)

In [None]:
# Display the first chunked Document to sanity-check its page_content and metadata.
file_texts[0]

In [None]:
# Expose the vector store as a retriever for the chain below. No arguments are
# passed, so the library's default search settings apply.
retriever = vector_store.as_retriever()

In [None]:
# Prompt for the retrieval chain: `{context}` receives the retrieved tweet chunks
# and `{question}` the user's query.
# NOTE: this rebinds `template` and `prompt` from the earlier instruction-prompt
# cell, so re-running older cells after this one will pick up this prompt.
template = (
    "Answer the question based only on the following user provided data:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Inputs to the prompt: the raw question is sent to the retriever to fill
# `context`, and is also passed through unchanged as `question`.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the prompt -> call the LLM -> return plain text.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Ask a question through the retrieval chain; the returned string is shown as the
# cell's output.
chain.invoke("What is the stance of biden in border policy based on his tweet?")