In [None]:
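# One-time setup: run these in a terminal, or uncomment the %pip line to install into this kernel.
# %pip install pymilvus milvus langchain langchain-community python-dotenv sentence-transformers tiktoken octoai-sdk snscrape streamlit
# docker compose up -d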

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint

In [None]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail early with an actionable message. The previous
# `os.environ[...] = os.getenv(...)` round-trip was a no-op when the variable
# was set and raised an opaque TypeError (str expected, not NoneType) when it
# was missing.
OCTOAI_API_TOKEN = os.environ.get("OCTOAI_API_TOKEN")
if not OCTOAI_API_TOKEN:
    raise RuntimeError(
        "OCTOAI_API_TOKEN is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [None]:
# Chat-completion client for OctoAI. The generation settings and the system
# message are passed to the endpoint wrapper via `model_kwargs`.
llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    octoai_api_token=os.getenv("OCTOAI_API_TOKEN"),
    model_kwargs=dict(
        model="nous-hermes-2-mixtral-8x7b-dpo-fp16",
        max_tokens=4000,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.92,
        messages=[
            dict(
                role="system",
                content=(
                    "You are a helpful twitter analyzer assistant. "
                    "You will be given a twitter account tweet to analyze based on user prompt. "
                    "Keep your responses limited to one short paragraph if possible."
                ),
            ),
        ],
    ),
)

In [None]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus, Zilliz

In [None]:
# Embedding client; it is handed to the vector store as the embedding
# function when documents are ingested and queried.
embeddings = OctoAIEmbeddings(
    endpoint_url="https://text.octoai.run/v1/embeddings",
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

# Read the aggregated tweets CSV; the author and like-count columns are
# requested as document metadata.
loader = CSVLoader(
    file_path="newData/tweetAggregated.csv",
    metadata_columns=["tweet_author", "like_count"],
)
clean_data = loader.load()

# Write the string form of the loaded documents to disk (presumably a
# snapshot for manual inspection).
with open("newData/cleaned_data.txt", mode="w", encoding="utf-8") as snapshot_file:
    snapshot_file.write(str(clean_data))

In [None]:
# Read the Zilliz connection settings once and fail early with a clear
# message if either is missing (concatenating None onto a string would
# otherwise raise an opaque TypeError).
zilliz_endpoint = os.getenv("ZILLIZ_ENDPOINT")
zilliz_api_token = os.getenv("ZILLIZ_API_TOKEN")
missing_settings = [
    name
    for name, value in (
        ("ZILLIZ_ENDPOINT", zilliz_endpoint),
        ("ZILLIZ_API_TOKEN", zilliz_api_token),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

print("ZILLIZ_ENDPOINT = " + zilliz_endpoint)
# Never echo the token itself: cell output is saved with the notebook and
# would leak the credential to anyone the file is shared with.
print("ZILLIZ_API_TOKEN = <set, hidden>")

# Embed the loaded documents and store them in the "donald_biden_tweets"
# collection.
# NOTE(review): with auto_id=True, re-running this cell presumably inserts the
# same documents again; confirm whether the collection should be reset first.
vector_store = Zilliz.from_documents(
    clean_data,
    embedding=embeddings,
    connection_args={"uri": zilliz_endpoint, "token": zilliz_api_token},
    collection_name="donald_biden_tweets",
    auto_id=True,
)

In [None]:
# Wrap the populated vector store in a retriever, using no extra search
# arguments.
retriever = vector_store.as_retriever()

In [None]:
# Ingest the same documents into a second collection, "biden_tweets", then
# wrap that store in a retriever and print its representation.
vst_new = Zilliz.from_documents(
    clean_data,
    embedding=embeddings,
    connection_args={
        "uri": os.getenv("ZILLIZ_ENDPOINT"),
        "token": os.getenv("ZILLIZ_API_TOKEN"),
    },
    collection_name="biden_tweets",
    auto_id=True,
)
retriever_new = vst_new.as_retriever()

print(retriever_new)

In [None]:
# Example: attach to a vector store collection that has already been
# initialized, instead of ingesting the documents again.

asd = Zilliz(
    embedding_function=embeddings,
    connection_args={
        "uri": os.getenv("ZILLIZ_ENDPOINT"),
        "token": os.getenv("ZILLIZ_API_TOKEN"),
    },
    collection_name="donald_biden_tweets",
)

In [None]:
# Prompt skeleton: the retrieved tweets fill {context}, the user's query
# fills {question}.
template = (
    "Answer the question based only on the following user provided data, "
    "the data consist of tweets from a single tweet author:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Retrieval-augmented pipeline: the input question is sent to the retriever
# (as "context") and passed through unchanged (as "question"), then the
# filled prompt goes to the LLM and the result is parsed to a plain string.
chain = {
    "context": asd.as_retriever(),
    "question": RunnablePassthrough(),
} | prompt | llm | StrOutputParser()

In [None]:
# Ask the chain a question; as the cell's last expression, the parsed answer
# is shown as the cell output.
# NOTE(review): the retriever is created without search arguments, so
# presumably only a handful of similar tweets reach the prompt. The answer to
# "most liked" may therefore not be the global maximum; verify.
chain.invoke("What is the most liked tweet from biden?")