In [None]:
# Target settings for this notebook. REPO seeds the working `repo` variable
# assigned in the following cell.
ORG = "NVIDIA"
REPO = "spark-rapids"
# Logins presumably treated as automation accounts; not referenced in the
# cells visible here (TODO confirm where this list is consumed).
BOTS = ["dependabot[bot]", "GPUtester", "github-actions[bot]"]
import os  # noqa: E402

# os.environ.get() returns its default for a missing key rather than raising
# KeyError, so the empty-string fallback is supplied as the default. This keeps
# OPENAI_API_KEY a str whether or not the variable is set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Working repository name; later cells interpolate it into file names such as
# f"{repo}_issue_summary.parquet".
repo = REPO

In [None]:
    import pandas as pd
    from langchain_experimental.agents import create_pandas_dataframe_agent
    from langchain_openai import OpenAI as OpenAI_langchain
    from langchain_openai import OpenAIEmbeddings
    from pymilvus import MilvusClient

In [None]:
    # The full issue text is too large for the agent's context, so it (and the
    # other columns listed here) is removed from the summary table; the text is
    # served from the vector database instead.
    dropped_columns = [
        "issue_text",
        "LLM_title_subject",
        "label_names",
        "issue_text_tokens",
        "issue_created_at",
        "issue_updated_at",
        "issue_reactions.+1",
        "issue_reactions.-1",
        "issue_reactions.laugh",
        "issue_reactions.hooray",
        "issue_reactions.confused",
        "issue_reactions.heart",
        "issue_reactions.rocket",
        "issue_reactions.eyes",
        "issue_user.login_location_lat",
        "issue_user.login_location_lon",
    ]
    df_issues = pd.read_parquet(f"{repo}_issue_summary.parquet")
    df_issues = df_issues.drop(dropped_columns, axis="columns")

In [None]:
    # Map the raw column names onto more descriptive ones; repo-specific
    # names embed the current value of `repo`.
    descriptive_names = {
        "number": f"{repo}_issue_number",
        "title": f"{repo}_issue_title",
        "author_association": f"association_to_{repo}",
        "issue_reactions.total_count": "number_of_reactions_on_issue",
        "n_comments": "number_of_comments",
        "issue_user.login_email": "email",
        "issue_user.login_name": "name",
        "issue_user.login_company": "company",
        "issue_user.login_name_company": "name_company",
        "issue_user.login_location": "location",
        "issue_user.login_followers": "github_followers",
        "comment_reactions.total_count": "number_of_reactions_on_comments",
    }
    df_issues = df_issues.rename(columns=descriptive_names)

    # Tally of rows per distinct value in the renamed "company" column.
    company_counts = df_issues["company"].value_counts()
    print(company_counts)

In [None]:
# Show every row whose company value is exactly "bytedance".
is_bytedance = df_issues["company"] == "bytedance"
print(df_issues[is_bytedance])

In [None]:
# Overrides the earlier `repo` value: from here on the notebook works on
# "cudf", and the Milvus database path below is derived from it.
repo = "cudf"

In [None]:
# Bare expression: evaluates the repo name with hyphens replaced by
# underscores, the same form used in the Milvus file name below. The result is
# not stored.
repo.replace('-', '_')

In [None]:
# Open a MilvusClient on the repo-specific database path (hyphens in the repo
# name are replaced by underscores).
milvus_db_path = f"./milvus_{repo.replace('-', '_')}.db"
client = MilvusClient(milvus_db_path)

In [None]:
    import pickle
    import pandas as pd
    from langchain_openai import OpenAIEmbeddings
    from pymilvus import MilvusClient

In [None]:
# Repository this part of the notebook works on; the parquet file name, the
# Milvus database path and the collection name below are all derived from it.
repo = "cudf"

In [None]:
# Open a MilvusClient on the repo-specific database path (hyphens in the repo
# name are replaced by underscores).
milvus_db_path = f"./milvus_{repo.replace('-', '_')}.db"
client = MilvusClient(milvus_db_path)

In [None]:
# os.environ.get() returns its default for a missing key rather than raising
# KeyError, so the empty-string fallback is supplied as the default. This keeps
# OPENAI_API_KEY a str whether or not the variable is set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

In [None]:
    # Embed every issue's text and assemble the records that are later inserted
    # into Milvus: one dict per row with its number, vector, text and subject.
    df = pd.read_parquet(f"{repo}_issue_summary.parquet")
    embeddings_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    # Missing issue text is embedded as an empty string. A plain list of str is
    # passed rather than a NumPy object array.
    issue_texts = df["issue_text"].fillna("").tolist()
    embeddings = embeddings_model.embed_documents(issue_texts)  # ndocs x 1536
    if len(embeddings) != len(df):
        raise ValueError(
            f"expected {len(df)} embeddings, got {len(embeddings)}"
        )

    # Persist the raw vectors to disk alongside the notebook.
    with open(f"{repo}_embeddings.pkl", "wb") as f:
        pickle.dump(embeddings, f)

    # Pair rows with vectors by position. iterrows() yields index *labels*,
    # which only coincide with positions in `embeddings` when the frame has a
    # default 0..n-1 index; zip() is correct for any index.
    data = [
        {
            "id": number,
            "vector": vector,
            "text": text,
            "subject": subject,
        }
        for number, vector, text, subject in zip(
            df["number"], embeddings, df["issue_text"], df["LLM_title_subject"]
        )
    ]

In [None]:
    # Create the per-repo collection; its vector dimension is fixed at 1536.
    issue_collection = f"{repo.replace('-', '_')}_issue_text"
    client.create_collection(collection_name=issue_collection, dimension=1536)

In [None]:
# Insert the prepared records into the per-repo collection; the return value
# is bound to `_` so the cell displays nothing.
issue_collection = f"{repo.replace('-', '_')}_issue_text"
_ = client.insert(collection_name=issue_collection, data=data)