In [None]:
# GitHub organisation and repository this notebook works on.
ORG, REPO = "dask", "dask"
# Lower-case alias used when building local file and folder names.
repo = REPO

In [None]:
    import pandas as pd

    # Logins whose issues are dropped from the analysis.
    BOTS = ["dependabot[bot]", "GPUtester", "github-actions[bot]"]

    # Columns kept from the issue-details CSV, in display order.
    # ("state", "locked" and "milestone" are deliberately left out.)
    issue_core_columns = (
        [
            "number",
            "title",
            "issue_text",
            "issue_user.login",
            "author_association",
            "label_names",
            "issue_created_at",
            "issue_updated_at",
        ]
        + [
            f"issue_reactions.{reaction}"
            for reaction in (
                "total_count",
                "+1",
                "-1",
                "laugh",
                "hooray",
                "confused",
                "heart",
                "rocket",
                "eyes",
            )
        ]
        + ["n_comments"]
    )

    # Read the CSV, keep only the core columns for rows not authored by a bot,
    # and renumber the index from 0.
    df_issues = (
        pd.read_csv(f"{repo}_issue_details.csv")
        .loc[lambda frame: ~frame["issue_user.login"].isin(BOTS), issue_core_columns]
        .reset_index(drop=True)
    )

In [None]:
df_issues

In [None]:
df_issues["number"][4] == 874

In [None]:
# `x in series` tests the index labels, not the column values, so compare
# against the underlying array to check whether issue 874 is present.
874 in df_issues["number"].astype(int).to_numpy()

In [None]:
# Show how pandas parsed the issue numbers.
number_dtype = df_issues["number"].dtype
print(number_dtype)

# Cast to integers unless the column is already int64.
if number_dtype != "int64":
    df_issues["number"] = df_issues["number"].astype(int)

# Membership test against the values as a plain list; prints True when 874 is present.
contains_874 = 874 in df_issues["number"].tolist()
print(contains_874)

# If the column had been read as strings, stripping whitespace first is an option:
# df_issues["number"] = df_issues["number"].str.strip()


In [None]:
# `x in series` tests the index labels, not the column values, so compare
# against the underlying array to check whether issue 874 is present.
874 in df_issues["number"].to_numpy()

In [None]:
!ls dask_issues/* | wc -l

In [None]:
    import json
    import os
    import pandas as pd

    from tqdm.auto import tqdm


In [None]:
# `x in series` tests the index labels, not the column values, so compare
# against the underlying array to check whether issue 874 is present.
874 in df_issues["number"].to_numpy()

In [None]:
files = sorted(os.listdir(f"{repo}_issues"))

# Build the lookup once instead of converting the column to a list (and
# scanning it linearly) for every file.
known_issue_numbers = set(df_issues["number"].tolist())

# Report every JSON file whose issue number has no row in df_issues.
for file in files:
    # The issue number is read from the last five characters of the file stem.
    # NOTE(review): assumes stems end in a 5-character numeric field — confirm
    # against the naming used when the files were written.
    issue_number_from_json_filename = int(file.split(".")[0][-5:])
    if issue_number_from_json_filename not in known_issue_numbers:
        print(issue_number_from_json_filename)
        print(file)

In [None]:
len(files)

In [None]:
# `x in series` tests the index labels, not the column values, so compare the
# number parsed from the first file name against the underlying array.
int(files[0].split(".")[0][-5:]) in df_issues["number"].to_numpy()

In [None]:
ORG = "dask"
REPO = "dask"
BOTS = ["dependabot[bot]", "GPUtester", "github-actions[bot]"]

In [None]:
org: str = ORG
repo: str = REPO

In [None]:
    import os
    import requests
    import json

    from tqdm.auto import tqdm

    output_folder = f"{repo}_issues"
    os.makedirs(output_folder, exist_ok=True)

    headers = {"Authorization": f"token {os.environ['GITHUB_API_TOKEN']}"}

    issues = []
    page = 1

In [None]:
issues_url = f"https://api.github.com/repos/{org}/{repo}/issues?state=open&per_page=100&page={page}"

In [None]:
        # Fetch one page of results; the timeout keeps the cell from hanging on a
        # stalled connection.
        response = requests.get(issues_url, headers=headers, timeout=30)
        # Fail loudly on HTTP errors: without this, an error payload (a dict) would
        # be iterated below and its keys would be mistaken for issues.
        response.raise_for_status()
        page_issues = response.json()
        # Keep only entries that do not carry a "pull_request" key.
        only_issues = [issue for issue in page_issues if "pull_request" not in issue]

In [None]:
only_issues

In [None]:
for issue in only_issues:
    print(issue["user"]["login"])
    print()

In [None]:
    # Normalize every saved issue JSON file and keep each resulting frame; the
    # original loop overwrote `_df` on every iteration, so only the last file survived.
    issue_frames = []
    for file in tqdm(sorted(os.listdir(f"{repo}_issues")), "concatenating issues"):
        with open(f"{repo}_issues/{file}", "r") as f:
            data = json.load(f)
        _df = pd.json_normalize(data)
        issue_frames.append(_df)

    # One row-wise concatenation at the end; an empty folder yields an empty frame
    # because pd.concat raises on an empty list.
    df_all_issues = (
        pd.concat(issue_frames, ignore_index=True) if issue_frames else pd.DataFrame()
    )

In [None]:
!rm -rf ./milvus_demo.db

In [1]:
from pymilvus import MilvusClient
import numpy as np

# Open the local Milvus database file and create the demo collection.
client = MilvusClient("./milvus_demo.db")
client.create_collection(
    collection_name="demo_collection",
    dimension=768  # The embedding vectors used in this demo have 768 dimensions
)

In [2]:
from pymilvus import model

# If connection to https://huggingface.co/ failed, uncomment the following path
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()

# Text strings to search from.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

# Each entity has id, vector representation, raw text, and a subject label that we use
# to demo metadata filtering later.
data = [
    {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))



Dim: 768 (768,)
Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


In [8]:
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If you don't have the embedding function you can use a fake vector to finish the demo:
# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]

res = client.search(
    collection_name="demo_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=2,  # number of returned entities
    output_fields=["text", "subject"],  # specifies fields to be returned
)

print(res)

data: ["[{'id': 2, 'distance': 0.5859944224357605, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.5118255615234375, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]"] , extra_info: {'cost': 0}


In [6]:
np.array(vectors).shape

(3, 768)

In [7]:
res = client.insert(collection_name="demo_collection", data=data)

print(res)

{'insert_count': 3, 'ids': [0, 1, 2], 'cost': 0}


In [None]:
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

In [None]:
len(docs[0])

In [None]:
# Fake embeddings, one per document. The width must match the `dimension=768`
# used when "demo_collection" was created; 384-wide vectors do not match it.
VECTOR_DIM = 768
vectors = [
    [np.random.uniform(-1, 1) for _ in range(VECTOR_DIM)] for _ in range(len(docs))
]

In [None]:
np.array((vectors)).shape

In [None]:
data = [ {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"} for i in range(len(vectors)) ]

In [None]:
data[0]

In [None]:
res = client.insert(
    collection_name="demo_collection",
    data=data
)

In [None]:
res = client.search(
    collection_name="demo_collection",
    data=[vectors[0]],
    filter="subject == 'history'",
    limit=2,
    output_fields=["text", "subject"],
)
print(res)

In [None]:
res = client.query(
    collection_name="demo_collection",
    filter="subject == 'history'",
    output_fields=["text", "subject"],
)

In [None]:
res = client.delete(
    collection_name="demo_collection",
    filter="subject == 'history'",
)

In [None]:
print(res)

In [11]:
import pandas as pd
repo = "dask"
df = pd.read_parquet(f"{repo}_issue_with_comments.parquet")

In [12]:
df

Unnamed: 0,number,title,issue_text,label_names,issue_user.login,author_association,issue_user.login_name,issue_user.login_company,issue_user.login_name_company,issue_user.login_email,...,comment_updated_at,comment_reactions.total_count,comment_reactions.+1,comment_reactions.-1,comment_reactions.laugh,comment_reactions.hooray,comment_reactions.confused,comment_reactions.heart,comment_reactions.rocket,comment_reactions.eyes
0,72,Create multiple output arrays with map_blocks,If a user provides a map function that produce...,['array'],mrocklin,MEMBER,Matthew Rocklin,@coiled,Matthew Rocklin (@coiled ),,...,2019-11-29T08:47:50Z,1,1,0,0,0,0,0,0,0
1,72,Create multiple output arrays with map_blocks,If a user provides a map function that produce...,['array'],mrocklin,MEMBER,Matthew Rocklin,@coiled,Matthew Rocklin (@coiled ),,...,2019-11-29T14:52:21Z,1,1,0,0,0,0,0,0,0
2,97,Idea: deferred errors,"Sometimes, it's nice to have defensive checks ...",['core'],shoyer,MEMBER,Stephan Hoyer,@google,Stephan Hoyer (@google ),shoyer@gmail.com,...,2015-03-27T14:51:53Z,0,0,0,0,0,0,0,0,0
3,97,Idea: deferred errors,"Sometimes, it's nice to have defensive checks ...",['core'],shoyer,MEMBER,Stephan Hoyer,@google,Stephan Hoyer (@google ),shoyer@gmail.com,...,2020-06-18T15:52:15Z,0,0,0,0,0,0,0,0,0
4,97,Idea: deferred errors,"Sometimes, it's nice to have defensive checks ...",['core'],shoyer,MEMBER,Stephan Hoyer,@google,Stephan Hoyer (@google ),shoyer@gmail.com,...,2020-06-18T16:20:34Z,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4569,11155,'SeriesGroupBy' object has no attribute 'nuniq...,Please add 'nunique_approx' feature for 'Serie...,['needs info'],LeilaGold,NONE,LeilaGold,,LeilaGold (None),,...,2024-06-04T10:55:17Z,0,0,0,0,0,0,0,0,0
4570,11155,'SeriesGroupBy' object has no attribute 'nuniq...,Please add 'nunique_approx' feature for 'Serie...,['needs info'],LeilaGold,NONE,LeilaGold,,LeilaGold (None),,...,2024-06-04T11:47:53Z,0,0,0,0,0,0,0,0,0
4571,11155,'SeriesGroupBy' object has no attribute 'nuniq...,Please add 'nunique_approx' feature for 'Serie...,['needs info'],LeilaGold,NONE,LeilaGold,,LeilaGold (None),,...,2024-06-05T15:42:07Z,0,0,0,0,0,0,0,0,0
4572,11160,Can not process datasets created by the older ...,**Describe the issue**:\r\nAfter upgrading the...,['needs triage'],dbalabka,NONE,Dmitry Balabka,Ecentria Group,Dmitry Balabka (Ecentria Group),dmitry.balabka@gmail.com,...,2024-06-04T08:09:31Z,0,0,0,0,0,0,0,0,0


In [14]:
df["label_names"].drop_duplicates()

0                                        ['array']
2                                         ['core']
7                                    ['dataframe']
35                                   ['scheduler']
65                             ['dataframe', 'io']
                           ...                    
4446                           ['dataframe', 'p3']
4456    ['dataframe', 'discussion', 'deprecation']
4487      ['dataframe', 'discussion', 'dask-expr']
4502               ['dataframe', 'convert-string']
4506                           ['dataframe', 'p2']
Name: label_names, Length: 183, dtype: object

In [20]:
df["title"][0]

'Create multiple output arrays with map_blocks'

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from openai import OpenAI
import os
import geopandas as gpd
from langchain_experimental.agents.agent_toolkits import (
    create_pandas_dataframe_agent,
)
from langchain_openai import OpenAI as OpenAI_langchain


# OpenAI API client; the key is read from the OPENAI_API_KEY environment variable.
# NOTE(review): this rebinds `client`, which an earlier cell bound to the
# MilvusClient instance — Milvus calls run after this cell will hit this object.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [18]:
def chat_response(content, model="gpt-3.5-turbo"):
    """Send one user message to the chat completions API and return the reply text.

    Uses the module-level ``client`` object.

    Args:
        content: Text sent as the single ``user`` message.
        model: Name of the chat model to query. Defaults to ``"gpt-3.5-turbo"``,
            the value that was previously hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
    )
    return response.choices[0].message.content

In [22]:
chat_response(f"Give me a one word summary of the following GitHub {repo} issue title: {df['title'][0]}")

'Parallelism'

In [23]:
chat_response(f"Give me a three word summary of the following GitHub {repo} issue title: {df['title'][0]}")

'Multiple output arrays'