In [1]:
import chromadb

client = chromadb.PersistentClient(path="/Users/arad/repos/pp_kaggle_query/db")
collection = client.get_collection(name="kaggle")
collection.count()

30000

In [None]:
client.list_collections()

In [2]:
user_query_collection = client.get_collection(name="user")

30000

In [4]:
user_query_collection.count()

4

In [7]:
import openai
import chromadb.utils.embedding_functions as embedding_functions
from dotenv import load_dotenv,find_dotenv
import os

_ = load_dotenv(find_dotenv())
api_key = os.getenv("OPENAI_API_KEY")

openai.api_key = api_key

# from oai via chroma
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=api_key,
                model_name="text-embedding-3-small"
            )

In [9]:
# create user query collection
user_query_collection = client.get_or_create_collection(name="user",
    embedding_function=openai_ef, metadata={"hnsw:space": "cosine"})

In [5]:
# import uuid

# # generate a random uuid
# random_uuid = uuid.uuid4().int & (1<<32)-1

# # convert the uuid to a 32-bit integer
# random_id = random_uuid.int & (1<<32)-1
# print(random_uuid)
# print(random_id)

b3107436-5925-425a-9195-3e335513c7c1
1427359681


In [33]:
import pandas as pd

# collection.get(ids=['3607951', '3625760', '3636497', '3639234'],include=['embeddings','metadatas'])

# we need to make another "collection" for user queries
# when a user writes a query
## we add it to the user query collection (this will automatically embed it)
## we get the user query embedding out of that collection
## and pass it to the query function on the dataset description collection.


# request = "I am looking for data on Indian crop yields"

# creating a query_id
# embedding/vectorizing the user query
# finding similar datasets
# replying to the user is a third thing

def assign_query_id():
    import uuid
    return uuid.uuid4().int & (1<<32)-1

def embed_user_query(query):
    query_id = assign_query_id()
    user_query_collection.add(ids=[str(query_id)], documents=[query])
    query_embedding = user_query_collection.get(ids=[str(query_id)],include=["embeddings"])['embeddings']
    
    return query_embedding

def find_datasets(query_embedding):

    query_response = \
    collection.query(
    query_embeddings=query_embedding,
    n_results=5
    )

    metadata = query_response['metadatas'][0]
    descriptions = query_response['documents'][0]
    
    return descriptions, metadata

def reply_to_user(metadata):
    
    metadata_df = pd.DataFrame(metadata).sort_values('TotalDownloads',ascending=False).reset_index(drop = True)

    print("here's what I found:\n")
    for index,row in metadata_df.iterrows():    
        print(f"{index + 1}. {row['Title']}{row['Subtitle']} -- total downloads: {row['TotalDownloads']}")

    print("\ninterested in any of these?")


def interface(query):
    embedding = embed_user_query(query)
    _, metadata = find_datasets(embedding)
    reply = reply_to_user(metadata)
    print(reply)
    return reply




In [28]:
nhl_request = """ 
i'm looking for a dataset about players in the national hockey league. specifically i want game-level statistics for a large amount of players over
a long period of time.
"""

nhl_embedding = embed_user_query(nhl_request)
nhl_descriptions, nhl_metadata = find_datasets(nhl_embedding)
nhl_reply = reply_to_user(nhl_metadata)


here's what I found:

1. NHL DataPlayer, team, and shots data from 2008-2023 -- total downloads: 181
2. NHL Player and Team Data 2008/9-2021/22Data originated with moneypuck.com and hockey-reference.com -- total downloads: 165
3. 🏒 NHL Database MoneyPuckContains player data, all match data and shots data since 2007 -- total downloads: 97
4. 🏆 National Hockey League Teams Dataset 🏒Explore an extensive dataset containing information about NHL teams. -- total downloads: 55
5. NHL Game Data 2013-2021Data from all regular season NHL games between 2013/14 - 2021/22 inclusive -- total downloads: 22

interested in any of these?


In [30]:
mlb_request = """ 
i'm looking for a dataset about players in major league baseball. specifically i want game-level statistics for a large amount of players over
a long period of time.
"""

interface(mlb_request)

here's what I found:

1. MLB Hitting and Pitching Stats Through All TimeUncovering History of Baseball: A Comprehensive Dataset of Hittimg and Pitching. -- total downloads: 1534
2. 2023 MLB Player Stats2023 Major League Baseball Player Stats -- total downloads: 1389
3. Baseball Player MetricsAnalyzing Performance Across Games, At-Bats, Runs, Hits, and More -- total downloads: 77
4. Major League Baseball Game LogsHistorical MLB Game Logs and Player Statistics from 1871-2016 -- total downloads: 71
5. MLB Batter Game Logs (All Players, 2023 Season)Why would I want to stand out with a snappy title. If you want this you can use. -- total downloads: 60

interested in any of these?
None


In [32]:
olympics_request = """ 
i'm looking for a dataset about olympic athletes. specifically i want event competition statistics for a large amount of athletes over
a long period of time.
"""

interface(olympics_request)

here's what I found:

1. Olympic Historical Dataset From Olympedia.orgEvent to Athlete level Olympic Games Results from Athens 1896 to Beijing 2022 -- total downloads: 3013
2. Olympics Legacy: 1896-2020A Comprehensive Dataset Spanning 124 Years -- total downloads: 1472
3. Olympics-Dataset"Exploring Olympic Glory: A Comprehensive Dataset Unveiling the Rich History. -- total downloads: 388
4. Olympic Athlete Performance DatasetExploring the Triumphs and Diversity of Athlete Achievements in the Olympic Game -- total downloads: 99
5. Olympics Datasetnan -- total downloads: 44

interested in any of these?
None


In [None]:
# telecoms_request = """ 

# telecommunications usage. The dataset should include the usage date, provider ID, user ID, call duration, data usage, SMS count, service type (voice, data, SMS), location data, user demographics, and billing information. The dataset must be clean, with no missing values, consistent identifiers, accurate usage details, and provided in CSV format with clearly labeled columns and standardized date formats. Accompanying metadata describing data sources, preprocessing steps, and field definitions is required. An estimated delivery timeline and a mechanism for accessing updates, such as an API or regular data dumps, would be appreciated. The data will be used for usage pattern analysis and telecom service optimization, making accuracy and completeness critical.

# """
# test_query = answer_users_request(telecoms_request)
# print(test_query)

In [10]:
# epl_request = """ 
# i'm looking for a dataset about players in the english premier league. specifically i want game-level statistics for a large amount of players. over
# a long period of time.
# """

# test_query = answer_users_request(epl_request)

In [19]:
# v0.0

In [None]:
"""
hey user, here's what i found:

1. Title, Subtitle, summary of description
2. Title, Subtitle, summary of description
3. Title, Subtitle, summary of description

do you want to further explore any of these?

"""