In [1]:
import pandas as pd
import tiktoken

df = pd.read_csv('/Users/arad/repos/pp_kaggle_query/csvs/DatasetVersions.csv')
df = df.loc[~df['Description'].isna(),]
df['CreationDate'] = pd.to_datetime(df['CreationDate'])
df.sort_values(by = 'CreationDate',inplace = True,ascending = False)
df.drop_duplicates(subset = 'DatasetId',keep = 'first',inplace = True)
keep_columns = ['DatasetId','Description','Title','Subtitle']
df = df[keep_columns]
df['dataset_description'] = df.apply(lambda row: f"{row['Description']} {row['Title']} {row['Subtitle']}", axis=1)
df = df[['DatasetId','dataset_description']]

In [5]:
# remove descriptions that are too long
embedding_encoding = "cl100k_base"
max_tokens = 8191 # technically max_tokens is 8191?
encoding = tiktoken.get_encoding(embedding_encoding)
# if too long remove description - we know we have a few VERY long ones (unlikely to be in this sample)
df["n_tokens"] = df['dataset_description'].apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

In [14]:
test_df = df.iloc[0:200]

In [15]:
# import a embedding function that we already wrote
from get_embedding import get_embedding
import time

# and go line by line in the dataframe, and embed each description

start_time = time.time()

test_df['embedding'] = test_df['dataset_description'].apply(get_embedding)
# df['embedding'] = df['dataset_description'].apply(lambda x: get_embedding(x))

end_time = time.time()

print(end_time - start_time)

# discuss how to save the embeddings

42.015986919403076


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['embedding'] = test_df['dataset_description'].apply(get_embedding)


In [17]:
test_df.head()

Unnamed: 0,DatasetId,dataset_description,n_tokens,embedding
1094215,3639274,**✍️ Personal Note**:\nWhile I'm presenting th...,391,"[-0.03541674464941025, -0.03799552097916603, 0..."
1094214,3639234,**✍️ Personal Note from the Creator**:\nI've ...,413,"[-0.021257318556308746, -0.03399205952882767, ..."
1094191,3607951,### **About:**\nThis dataset encompasses the h...,492,"[-0.008166717365384102, -0.004325845278799534,..."
1094190,3625760,"This dataset offers detailed, up-to-date infor...",310,"[-0.03464054688811302, 0.0061747427098453045, ..."
1094189,3636497,This dataset provides comprehensive and up-to-...,351,"[-0.08187591284513474, 0.002666078507900238, 0..."


In [1]:
import chromadb

client = chromadb.PersistentClient(path="/Users/arad/repos/pp_kaggle_query/db")


In [3]:

help(client.get_or_create_collection)

Help on method get_or_create_collection in module chromadb.api.client:

get_or_create_collection(name: str, metadata: Optional[Dict[str, Any]] = None, embedding_function: Optional[chromadb.api.types.EmbeddingFunction[Union[List[str], List[numpy.ndarray[Any, numpy.dtype[Union[numpy.uint64, numpy.int64, numpy.float64]]]]]]] = <chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 object at 0x105da0c50>, data_loader: Optional[chromadb.api.types.DataLoader[List[Optional[numpy.ndarray[Any, numpy.dtype[Union[numpy.uint64, numpy.int64, numpy.float64]]]]]]] = None) -> chromadb.api.models.Collection.Collection method of chromadb.api.client.Client instance
    Get or create a collection with the given name and metadata.
    Args:
        name: The name of the collection to get or create
        metadata: Optional metadata to associate with the collection. If
        the collection alredy exists, the metadata will be updated if
        provided and not None. If the collection does not exist, the
  

In [16]:
# how do we store embeddings??

# simplest thing to do is just save them to disc as a CSV file
# then read them in whenever we need them

# slightly less simple, but becoming best practice, is to use vector DB
# {"vector_id":1234
#  "vector": [.01,.4,.59,....]
#  "metadata": {"DatasetId":DatasetId}
#  }
# 
import chromadb

client = chromadb.PersistentClient(path="/Users/arad/repos/pp_kaggle_query/db")

collection = client.get_collection(name="kaggle")


In [18]:
collection.count()

30000

In [14]:
df = pd.read_csv('/Users/arad/repos/pp_kaggle_query/csvs/kaggle_datasets_for_chroma.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77411 entries, 0 to 77410
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DatasetId               77411 non-null  int64  
 1   CreatorUserId           77411 non-null  int64  
 2   Description             77411 non-null  object 
 3   Title                   77411 non-null  object 
 4   Subtitle                61491 non-null  object 
 5   Slug                    77411 non-null  object 
 6   TotalUncompressedBytes  77410 non-null  float64
 7   Id                      77411 non-null  int64  
 8   TotalVotes              77411 non-null  int64  
 9   TotalDownloads          77411 non-null  int64  
 10  TagDescriptions         68104 non-null  object 
 11  dataset_description     77411 non-null  object 
 12  n_tokens                77411 non-null  int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 7.7+ MB


Unnamed: 0,DatasetId,CreatorUserId,Description,Title,Subtitle,Slug,TotalUncompressedBytes,Id,TotalVotes,TotalDownloads,TagDescriptions,dataset_description,n_tokens


In [17]:
collection.get(ids=['3607951', '3625760', '3636497', '3639234'],include=['embeddings','metadatas'])

{'ids': ['3607951', '3625760', '3636497', '3639234'],
 'embeddings': [[-0.015405388548970222,
   0.00022249127505347133,
   0.03574744611978531,
   -0.011535435914993286,
   0.05561816692352295,
   0.017538823187351227,
   0.0014357276959344745,
   -0.023070871829986572,
   0.001462860731408,
   -0.011126114055514336,
   0.03324190154671669,
   -0.04782864451408386,
   -0.03140615299344063,
   -0.059190429747104645,
   0.06802186369895935,
   0.06077810376882553,
   -0.037856075912714005,
   0.04008873924612999,
   -0.06931184232234955,
   0.05095437541604042,
   0.048424024134874344,
   -0.020143600180745125,
   -0.025105077773332596,
   0.0004922716179862618,
   0.002338096499443054,
   -0.024063168093562126,
   -0.022661549970507622,
   -0.004967679735273123,
   0.004524247720837593,
   -0.025402765721082687,
   0.00569329597055912,
   -0.02008158154785633,
   -0.00499868905171752,
   0.018779193982481956,
   -0.01516971830278635,
   -0.014040982350707054,
   -0.01196956541389227,
 

In [28]:
test_df['DatasetId'].astype(str).nunique()

200

In [29]:
collection.add(
    documents= list(test_df['dataset_description']),
    ids= list(test_df['DatasetId'].astype(str)),
    embeddings= list(test_df['embedding'])
)

In [35]:
collection.get(ids=list(test_df['DatasetId'].head().astype(str)),include=['embeddings'])

{'ids': ['3607951', '3625760', '3636497', '3639234', '3639274'],
 'embeddings': [[-0.008166717365384102,
   -0.004325845278799534,
   0.03549288958311081,
   -0.011495672166347504,
   0.04033064842224121,
   0.010465851984918118,
   0.0136032123118639,
   -0.022404586896300316,
   -0.013088301755487919,
   0.006388480309396982,
   0.03776807337999344,
   -0.04885660856962204,
   -0.018955884501338005,
   -0.048281822353601456,
   0.0524011068046093,
   0.037576477974653244,
   -0.042533986270427704,
   0.05091624706983566,
   -0.08621754497289658,
   0.031852591782808304,
   0.044473882764577866,
   -0.026631640270352364,
   -0.028715230524539948,
   -0.0073284911923110485,
   -0.007400339003652334,
   -0.022608155384659767,
   -0.03012824058532715,
   0.024524100124835968,
   0.013124225661158562,
   -0.04562344774603844,
   -0.00904086697846651,
   -0.02275185100734234,
   -0.015159917995333672,
   0.006819568108767271,
   -0.005622102413326502,
   -0.0035804228391498327,
   -0.02562

In [None]:
# next steps are 

# tonight - i'll let the embedding model run overnight, to do all of them.
# all embeddings to be generated

# all of them along with the datasetid to be put into the vector database

# then vector database should be deployed - so that it's always available - put it on virtual machine in google cloud

# CLI - use python package click to develop a very minimalistic user experience at the command line

# publish it to PyPi

In [37]:
collection.count()


200

In [4]:
# metadata for one doc
# title, subtitle, size, votes, downloads, file type, download url (kaggle.com/datasets/user-slug/dataset-slug) OR (kaggle.com/datasets/org-slug/dataset-slug).

# one doc
# title
# subtitle
# description
# tags
import pandas as pd
df = pd.read_csv('../csvs/kaggle_datasets_for_chroma.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77411 entries, 0 to 77410
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   DatasetId               77411 non-null  int64  
 1   CreatorUserId           77411 non-null  int64  
 2   Description             77411 non-null  object 
 3   Title                   77411 non-null  object 
 4   Subtitle                61491 non-null  object 
 5   Slug                    77411 non-null  object 
 6   TotalUncompressedBytes  77410 non-null  float64
 7   Id                      77411 non-null  int64  
 8   TotalVotes              77411 non-null  int64  
 9   TotalDownloads          77411 non-null  int64  
 10  TagDescriptions         68104 non-null  object 
 11  dataset_description     77411 non-null  object 
 12  n_tokens                77411 non-null  int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 7.7+ MB


In [5]:
# DatasetId,CreatorUserId,Description,Title,Subtitle,Slug,TotalUncompressedBytes,Id,TotalVotes,TotalDownloads,TagDescriptions,dataset_description,n_tokens
df = pd.read_csv('../csvs/kaggle_datasets_for_chroma.csv')
metadata = ['Title','Subtitle','TotalVotes','TotalDownloads','TotalUncompressedBytes']
vector_metadata = df[metadata].to_dict(orient='records')

In [6]:
vector_metadata

[{'Title': 'Agricultural Commodities Futures Data',
  'Subtitle': 'Cocoa, Coffee, Cotton, Lumber, Orange Juice & Sugar Futures Data (Yahoo Finance)',
  'TotalVotes': 13,
  'TotalDownloads': 455,
  'TotalUncompressedBytes': 0.0},
 {'Title': 'Animal Products Futures Historical Data',
  'Subtitle': 'Lean Hogs and Live Cattle Futures Data from Yahoo Finance.',
  'TotalVotes': 5,
  'TotalDownloads': 91,
  'TotalUncompressedBytes': 0.0},
 {'Title': 'Global Stock Indices Historical Data',
  'Subtitle': 'Daily Updated Historical OHLC Data from Major Stock Indices Around the World.',
  'TotalVotes': 36,
  'TotalDownloads': 867,
  'TotalUncompressedBytes': 0.0},
 {'Title': 'Gold, Silver & Precious Metals Futures Daily Data',
  'Subtitle': 'Historical data on precious metals like Gold, Silver, Palladium, and more.',
  'TotalVotes': 40,
  'TotalDownloads': 1397,
  'TotalUncompressedBytes': 0.0},
 {'Title': 'Oil, Gas & Other Fuels Futures Data',
  'Subtitle': 'Historical data on Fuels and Energy Fu