# Langchain with Chroma

In [None]:
# import gamelist.xml and convert it to a dict

import os
import json

import xmltodict

# Read the game list and parse it into nested dicts.
# - The `with` block closes the file handle as soon as parsing is done.
# - force_list keeps the_dict['gameList']['game'] a list even when the file
#   contains a single <game> element (xmltodict would otherwise return a bare
#   dict, making the count below and later iteration wrong).
with open('data/arcade/gamelist.xml', encoding="utf-8") as gamelist_file:
    the_dict = xmltodict.parse(gamelist_file.read(), force_list=('game',))

# Number of <game> entries that were parsed.
print(len(the_dict['gameList']['game']))


In [None]:
# define a function to embed the game descriptions into a vector store

from ai_layer.vector import MakeCollection, DeleteCollection, VectorStore  

from langchain.docstore.document import Document

import os
import uuid
import datetime

def _game_metadata(game):
    """Return the cleaned metadata dict for one game entry.

    Missing or ``None`` fields are replaced with defaults ("1" for players,
    "Unknown" for publisher/developer/genre, 1970-12-31 for the release
    date). The entry itself is not modified.
    """
    def value_or(key, default):
        value = game.get(key)
        return default if value is None else value

    # Multi-character player values are reduced to their last character.
    players = value_or('players', "1")
    if len(players) > 1:
        players = players[-1]

    # For a comma-separated genre list keep only the last entry.
    genre = value_or('genre', "Unknown")
    if ',' in genre:
        genre = genre.split(',')[-1]

    # Only the leading YYYYMMDD part of the release date is used. A value
    # that cannot be parsed falls back to the same default as a missing one,
    # so a single bad entry does not abort the whole index build.
    raw_date = value_or('releasedate', "19701231T000000")
    try:
        release_date = datetime.datetime.strptime(raw_date[:8], "%Y%m%d").date()
    except (TypeError, ValueError):
        release_date = datetime.date(1970, 12, 31)

    return {
        'name': game['name'],
        'genre': genre,
        'players': players,
        'releasedate': str(release_date),
        'publisher': value_or('publisher', "Unknown"),
        'developer': value_or('developer', "Unknown"),
    }


def BuildIndexFromCSV(the_dict):
    """Rebuild the "gdata" collection from the parsed game list.

    Despite the name, the input is not CSV: ``the_dict`` is the nested dict
    whose ``the_dict['gameList']['game']`` holds the game entries. Entries
    without a non-empty ``desc`` and ``name`` are skipped. Each remaining
    description is added to the vector store, with the cleaned game fields
    as metadata, in batches of 100.

    Any existing "gdata" collection is deleted and recreated first.
    Returns None.
    """
    games = the_dict['gameList']['game']
    # xmltodict yields a bare dict (not a list) when there is only one <game>.
    if isinstance(games, dict):
        games = [games]

    # Index only games that have a usable description and name; a key that is
    # present but None/empty cannot be embedded or stored as metadata.
    document_list = [
        Document(page_content=game['desc'], metadata=_game_metadata(game))
        for game in games
        if game.get('desc') and game.get('name')
    ]

    # Best-effort removal of a previous collection. Catch Exception rather
    # than using a bare except so Ctrl-C / SystemExit still propagate, and
    # show the reason so a real failure is not mistaken for "does not exist".
    try:
        DeleteCollection("gdata")
        print("Deleted existing collection gdata ")
    except Exception as exc:
        print(f"Collection gdata does not exist, creating new one ({exc})")

    MakeCollection("gdata")

    vectorstore = VectorStore(collection_name="gdata")

    # Embed and insert the documents in fixed-size batches.
    batch_limit = 100
    for start in range(0, len(document_list), batch_limit):
        batch = document_list[start:start + batch_limit]
        vectorstore.add_texts(
            [doc.page_content for doc in batch],
            [doc.metadata for doc in batch],
        )

In [None]:
# Build (or rebuild) the index from the parsed game list.
# Note: despite the "FromCSV" in its name, the function takes the dict that
# was parsed from gamelist.xml.

BuildIndexFromCSV(the_dict)

In [None]:
# Now we can use the vector store to perform a similarity search

from ai_layer.vector import VectorStore
vector_store = VectorStore(collection_name="gdata")

# Ask the store for the 20 entries closest to the query text, each paired with
# its score. No metadata filter is applied here.
results = vector_store.similarity_search_with_score("golf games", k=20)

# Keep only the results whose score is at most 0.90.
# NOTE(review): the threshold keeps *low* scores, so the score is presumably a
# distance (lower = closer) rather than a similarity percentage - confirm
# against the VectorStore implementation.
good_matches = [
    (doc.page_content, doc.metadata)
    for doc, score in results
    if score <= 0.90
]

print(f"Found {len(good_matches)} high quality matches ...\n")
for content, metadata in good_matches:
    print(f"NAME: {metadata['name']}\nDESCRIPTION: {content}")
    print("\n")

# Semantic Kernel with Chroma

In [None]:
# initialize a new kernel

import semantic_kernel as sk
import os

from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding

kernel = sk.Kernel()

# The OpenAI key must be present in the environment; a missing key raises
# KeyError here rather than failing later on the first request.
api_key = os.environ['OPENAI_API_KEY']

embedding_function = OpenAITextEmbedding(
    ai_model_id="text-embedding-3-small",
    api_key=api_key,
)

# Note: not all models support specifying the dimensions or there may be constraints on the dimensions.
# text-embedding-3-small produces at most 1536 dimensions (3072 is the size of
# text-embedding-3-large), so the requested size must not exceed 1536.
request_settings = OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)

In [None]:
# define a data model for the game data
from uuid import uuid4

from dataclasses import dataclass, field
from typing import Annotated
from semantic_kernel.data import (
    DistanceFunction,
    IndexKind,
    VectorStoreRecordDataField,
    VectorStoreRecordDefinition,
    VectorStoreRecordKeyField,
    VectorStoreRecordVectorField,
    vectorstoremodel,
)

@vectorstoremodel
@dataclass
class Game:
    """Data model for one game record stored in the vector store.

    Each field's ``Annotated`` metadata declares its role in the store: a
    data field, the vector field, or the key field. All fields except
    ``game_id`` are required when constructing a record.
    """
    # Descriptive fields, each declared as a filterable data field.
    game_name: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    game_genre: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    game_players: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    game_releasedate: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    game_publisher: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    game_developer: Annotated[str, VectorStoreRecordDataField(is_filterable=True)]
    # Game description text, declared as full-text searchable.
    description: Annotated[str, VectorStoreRecordDataField(is_full_text_searchable=True)]
    # Vector field; the name suggests it holds the embedding of `description`.
    description_embedding: Annotated[list[float], VectorStoreRecordVectorField()]
    # Record key; defaults to a freshly generated random UUID string.
    game_id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
    #tags: Annotated[list[str], VectorStoreRecordDataField(is_filterable=True)]

In [None]:
# connect to the vector store and collection
from semantic_kernel.connectors.memory.chroma import ChromaStore

# Create the ChromaStore vector store object, passing "chroma-sk" as its
# persist directory.
# NOTE(review): presumably this makes Chroma keep its data on disk in that
# directory instead of in memory - confirm against the ChromaStore docs.
vector_store = ChromaStore(persist_directory="chroma-sk")

# Get the "gdata" collection from the store; records in it are read and
# written using the Game data model.
collection = vector_store.get_collection(collection_name="gdata", data_model_type=Game)

In [None]:
# define an embedding function to embed the game descriptions into a vector store
async def GenerateEmbeddingAsync(textToVectorize: str) -> list[float]:
    """Return the embedding vector for a single piece of text.

    Args:
        textToVectorize: The text to embed.

    Returns:
        The embedding as a flat list of floats, or None when the embedding
        service returns no result.
    """
    # generate_embeddings takes a list of texts and returns one vector per
    # text, so wrap the single string and take the first (only) row. Returning
    # the whole result would yield a nested list instead of list[float].
    embeddings = await embedding_function.generate_embeddings([textToVectorize])
    if embeddings is None or len(embeddings) == 0:
        return None
    return embeddings[0].tolist()

In [None]:
# define a function to embed the game descriptions into a vector store

from uuid import uuid4
import datetime

def _game_fields(game):
    """Return the cleaned field values for one game entry.

    Missing or ``None`` fields are replaced with defaults ("1" for players,
    "Unknown" for publisher/developer/genre, 1970-12-31 for the release
    date). The entry itself is not modified.
    """
    def value_or(key, default):
        value = game.get(key)
        return default if value is None else value

    # Multi-character player values are reduced to their last character.
    players = value_or('players', "1")
    if len(players) > 1:
        players = players[-1]

    # For a comma-separated genre list keep only the last entry.
    genre = value_or('genre', "Unknown")
    if ',' in genre:
        genre = genre.split(',')[-1]

    # Only the leading YYYYMMDD part of the release date is used. A value
    # that cannot be parsed falls back to the same default as a missing one,
    # so a single bad entry does not abort the whole index build.
    raw_date = value_or('releasedate', "19701231T000000")
    try:
        release_date = datetime.datetime.strptime(raw_date[:8], "%Y%m%d").date()
    except (TypeError, ValueError):
        release_date = datetime.date(1970, 12, 31)

    return {
        'game_name': game['name'],
        'game_genre': genre,
        'game_players': players,
        'game_publisher': value_or('publisher', "Unknown"),
        'game_developer': value_or('developer', "Unknown"),
        'game_releasedate': str(release_date),
    }


async def BuildIndexSK(the_dict):
    """Embed every game description and upsert it into ``collection``.

    ``the_dict`` is the nested dict whose ``the_dict['gameList']['game']``
    holds the game entries. Entries without a non-empty ``desc`` and ``name``
    are skipped. One embedding request and one upsert are awaited per game,
    and a "." is printed for each game as progress output.

    Every record gets a new random id, so running this again adds new records
    rather than updating the ones written by a previous run.
    """
    # Create the collection if it doesn't exist yet.
    await collection.create_collection_if_not_exists()

    games = the_dict['gameList']['game']
    # xmltodict yields a bare dict (not a list) when there is only one <game>.
    if isinstance(games, dict):
        games = [games]

    for game in games:
        # Index only games that have a usable description and name.
        if not game.get('desc') or not game.get('name'):
            continue

        print(".", end="", flush=True)

        embedding = await GenerateEmbeddingAsync(game['desc'])
        await collection.upsert(Game(
            game_id=str(uuid4()),
            description=game['desc'],
            description_embedding=embedding,
            **_game_fields(game),
        ))

In [None]:
# import gamelist.xml and convert it to a dict

import os
import json

import xmltodict

# Read the game list and parse it into nested dicts.
# - The `with` block closes the file handle as soon as parsing is done.
# - force_list keeps the_dict['gameList']['game'] a list even when the file
#   contains a single <game> element (xmltodict would otherwise return a bare
#   dict, making the count below and later iteration wrong).
with open('data/arcade/gamelist.xml', encoding="utf-8") as gamelist_file:
    the_dict = xmltodict.parse(gamelist_file.read(), force_list=('game',))

# Number of <game> entries that were parsed.
print(len(the_dict['gameList']['game']))


In [None]:
# Index every game into the Semantic Kernel collection. This may take some
# time: BuildIndexSK awaits one embedding request per indexed game.
# The top-level `await` works in a notebook cell, not in a plain script.
await BuildIndexSK(the_dict)