In [1]:
import json

import torch
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores.qdrant import Qdrant
from langchain.vectorstores.faiss import FAISS
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the per-game records: a mapping of game title -> dict of source fields.
# (An earlier variant of this cell read 'games_text.json' into `games_text`.)
with open('ultimate_games.json', 'r', encoding='utf-8') as games_file:
    ultimate_games = json.loads(games_file.read())

In [5]:
# Record fields that are harvested as retrievable text, in output order.
TEXT_FIELDS = (
    'rawg-description', 'steam-description', 'steam-summary', 'steam-tags',
    'igdb-storyline', 'igdb-summary', 'giantbomb-intro', 'giantbomb-description',
    'metacritics-description', 'wikipedia-gameplay', 'wikipedia-summary',
    'wikipedia-plot', 'wikipedia-synopsis',
)
# Values whose len() is not strictly greater than this are skipped.
MIN_TEXT_LEN = 5

# Flatten the per-game records into one entry per (game, field) pair.
rag_list = []
for game_title, game_obj in ultimate_games.items():
    for field in TEXT_FIELDS:
        if field not in game_obj:
            continue
        value = game_obj[field]
        if len(value) <= MIN_TEXT_LEN:
            continue
        rag_list.append({'name': game_title, 'text': value, 'meta': field})

In [6]:
# Write rag_list to 'text_test.json' as JSON.
with open('text_test.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(rag_list))

In [5]:
class CustomE5Embeddings(HuggingFaceEmbeddings):
    """E5 embeddings computed directly with ``transformers``.

    Follows the recipe published on the E5 model card: inputs are prefixed with
    ``"passage: "`` (documents) or ``"query: "`` (queries), token states are
    mean-pooled under the attention mask, and the result is L2-normalized.

    Args:
        model_name: Hugging Face model id to load the tokenizer and model from.
        batch_size: Number of texts tokenized and encoded per forward pass.
    """

    def __init__(self, model_name: str = "intfloat/e5-large-v2", batch_size: int = 32):
        # HuggingFaceEmbeddings is a pydantic model, so its own __init__ has to
        # run before anything is stored on the instance.
        # NOTE(review): the parent __init__ also loads its own copy of the model
        # through sentence-transformers; confirm the extra memory is acceptable.
        super().__init__(model_name=model_name)

        encoder = AutoModel.from_pretrained(model_name)
        encoder.eval()  # inference only: make sure dropout is disabled

        # These are not declared pydantic fields, so a plain `self.x = ...`
        # would be rejected; write to the instance dict directly instead.
        object.__setattr__(self, "tokenizer", AutoTokenizer.from_pretrained(model_name))
        object.__setattr__(self, "model", encoder)
        object.__setattr__(self, "e5_batch_size", max(1, int(batch_size)))

    def _encode(self, texts, prefix):
        """Embed ``texts`` (each prepended with ``prefix``) and return a list of vectors."""
        texts = list(texts)
        vectors = []
        # Encode in batches so a large corpus is never tokenized in one tensor.
        for start in range(0, len(texts), self.e5_batch_size):
            batch = [prefix + text for text in texts[start:start + self.e5_batch_size]]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
            inputs = {key: tensor.to(self.model.device) for key, tensor in inputs.items()}
            with torch.no_grad():  # no autograd graph is needed for inference
                hidden = self.model(**inputs).last_hidden_state
            # Mean-pool over real tokens only; padding positions are zeroed out.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
            vectors.extend(pooled.cpu().numpy().tolist())
        return vectors

    def embed_documents(self, texts):
        """
        Embed a list of documents.

        Returns one vector (list of floats) per input text, in input order;
        an empty input yields an empty list.
        """
        return self._encode(texts, "passage: ")

    def embed_query(self, text):
        """
        Embed a query for retrieval.

        Returns a single vector (list of floats).
        """
        return self._encode([text], "query: ")[0]

In [7]:
# Wrap every flattened record in a Document; the game title and the source
# field name travel along as 'name' and 'page' metadata.
documents = [
    Document(
        page_content=entry.get('text', ''),
        metadata={'name': entry['name'], 'page': entry['meta']},
    )
    for entry in rag_list
]

In [9]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 80,
) -> list[Document]:
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [10]:
# Chunk the whole corpus with the splitter's default settings.
chunked_documents = split_documents(documents)

In [11]:
# Embedding model used for both indexing and querying.
# 'intfloat/e5-large-v2' was tried as an alternative model name here.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [25]:
# --- Alternative backends (disabled) ----------------------------------------
# Kept as comments rather than inside a no-op string literal.
#
# CHROMA_PATH = "chroma-grained"
# db = Chroma.from_documents(
#         documents=chunked_documents,
#         embedding=embedder,
#         persist_directory=CHROMA_PATH
#     )
#
# # Load Chroma
# chroma_db = Chroma(
#     embedding_function=embedder,
#     persist_directory=CHROMA_PATH
# )
#
# QDRANT_PATH = 'qdrant'
# collection_name = 'games'
# qdrant_client = QdrantClient(path=QDRANT_PATH)
# qdrant_client.create_collection(
#     collection_name=collection_name,
#     # size must equal the embedder's output dimension:
#     # 384 for all-MiniLM-L6-v2, 1024 for intfloat/e5-large-v2.
#     vectors_config=VectorParams(size=384, distance=Distance.COSINE),
# )
# db = Qdrant.from_documents(
#     documents=chunked_documents,
#     embedding=embedder,
#     client=qdrant_client,
#     collection_name=collection_name
# )
# ----------------------------------------------------------------------------

# Each on-disk index has its own constant, so FAISS_PATH is no longer
# overwritten right after the save and never points away from what was written.
FAISS_GRAINED_PATH = 'faiss-grained'
FAISS_PATH = 'faiss'

# Build the FAISS index from the chunked documents and persist it.
db = FAISS.from_documents(
    documents=chunked_documents,
    embedding=embedder
)

db.save_local(FAISS_GRAINED_PATH)

# To load the index stored under FAISS_PATH instead of rebuilding, uncomment:
# WARNING: allow_dangerous_deserialization=True unpickles the stored docstore;
# only enable it for index files you created yourself.
# db = FAISS.load_local(FAISS_PATH, embeddings=embedder, allow_dangerous_deserialization=True)


In [32]:
# Free-text description used as the search query.
query_text = "with an emphasis on close combat and exploration in which the player enters the once-prosperous now-bleak insect kingdom of Hallownest, travels through its various districts, meets friendly inhabitants, fights hostile ones and uncovers the kingdom's history while improving their combat abilities and movement arsenal by fighting bosses and accessing out-of-the-way areas."
# Only keep chunks whose 'page' metadata is 'steam-summary'.
page_filter = {'page': {'$in': ['steam-summary']}}

results = db.similarity_search(query=query_text, k=10, fetch_k=100, filter=page_filter)

In [33]:
# Display the documents returned by the similarity search above.
results

[Document(id='e1437bdc-16c2-4c03-94e0-09783f6c6dce', metadata={'name': 'Hollow Knight', 'page': 'steam-summary'}, page_content="at the kingdom's heart.Game FeaturesClassic side-scrolling action, with all the modern trimmings.Tightly tuned 2D controls. Dodge, dash and slash your way through even the most deadly adversaries.Explore a vast interconnected world of forgotten highways, overgrown wilds and ruined cities.Forge your own path! The world of Hallownest is expansive and open. Choose which paths you take, which enemies you face and find your own way forward.Evolve with powerful new skills and abilities! Gain spells, strength and speed. Leap to new heights on ethereal wings. Dash forward in a blazing flash. Blast foes with fiery Soul!Equip Charms! Ancient relics that offer bizarre new powers and abilities. Choose your favourites and make your journey unique!An enormous cast of cute and creepy characters all brought to life with traditional 2D frame-by-frame animation.Over 130 enemies

In [31]:
# Point `db` at the Chroma store kept in the 'chroma' directory.
# Note that this rebinds `db`, replacing whichever store it referred to before.
CHROMA_PATH = "chroma"
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedder)

# Same free-text query, this time excluding chunks that belong to 'Hollow Knight'.
query_text = "with an emphasis on close combat and exploration in which the player enters the once-prosperous now-bleak insect kingdom of Hallownest, travels through its various districts, meets friendly inhabitants, fights hostile ones and uncovers the kingdom's history while improving their combat abilities and movement arsenal by fighting bosses and accessing out-of-the-way areas."
name_filter = {'name': {'$nin': ['Hollow Knight']}}

results = db.similarity_search(query=query_text, k=10, filter=name_filter)

In [32]:
# Display the documents returned by the similarity search above.
results

[Document(metadata={'name': 'Kingdom Two Crowns'}, page_content="Summary: About This Game Build - Explore - Defend - Conquer A shroud of mystery envelops these uncharted medieval lands where ancient monuments, relics and mythical creatures await. Echoes of bygone eras speak of past greatness and in Kingdom Two Crowns, part of the award-winning franchise Kingdom, you embark on an adventure as the Monarch. In this side-scrolling journey atop your steed, you recruit loyal subjects, build your kingdom and protect your crown from the Greed, monstrous creatures looking to steal your kingdom's treasures. Journey alone or together - in Kingdom Two Crowns, a duo of Monarchs can work cooperatively to build and forge a Kingdom that stands the test of time. Lay the foundation of a mighty Kingdom with towering walls and protecting towers while cultivating"),
 Document(metadata={'name': 'Pathfinder - Kingmaker'}, page_content='<p>Conquer new regions as claim them as your own, carving your kingdom fr