In [1]:
import os
import clip
import json
import torch
import pickle
from PIL import Image
import torch.nn as nn
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [7]:
# Step 1: Load the two encoders used by the rest of the notebook.

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text model (SentenceTransformer)
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Image model (CLIP); `preprocess` is the image transform returned alongside it.
clip_model, preprocess = clip.load('ViT-B/32', device)

In [3]:
# Step 2: Building the corpus
# The encoding is given explicitly so that loading the JSON file does not depend on
# the platform's default text encoding.
with open('ultimate_games_text.json', 'r', encoding='utf-8') as json_file:
    ultimate_games_text = json.load(json_file)

corpus = []  # one text document per game
titles = []  # titles[i] is the title of the game described by corpus[i]

for game_title, game in ultimate_games_text.items():
    titles.append(game_title)
    # Every section of the game entry except 'Similar Games' goes into the document.
    sections = [text for key, text in game.items() if key != 'Similar Games']
    corpus.append(game_title + ':\n\n' + '\n\n'.join(sections))

In [22]:
# Index the picture files: {game name -> list of picture paths}.
pic_path = 'Pictures/'
# One sub-folder per leading character: '#' first, then 'A' through 'Z'.
sub_folders = ['#'] + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

games_with_pictures = {}
for letter in sub_folders:
    folder = pic_path + letter + '/'
    for file in os.listdir(folder):
        # The game name is the part of the file name before the first underscore.
        game_name = file.split('_')[0].strip()
        games_with_pictures.setdefault(game_name, []).append(folder + file)

In [5]:
# Interchangeable spellings of the numbers 1 to 10: digits, English word, roman numeral.
_number_spellings = [
    ('1', 'one', 'i'),
    ('2', 'two', 'ii'),
    ('3', 'three', 'iii'),
    ('4', 'four', 'iv'),
    ('5', 'five', 'v'),
    ('6', 'six', 'vi'),
    ('7', 'seven', 'vii'),
    ('8', 'eight', 'viii'),
    ('9', 'nine', 'ix'),
    ('10', 'ten', 'x'),
]

# Every spelling maps to the list of all spellings of the same number
# (roman numeral, word, digits); each key gets its own list object.
words_subs = {}
for digits, word, roman in _number_spellings:
    for spelling in (digits, word, roman):
        words_subs[spelling] = [roman, word, digits]
import string
alphabet = list(string.ascii_uppercase)
import unicodedata
import re
import copy

In [14]:
def _title_to_words(text):
    """Split a title into lower-case, punctuation-free, ASCII-folded words."""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    text = text.replace('&', 'and')
    words = []
    for raw_word in re.sub('/|_|-|:', ' ', text).split(' '):
        word = raw_word.translate(strip_punctuation).lower().strip()
        word = unicodedata.normalize('NFKD', word).encode('ASCII', 'ignore').decode('utf-8')
        # Filter after ASCII folding so a word made only of non-ASCII characters
        # does not survive as an empty token.
        if word:
            words.append(word)
    return words


def get_best_match(candidates, title, subs=None):
    """Find the candidate name that shares the most words with ``title``.

    Both names are normalised with ``_title_to_words`` (the substring
    'video game' is removed from candidates first). A word of the shorter name
    counts as common when it, or one of its alternative spellings in ``subs``,
    is present in the longer name; each word of the longer name is consumed at
    most once and each word of the shorter name counts at most once.

    Args:
        candidates: sequence of candidate names (strings).
        title: the title to match.
        subs: mapping word -> list of interchangeable spellings. Defaults to
            the module-level ``words_subs``.

    Returns:
        ``(best_index, best_match, best_score)`` where ``best_match`` is
        common words / word count of the longer name and ``best_score`` is
        common words / word count of the shorter name, both for the winning
        candidate. ``(0, 0, 0)`` when nothing matches.
    """
    if subs is None:
        subs = words_subs

    best_match = 0
    best_index = 0
    best_score = 0

    title_words = _title_to_words(title)
    if not title_words:
        # Nothing to compare against; no candidate can score above zero.
        return best_index, best_match, best_score

    for ind, candidate in enumerate(candidates):
        if not isinstance(candidate, str):
            # Best effort: unusable entries are skipped rather than aborting the search.
            continue
        candidate_words = _title_to_words(candidate.replace('video game', ''))
        if not candidate_words:
            # An empty name would otherwise lead to a division by zero below.
            continue

        if len(title_words) < len(candidate_words):
            smaller_title = title_words
            bigger_title = list(candidate_words)
        else:
            smaller_title = candidate_words
            bigger_title = list(title_words)

        nb_common_words = 0
        for word in smaller_title:
            if word in bigger_title:
                nb_common_words += 1
                bigger_title.remove(word)
                continue
            for sub_word in subs.get(word, ()):
                if sub_word in bigger_title:
                    nb_common_words += 1
                    bigger_title.remove(sub_word)
                    # One word may match only once, even if several of its
                    # spellings occur in the other name.
                    break

        max_length = max(len(title_words), len(candidate_words))
        smaller_coverage = nb_common_words / len(smaller_title)
        match_ratio = nb_common_words / max_length

        if match_ratio > best_match:
            best_match = match_ratio
            best_score = smaller_coverage
            best_index = ind

    return best_index, best_match, best_score

In [8]:
# Step 3: Function to encode text and images

# Linear map from 512-d image embeddings to 384-d vectors, so image embeddings can be
# averaged and compared with the text embeddings.
# NOTE(review): this layer is never trained in this notebook, so the projection is an
# arbitrary linear map. It is seeded here so that embeddings pickled in one session stay
# comparable with image queries projected in a later session; without a fixed seed every
# kernel restart would create a different projection.
with torch.random.fork_rng():  # leave the global RNG state untouched
    torch.manual_seed(0)
    projection_layer_image = nn.Linear(512, 384)
# Frozen and in eval mode: the layer is only used for inference, and its outputs should not
# carry autograd history into the stored embeddings.
projection_layer_image = projection_layer_image.to(device).eval().requires_grad_(False)

def encode_text(text):
    """Embed ``text`` with the sentence-transformer text model and return it as a tensor."""
    text_embedding = text_model.encode(text, convert_to_tensor=True)
    return text_embedding

def encode_images(image_paths):
    """Encode every image in ``image_paths`` with the CLIP image encoder.

    Args:
        image_paths: iterable of paths to image files.

    Returns:
        The per-image embeddings stacked along a new first dimension, on the CPU.
        Each image is encoded as a batch of one, so the result presumably has
        shape (n_images, 1, embedding_dim) - TODO confirm against the CLIP model.

    Raises:
        ValueError: if ``image_paths`` is empty.
    """
    image_embeddings = []
    for image_path in image_paths:
        # The context manager closes the file handle once the pixels have been
        # turned into a tensor, instead of leaving one open handle per picture.
        with Image.open(image_path) as image:
            image_input = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_input)
        image_embeddings.append(image_embedding.cpu())
    if not image_embeddings:
        # torch.stack would fail on an empty list with a much less helpful message.
        raise ValueError("encode_images needs at least one image path.")
    return torch.stack(image_embeddings)

In [None]:
# Step 4, Encode everything
# Minimum value of get_best_match's match ratio for a picture folder to be accepted.
MATCH_THRESHOLD = 0.7

all_text_embedding = dict()      # title -> text embedding
all_images_embedding = dict()    # title -> projected mean image embedding
all_combined_embedding = dict()  # title -> average of the two embeddings above
failed_image_encodings = dict()  # title -> repr of the error raised while encoding its images
games_with_pictures_names = sorted(games_with_pictures.keys())

for title, game in tqdm(zip(titles, corpus), total=len(corpus)):
    # Keep stored embeddings on the CPU and free of autograd history, so they can be
    # pickled and reloaded regardless of the device the models ran on.
    text_embedding = encode_text(game).detach().cpu()
    all_text_embedding[title] = text_embedding

    if not games_with_pictures_names:
        # No pictures at all: only text embeddings can be produced.
        continue

    best_index, match_ratio, _ = get_best_match(games_with_pictures_names, title)
    matched_name = games_with_pictures_names[best_index]
    print(title, matched_name)
    if not match_ratio > MATCH_THRESHOLD:
        continue

    try:
        image_embeddings = encode_images(games_with_pictures[matched_name])
        # Average over the images, drop the batch-of-one dimension, then move the vector to
        # the projection layer's device and to float32 before projecting it.
        mean_image_embedding = image_embeddings.mean(dim=0)[0].to(device).float()
        with torch.no_grad():
            projected_embedding = projection_layer_image(mean_image_embedding).cpu()
    except Exception as e:
        # Best effort: a game whose pictures cannot be encoded keeps only its text
        # embedding, but the failure is recorded instead of being dropped silently.
        failed_image_encodings[title] = repr(e)
        continue

    all_images_embedding[title] = projected_embedding
    all_combined_embedding[title] = (text_embedding + projected_embedding) / 2

if failed_image_encodings:
    print(f"Image encoding failed for {len(failed_image_encodings)} games; "
          "inspect failed_image_encodings for details.")

In [19]:
# Step 5: Function to retrieve similar documents based on a query image or text
def retrieve_similar_documents(query, embeddings, num_similar=5):
    """Rank document embeddings by cosine similarity to a text or image query.

    Args:
        query: a ``str`` (embedded with ``encode_text``) or a ``PIL.Image.Image``
            (embedded with the CLIP image encoder, then passed through
            ``projection_layer_image``).
        embeddings: the document embeddings, either a single 2-D tensor/array
            or a list/tuple of 1-D tensors/arrays (one per document).
        num_similar: maximum number of results; capped at the number of documents.

    Returns:
        The ``torch.topk`` result computed on the similarity matrix; its
        ``.values`` and ``.indices`` hold the scores and document positions.

    Raises:
        ValueError: if ``query`` is neither text nor an image, or if
            ``embeddings`` is empty.
    """
    if isinstance(query, str):  # Query is a text
        query_embedding = encode_text(query)
    elif isinstance(query, Image.Image):  # Query is an image
        image_input = preprocess(query).unsqueeze(0).to(device)
        with torch.no_grad():
            # Take the single row of the batch-of-one output. Reducing it with a bare
            # .mean() would collapse the embedding to a scalar that cannot be indexed.
            image_features = clip_model.encode_image(image_input)[0]
            # The projection layer lives on `device` and expects float32 input.
            query_embedding = projection_layer_image(image_features.float())
    else:
        raise ValueError("Query should be either text or image.")

    if len(embeddings) == 0:
        raise ValueError("No document embeddings to search.")

    # Accept a list of per-document vectors as well as a ready-made matrix, and align the
    # matrix with the query's device and dtype so the similarity product is well defined.
    if isinstance(embeddings, (list, tuple)):
        embeddings = torch.stack([torch.as_tensor(emb) for emb in embeddings])
    else:
        embeddings = torch.as_tensor(embeddings)
    embeddings = embeddings.to(device=query_embedding.device, dtype=query_embedding.dtype)

    similarities = util.pytorch_cos_sim(query_embedding, embeddings)

    # Get top similar document indices; topk raises if asked for more items than exist.
    nb_results = min(num_similar, similarities.shape[-1])
    top_results = torch.topk(similarities, nb_results)

    return top_results

In [51]:
# Sizes of the three embedding dictionaries and of the picture index, in that order.
tuple(map(len, (all_text_embedding, all_images_embedding, all_combined_embedding, games_with_pictures)))

(2523, 2126, 2126, 2300)

In [None]:
# Persist the three embedding dictionaries, one pickle file each.
for pkl_name, embedding_dict in (('all_text_emb.pkl', all_text_embedding),
                                 ('all_image_emb.pkl', all_images_embedding),
                                 ('all_combined_emb.pkl', all_combined_embedding)):
    with open(pkl_name, 'wb') as file:
        pickle.dump(embedding_dict, file)

In [None]:
# Loading embeddings
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``.

    Pickle can execute arbitrary code while loading, so only use this on
    files produced by this notebook.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


all_text_embedding = _read_pickle('all_text_emb.pkl')
all_images_embedding = _read_pickle('all_image_emb.pkl')
all_combined_embedding = _read_pickle('all_combined_emb.pkl')

In [18]:
# Display the first stored text embedding as a sanity check.
[*all_text_embedding.values()][0]

tensor([-6.9713e-02,  1.6753e-03, -6.9786e-02, -8.0465e-02,  1.9524e-02,
        -4.4942e-02, -5.2986e-03,  7.7081e-02,  7.7209e-02,  7.5884e-02,
        -2.0209e-02, -6.1343e-02, -1.4746e-02,  7.7292e-02,  3.0814e-02,
        -1.0123e-02,  5.9935e-02, -2.6663e-02, -6.0594e-02,  2.5211e-02,
         6.4717e-02, -1.1894e-02,  5.2146e-02, -3.0130e-02, -7.4924e-02,
         2.8019e-02,  3.4252e-02, -3.2484e-02, -5.9556e-02, -1.3027e-02,
        -3.8493e-03, -6.3570e-03, -1.2164e-02, -5.2543e-03,  1.9540e-02,
        -5.5622e-02,  1.8956e-02, -3.4239e-02, -7.4016e-02, -1.5767e-03,
        -6.9664e-02,  2.4243e-02, -2.2924e-02,  5.8055e-02,  6.1973e-02,
        -4.4439e-02, -6.3255e-02,  1.3631e-04, -4.5610e-02, -7.8014e-03,
        -2.1684e-02, -9.4209e-02, -1.4638e-02,  1.8784e-02,  1.9070e-02,
        -4.8618e-02, -1.8275e-02,  2.3526e-02,  4.0565e-02,  4.1312e-02,
        -1.5753e-02, -8.0723e-02,  1.0350e-01,  1.4397e-02,  6.0781e-02,
         2.0707e-02,  7.5458e-02, -1.9128e-02, -5.4

In [33]:
# Example usage:
# Search by text
query_text = "Forge your own path in this game. An epic action adventure through a vast ruined kingdom of insects and heroes. Explore twisting caverns, battle tainted creatures and befriend bizarre bugs, all in a classic, hand-drawn 2D style. "
# A separate name is used on purpose: `titles` from Step 2 is index-aligned with `corpus`,
# and rebinding it to a sorted list would mislabel every embedding if Step 4 were re-run.
sorted_titles = sorted(all_text_embedding.keys())
# Stack the per-game vectors into one matrix whose row i belongs to sorted_titles[i].
# Staying in torch avoids .numpy(), which fails for tensors that live on a GPU.
embeddings = torch.stack([all_text_embedding[v] for v in sorted_titles])
similar_docs_text = retrieve_similar_documents(query_text, embeddings, num_similar=10)

# Search by image
# query_image = Image.open('images/query_cat.jpg')  # This should be your query image
# similar_docs_image = retrieve_similar_documents(query_image, embeddings, num_similar=3)

# Output the results for both
print("\nTop similar documents based on text query:")
for idx, score in zip(similar_docs_text.indices[0], similar_docs_text.values[0]):
    print(sorted_titles[idx.item()], f"- Document {idx.item() + 1} with similarity score: {score.item():.4f}")

# print("\nTop similar documents based on image query:")
# for idx, score in zip(similar_docs_image.indices[0], similar_docs_image.values[0]):
#     print(f"- Document {idx.item() + 1} with similarity score: {score.item():.4f}")



Top similar documents based on text query:
Demon Turf - Document 478 with similarity score: 0.6054
Metamorphosis - Document 1246 with similarity score: 0.5870
TUNIC - Document 2036 with similarity score: 0.5795
DYSMANTLE - Document 423 with similarity score: 0.5678
Blue Fire - Document 243 with similarity score: 0.5663
BIOMORPH - Document 151 with similarity score: 0.5580
Souldiers - Document 1888 with similarity score: 0.5579
Hollow Knight - Document 939 with similarity score: 0.5464
Gloomhaven - Document 813 with similarity score: 0.5426
Dust An Elysian Tail - Document 599 with similarity score: 0.5422
