In [12]:
# this version is only compatible with tag v0.0.6
!git clone https://github.com/atlantis-nova/simtag.git

Cloning into 'simtag'...


In [None]:
import warnings

# move directory to the root of this repo
# os.chdir('\\'.join(os.getcwd().split('\\')[:-3]))
warnings.simplefilter("ignore")

import pandas as pd
from simtag.simtag.filter import simtag_filter
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from collections import Counter
from sentence_transformers.util import cos_sim
import textwrap

In [46]:
# import raw data
df = pd.read_parquet('simtag/notebooks/steam-games/games.parquet').dropna()
df['Tags'] = df['Tags'].apply(lambda x : x.split(','))
df['Genres'] = df['Genres'].apply(lambda x : x.split(','))
df = df.drop(['game_vector', 'game_indices', 'Score', 'Recommendations'], axis=1)

# extract raw lists
sample_list = df['Tags'].values.tolist()

In [47]:
# we instatiate the model first, so we can assign it to multiple copies of the engine
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')

# initiate engine
engine = simtag_filter(
	model=model
)

In [48]:
M, valid_tags, pointers = engine.compute_optimal_M(sample_list)
engine.load_M(M, pointers, covariate_transformation='dot_product')
engine.index_tags = engine.compute_index(data=M, k=len(engine.tag_list))
engine.index_tags

100%|██████████| 446/446 [00:10<00:00, 44.22it/s]


### covariate tagging

In [None]:
def covariate_tagging(text, top_tags=100, min_window=2, max_window=7, window_length=5, mc1=1000, top_mc1=20, score_threshold=0.01, verbose=False):

	print(textwrap.fill(text, width=120))

	#
	base_vector = engine.model.encode(text)

	# find the top tags
	distances, indices = engine.index_tags.kneighbors([base_vector])

	top_tags = [engine.tag_list[x] for x in indices[0]]

	# scrolling window
	mc = list()
	min_window, max_window = window_length, window_length
	for j in range(min_window, max_window+1):
		for k in range(int(len(top_tags)/j)+1):
			ranges = [k*j, (k+1)*j]
			choice = top_tags[ranges[0]:ranges[1]]
			if len(choice) > 0:
				mc.append(choice)
	mc_encoded = engine.encode_samples(mc, quantize_samples=False, show_progress=True)
	nbrs_mc = NearestNeighbors(n_neighbors=top_mc1, metric='cosine').fit(mc_encoded)
	distances, indices = nbrs_mc.kneighbors([engine.model.encode(text)])

	results = [mc[x] for x in indices[0]]
	top_tags = [x for xs in results for x in xs] # in case we want to pass it to the next monte-carlo

	tag_freq = Counter(tag for game_tags in results for tag in game_tags)
	tag_freq = dict(tag_freq)

	top_m = sorted([[tag_freq[key], key] for index, key in enumerate(tag_freq)])[::-1]
	top_m = [x[1] for x in top_m]

	accepted = list()
	#
	best_selection_score = 0
	for m in range(len(top_m)):
		best_tag = ''
		score = 0
		for k in top_m:
			if k not in accepted:
				iteration = accepted + [k]
				vector_iteration = engine.encode_samples([iteration], quantize_samples=False, show_progress=False)[0]
				base_vector = base_vector.astype(vector_iteration.dtype)  # cast m2 to match m1's dtype
				vector_score = cos_sim(base_vector, vector_iteration).tolist()[0][0]
				if vector_score > score:
					score = vector_score
					best_tag = k
				else:
					pass
		sequence_score = score
		accepted.append(best_tag)

		if sequence_score > best_selection_score and m > 0 and ((sequence_score-best_selection_score)/best_selection_score) > score_threshold:
			best_selection_score = sequence_score
		elif m == 0:
			best_selection_score = sequence_score
		else:
			break
			# pass

	return accepted, mc

accepted, mc = covariate_tagging(df.iloc[3]['About the game'], top_tags=100, window_length=5, score_threshold=0.01, verbose=True)
accepted

Astro Tripper is a furious shoot-em-up experience inspired by the painful, knuckle busting video games of years gone by.
An evolution of PomPom's 2001 award winning game, Space Tripper. Travel the surface of large horizontally-scrolling
platforms, and pit your highly maneuverable craft against hoards of enemies eager to blow you to bits, eat you or worse.
Gameplay is simple. You are free to travel anywhere on the platform at anytime, but don’t fall off! Powerful Blue Lasers
and Red Spread Lasers come equipped, so use both weapons tactically to get through tricky situations. Power up your
weapons with power crystals dropped by destroyed enemies. Oh, did we mention the blasting absolutely anything that
moves? Enemies encountered are varied. Zippy UFOS. Fat Motherships. Tanks. Choppers. Slimy Slug creatures. Alien
Insects. Aztec structures brimming with electricity. And of course, Big Bosses! 14 unique levels spread over 4 diverse
worlds means there’s always something new just around the 

processing samples: 100%|██████████| 90/90 [00:00<00:00, 750.08it/s]


['Spaceships',
 'Tactical RPG',
 'Card Battler',
 'Top-Down Shooter',
 'Aliens',
 'Turn-Based Strategy',
 'Open World Survival Craft',
 'Jump Scare']

### comparison with bart-large-mnli

In [None]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
sequence_to_classify = df.iloc[3]['About the game']
candidate_labels = engine.tag_list
classifier(sequence_to_classify, candidate_labels)

{'sequence': "'MazM: Jekyll and Hyde' is a darkly entertaining adventure game based on the classic 1886 novel 'The Strange Case of Dr. Jekyll and Mr. Hyde' by Robert Louis Stevenson, in which you'll tackle the mystery from a totally new angle! You'll travel back to 19th century London and view the city through the eyes of Mr. Utterson, a lawyer that walks the true path hunting for clues to solve a disturbing mystery, and Mr. Hyde, who has been pushed to his physical limits. Wander the streets of this psychological thriller and prepare for an ending you would never expect! The version of London presented in 'MazM: Jekyll and Hyde' has a dark, heavy atmosphere, creating a sense of the eerie and macabre. The stunning artwork and unsettling music help to further intensify the disturbing nature of the game. Travel throughout London searching for clues, and allow the world of this classic novel to envelop you as you experience the tale of one man's many challenges and potential downfall!",
 

### comparison with zero-shot-implicit-bi-encoder

In [None]:
from sentence_transformers import SentenceTransformer, util as sbert_util

model = SentenceTransformer(model_name_or_path='claritylab/zero-shot-implicit-bi-encoder')

In [None]:
text = df.iloc[3]['About the game']
labels = engine.tag_list
aspect = 'best tag:' # classification
aspect_sep_token = model.tokenizer.additional_special_tokens[0]
text = f'{aspect} {aspect_sep_token} {text}'

text_embed = model.encode(text)
label_embeds = model.encode(labels)
scores = [round(sbert_util.cos_sim(text_embed, lb_embed).item(), 3) for lb_embed in label_embeds]
sorted(list(zip(scores, labels)))[::-1]

[(0.871, 'Feature Film'),
 (0.851, 'Thriller'),
 (0.845, 'Documentary'),
 (0.832, 'Dark Comedy'),
 (0.813, 'Action RPG'),
 (0.791, 'Cinematic'),
 (0.774, "1990's"),
 (0.731, 'Action Roguelike'),
 (0.706, 'RPG'),
 (0.696, 'On-Rails Shooter'),
 (0.695, 'Based On A Novel'),
 (0.691, 'Drama'),
 (0.629, 'Movie'),
 (0.614, 'Strategy RPG'),
 (0.609, 'Remake'),
 (0.596, 'Sci-fi'),
 (0.568, 'Action-Adventure'),
 (0.554, 'Party-Based RPG'),
 (0.531, 'Tactical RPG'),
 (0.525, 'Video Production'),
 (0.498, 'MMORPG'),
 (0.492, 'RPGMaker'),
 (0.477, 'Short'),
 (0.477, 'Sequel'),
 (0.448, 'Epic'),
 (0.443, 'Dungeon Crawler'),
 (0.44, 'Character Action Game'),
 (0.427, 'Anime'),
 (0.417, 'Fox'),
 (0.403, 'VR Only'),
 (0.391, 'Cult Classic'),
 (0.39, 'Interactive Fiction'),
 (0.385, 'Immersive Sim'),
 (0.384, 'Comedy'),
 (0.348, 'Mystery Dungeon'),
 (0.336, 'Episodic'),
 (0.333, 'Visual Novel'),
 (0.326, '360 Video'),
 (0.323, 'Indie'),
 (0.313, 'Arcade'),
 (0.313, '3D Fighter'),
 (0.309, 'Cartoon'),
 