In [9]:
import pandas as pd
import numpy as np
import re

# Load the sentiment-scored review citations and drop every row with a missing value.
# Keep only citations with real content: longer than one character and not made up
# solely of brackets, quotes, punctuation or spaces. Repeated review texts are collapsed
# to their first occurrence; `ignore_index=True` already yields a fresh 0..n-1 index,
# so no separate reset_index call is needed.
df = (
    pd.read_parquet('output/sentiments.parquet')
    .dropna()
    .loc[lambda d: ~d['review'].str.match(r'^[()\[\]{}\'!,.: ]+$') & (d['review'].str.len() > 1)]
    .drop_duplicates(subset='review', ignore_index=True)
)

# Encode the sentiment labels on a numeric 1-5 scale.
# Any label that is not a key of this mapping becomes NaN.
df['sentiment'] = df['sentiment'].map({
    'Very Negative': 1,
    'Negative': 2,
    'Neutral': 3,
    'Positive': 4,
    'Very Positive': 5
})

# Categorical dtype keeps the full set of game ids attached to any subset of the frame.
df['game'] = df['game'].astype('category')

df

Unnamed: 0,topic,review,embedding,sentiment,game
0,gameplay mechanics,Excellent game mechanics hampered by really sh...,"[0.02064609, 0.058051355, -0.15955217, -0.0282...",2,1653100
1,gameplay mechanics,I love the narrator just talking in the backgr...,"[-0.01935661, 0.021597555, -0.18828672, -0.039...",4,1653100
2,gameplay mechanics,The game has received frequent and substantial...,"[0.030498626, 0.035088323, -0.17761268, -0.017...",4,1653100
3,gameplay mechanics,Heliotropism is a roguelike Tree simulator wit...,"[-0.005617, 0.06219709, -0.17552276, -0.042904...",4,1653100
4,narrator and audio,"the narrator is cool at first, then hyper anno...","[0.01688186, 0.010326986, -0.16718766, -0.0392...",2,1653100
...,...,...,...,...,...
91564,game mechanics,"The core gameplay is rather simple, but honest...","[0.027327502, 0.0755041, -0.17156118, -0.01077...",4,2300300
91565,music and sound,Хорошее музыкальное и звуковое сопровождение.,"[-0.035658363, -0.0012977426, -0.1295827, 0.02...",4,2300300
91566,music and sound,"I like the music, it's nice relaxing.","[-0.020432916, 0.03288259, -0.18042335, -0.001...",4,2300300
91567,price and value,The asking price might be a bit steep for what...,"[0.012826282, 0.04906256, -0.20521845, 0.02795...",3,2300300


In [10]:
# Load the game list, keep rows that have a SteamID and whose 'Final decision' is 'Yes',
# and store SteamID as a categorical string id (float -> int -> str -> category).
# Built as a single chain so the SteamID conversion produces a new frame instead of being
# assigned onto a boolean-filtered slice, which can trigger SettingWithCopyWarning.
games = (
    pd.read_csv('./data/Game List - Final.csv')
    .dropna(subset=['SteamID'])
    .rename(columns={'List (merge)': 'Game'})
    .loc[lambda g: g['Final decision'] == 'Yes']
    .assign(SteamID=lambda g: g['SteamID'].astype(int).astype(str).astype('category'))
)

# NOTE(review): the join of game names onto `df` is currently disabled, so `games` is not
# consumed by the cells visible here. Re-enable or delete once the decision is made.
#df = games[['Game', 'SteamID']].merge(df, left_on='SteamID', right_on='game', how='inner')
#df

In [11]:
from src.embed import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-row embedding vectors of `df` vertically into one array, in row order,
# so that position i in the array corresponds to the i-th row of `df`.
citation_embeddings = np.vstack(df['embedding'].to_numpy())

def get_similar_citations(query, embeddings, threshold):
    """Return the row positions of the embeddings that are similar enough to a query.

    The query is embedded with `get_embedding` (passed as a one-element list),
    compared against `embeddings` by cosine similarity, and the positions whose
    similarity is greater than or equal to `threshold` are returned as a flat
    integer array.
    """
    query_vector = get_embedding([query])
    scores = cosine_similarity(query_vector, embeddings).ravel()
    return np.flatnonzero(scores >= threshold)

In [12]:
import json

# Read the query definitions inside a context manager so the file handle is closed
# as soon as parsing finishes (a bare open() would leave it to the garbage collector).
# The code below reads this as: category -> list of dicts with 'name' and
# 'similarity_threshold' keys.
with open('data/queries.json') as queries_file:
    queries: dict = json.load(queries_file)

# For every category, map each query name to the rows of `df` whose embedding is at least
# `similarity_threshold` similar to the query. get_similar_citations returns row positions
# within `citation_embeddings`, so rows are picked positionally with .iloc; label-based
# .loc would only be correct while the index happens to be exactly 0..n-1.
results = {
    category: {
        q['name']: df.iloc[get_similar_citations(q['name'], citation_embeddings, q['similarity_threshold'])]
        for q in cat_queries
    }
    for category, cat_queries in queries.items()
}

In [13]:
def get_table(results, agg_fn):
    """Aggregate every query's citations and lay the results out side by side.

    Parameters
    ----------
    results : dict
        Nested mapping ``category -> {query -> citations}``.
    agg_fn : callable
        Called once with each ``citations`` value; must return a pandas Series
        whose index labels identify the rows of the table.

    Returns
    -------
    pandas.DataFrame
        One column per (category, query) pair, in iteration order of `results`,
        under a two-level column MultiIndex named ('category', 'query'). The rows
        are the union of the index labels of all aggregated Series; a cell is NaN
        where a query's Series has no entry for that label.
    """
    level_names = ['category', 'query']
    aggregated = {
        (category, query): agg_fn(citations)
        for category, queries in results.items()
        for query, citations in queries.items()
    }

    # pd.concat refuses an empty input, so return the empty table directly.
    if not aggregated:
        return pd.DataFrame(columns=pd.MultiIndex.from_arrays([[], []], names=level_names))

    # Concatenate all columns in one step. Assigning them one at a time onto an empty
    # frame would pin the row index to the first column's index and silently discard
    # labels that only appear in later columns.
    keys = list(aggregated)
    table = pd.concat([aggregated[key] for key in keys], axis=1, keys=keys, sort=False)
    table.columns = pd.MultiIndex.from_tuples(keys, names=level_names)
    return table

In [14]:
# How many matched citations each game contributes to every query.
get_table(results, lambda citations: citations['game'].value_counts())

category,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Player Experience and Engagement,...,Technical and Performance Aspects,Technical and Performance Aspects,Technical and Performance Aspects,Social and Educational Aspects,Social and Educational Aspects,Social and Educational Aspects,More than human,More than human,More than human,More than human
query,Puzzle,Survival,Sandbox,Building and Crafting,Combat System and Action,Economy and Management,Simulation,Storytelling and Narrative,Achievements and Rewards,Learning curve and Difficulty,...,Controls and Input,Accessibility Features,Navigation and Menu Systems,Multiplayer,Gaming Community Dynamics,Educational Value,Environmentalism and Ecology,Animal,Nature,Apocalypse
game,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1593030,54,0,5,39,1,6,4,0,17,80,...,1,0,1,4,0,2,16,3,10,0
1875060,23,0,1,8,1,3,1,0,11,19,...,2,0,0,2,0,0,0,0,0,0
1372320,12,0,62,4,0,0,0,2,1,13,...,4,0,0,0,0,0,1,2,3,2
2300300,7,0,0,0,0,0,0,0,4,6,...,0,0,0,0,0,0,0,0,1,0
222730,6,0,11,10,0,8,8,3,50,95,...,8,0,0,4,1,0,0,0,1,0
2109270,5,0,1,0,0,0,0,0,2,5,...,0,0,0,0,0,0,0,0,1,0
1062090,4,16,64,539,2,78,32,7,53,320,...,20,4,4,44,7,0,0,14,5,0
1575510,3,0,0,0,0,0,0,0,1,1,...,3,0,0,1,0,0,0,0,0,0
914750,2,0,0,0,4,0,9,7,10,3,...,3,0,0,5,0,3,0,0,0,0
1446000,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Mean sentiment score per game for every query.
# observed=False keeps every category of the categorical 'game' column as a row,
# so games without matching citations show up as NaN.
get_table(results, lambda citations: citations.groupby('game', observed=False)['sentiment'].mean())

category,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Gameplay Mechanics,Player Experience and Engagement,...,Technical and Performance Aspects,Technical and Performance Aspects,Technical and Performance Aspects,Social and Educational Aspects,Social and Educational Aspects,Social and Educational Aspects,More than human,More than human,More than human,More than human
query,Puzzle,Survival,Sandbox,Building and Crafting,Combat System and Action,Economy and Management,Simulation,Storytelling and Narrative,Achievements and Rewards,Learning curve and Difficulty,...,Controls and Input,Accessibility Features,Navigation and Menu Systems,Multiplayer,Gaming Community Dynamics,Educational Value,Environmentalism and Ecology,Animal,Nature,Apocalypse
game,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1028590,,,,,,,4.0,,,,...,,,,4.0,,,,,,
1062090,3.0,3.875,3.84375,4.09833,3.0,3.987179,4.0,2.714286,3.45283,3.49375,...,3.3,2.75,2.25,4.022727,4.571429,,,3.214286,3.0,
1133120,,,4.0,,,,4.5,,,2.5,...,,,2.0,,,,4.0,,3.0,
1346210,,,,,,,,,,3.0,...,,,,,,,,,,
1372320,4.333333,,3.774194,4.25,,,,4.0,3.0,3.461538,...,2.75,,,,,,4.0,4.0,2.333333,3.5
1390190,,,4.4,4.0,3.0,3.666667,5.0,,5.0,3.181818,...,,4.5,4.333333,2.583333,,,5.0,,,
1446000,4.0,,,,,,,,4.0,,...,,,,,,,,,,
1450250,,,,3.0,,,,4.0,,,...,,,,,,,2.0,,,
1539580,,,,,,,3.0,5.0,3.0,2.0,...,2.5,3.0,,3.166667,,,,3.0,3.5,
1575510,3.666667,,,,,,,,3.0,3.0,...,3.666667,,,5.0,,,,,,
