In [1]:
import textwrap
import pandas as pd
import numpy as np
import re

# Load the curated game list and keep only rows that have a Steam ID and whose
# 'Final decision' is 'Yes'. Everything is done in one chain so that no column is
# ever assigned on a filtered slice (which triggers SettingWithCopyWarning).
games = (
    pd.read_csv('./data/Game List - Final.csv')
    .dropna(subset=['SteamID'])
    .rename(columns={'List (merge)': 'game'})
    .loc[lambda frame: frame['Final decision'] == 'Yes']
    .assign(
        # Steam IDs are stored as strings (cast through int first).
        SteamID=lambda frame: frame['SteamID'].astype(int).astype(str),
        # Long titles are shortened to at most 20 characters for display.
        game=lambda frame: frame['game'].apply(
            lambda title: textwrap.shorten(title, width=20, placeholder='...')
        ),
    )
)

games['game']

0                   Agonik
2                    APICO
3                 Ardarium
4            Bee Simulator
5                Big Earth
6               Biomisland
7                     Bird
8            Cambrian Dawn
10           Cloud Gardens
11              Coral Cove
12                Crab God
14                     Eco
15               Ecosystem
16                 Endling
17               Equilinox
18                Fishy 3D
19           Firestarter 2
22       Gibbon: Beyond...
23                  Growth
24         Harmonic Depths
26            Heliotropism
31             LumbearJack
32                  Meadow
35    Nature And Life -...
36               Paperbark
37                    Paws
38                Planetka
39                Preserve
41           Rabbit Meadow
43                Regrowth
44                    Reus
45                  Reus 2
46        Sacabambaspis...
47                 Shelter
48               Shelter 2
50               Seedlings
55               Terra Nil
5

In [2]:
# Numeric score assigned to each sentiment label; labels missing from this
# mapping become NaN after `.map`.
SENTIMENT_SCORES = {
    'Very Negative': -2,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
    'Very Positive': 2
}

# Reviews made up solely of brackets, quotes, punctuation and spaces carry no content.
punctuation_only = re.compile(r'^[()\[\]{}\'!,.: ]+$')

reviews_raw = pd.read_parquet('output/sentiments.parquet')

# Dataset cleaning: drop rows with missing values, then keep reviews that are longer
# than one character and are not punctuation-only. No frame is mutated in place, so
# re-running the cell always starts from the file on disk.
reviews_clean = reviews_raw.dropna()
is_informative = reviews_clean['review'].apply(
    lambda text: len(text) > 1 and punctuation_only.match(text) is None
).astype(bool)

df = (
    reviews_clean[is_informative]
    .assign(sentiment=lambda frame: frame['sentiment'].map(SENTIMENT_SCORES))
    # The 'game' column of the parquet file is joined against games['SteamID'] below.
    .rename(columns={'game': 'SteamID'})
)

# Attach the (shortened) game title and keep only reviews of the selected games.
df = (
    games[['game', 'SteamID']]
    .merge(df, on='SteamID', how='inner')
    .drop(columns=['SteamID'])
    .astype({'game': 'category'})
)

df

Unnamed: 0,game,topic,review,embedding,sentiment
0,Agonik,relaxing gameplay,"This is a really fun and cozy game, I'm enjoyi...","[0.001759562, 0.04536525, -0.14214593, -0.0048...",2
1,Agonik,relaxing gameplay,A casual and relaxing experience with nice aes...,"[-0.0031292676, 0.064088024, -0.16408579, -0.0...",1
2,Agonik,relaxing gameplay,Good relaxing game and beautiful ambiance.,"[-0.005222113, 0.069819115, -0.17526186, -0.02...",1
3,Agonik,relaxing gameplay,relaxing little game,"[0.040225442, 0.038598843, -0.15387893, -0.040...",1
4,Agonik,relaxing gameplay,"Fun and relaxing, perfect for when you feel li...","[0.016272368, 0.07279064, -0.17117193, -0.0269...",1
...,...,...,...,...,...
96220,United Penguin...,relaxing atmosphere and music,The architectural structures are beautiful and...,"[0.017700141, 0.068711996, -0.13735436, -0.028...",1
96221,United Penguin...,relaxing atmosphere and music,"So much to do, but already feeling super cozy ...","[0.006987029, 0.094214834, -0.14064616, -0.020...",1
96222,United Penguin...,value for money,"top hab schon die Demo gespielt gehabt,ein seh...","[0.0036671096, 0.00556588, -0.17578974, 0.0274...",2
96223,United Penguin...,value for money,"Nun ist es draußen, bei der Rezession spare ic...","[0.045282446, -0.038138654, -0.14663275, 0.022...",-1


In [3]:
from src.embed import get_embedding
from sklearn.metrics.pairwise import cosine_similarity

def get_similars(queries, embeddings, threshold):
    """Find, for each query, the rows of ``embeddings`` that are similar to it.

    Parameters
    ----------
    queries : sequence of str
        Query texts; must support positional indexing (``queries[i]``).
    embeddings : array-like
        Candidate embeddings, one per row.
    threshold : float
        A candidate matches when its cosine similarity to the query is
        strictly greater than this value.

    Returns
    -------
    dict
        Maps each query that has at least one match to the list of matching
        row positions in ``embeddings``. Queries without matches are absent.
    """
    # NOTE(review): presumably get_embedding returns one embedding row per query - confirm in src.embed.
    scores = cosine_similarity(get_embedding(queries), embeddings)

    # Each hit is a (query position, embedding row position) pair.
    hits = np.argwhere(scores > threshold)
    matches = pd.DataFrame(hits, columns=['query', 'review'])
    matches['query'] = matches['query'].map(lambda position: queries[position])

    rows_per_query = matches.groupby('query')['review'].apply(list)
    return rows_per_query.to_dict()

In [4]:
# One row per distinct review text; the rows of `citation_embeddings` are in the
# same order as the rows of `citations_df`.
citations_df = df.drop_duplicates(subset='review')
citation_embeddings = np.vstack(citations_df['embedding'].to_numpy())

# Presumably the snippet that produced 'output/topics.parquet' (kept for reference):
#all_topics = df['topic'].unique()
#topics_embeddings = get_embedding(list(all_topics))
#pd.DataFrame(data=topics_embeddings, index=all_topics).to_parquet('output/topics.parquet')

# The cached table is indexed by topic name, with the embedding values as columns.
topics_table = pd.read_parquet('output/topics.parquet')
topics_embeddings = topics_table.values
all_topics = topics_table.index

In [5]:
def search_similars(queries, text_threshold=0.8, topic_threshold=0.85):
    """Collect, for every query, the reviews that are similar to it.

    A review is selected for a query when either

    * its own embedding matches the query (see ``get_similars``) with
      ``text_threshold``, or
    * it is labelled with a topic whose embedding matches the query with
      ``topic_threshold``.

    Reads the notebook-level ``citations_df``, ``citation_embeddings``,
    ``all_topics``, ``topics_embeddings`` and ``df``.

    Parameters
    ----------
    queries : sequence of str
        Query texts.
    text_threshold : float, default 0.8
        Similarity threshold used when matching queries against review texts.
    topic_threshold : float, default 0.85
        Similarity threshold used when matching queries against topic names.

    Returns
    -------
    dict
        Maps each query to a DataFrame with the columns ``review``,
        ``sentiment`` and ``game``, holding one row per distinct review text.
    """
    similars_by_text = get_similars(queries, citation_embeddings, text_threshold)
    similars_by_topic = get_similars(queries, topics_embeddings, topic_threshold)

    results = {}
    for query in queries:
        # Positions returned by get_similars refer to rows of citation_embeddings,
        # which are in the same order as citations_df, hence the positional lookup.
        matched_by_text = citations_df.iloc[similars_by_text.get(query, [])]

        matched_topics = all_topics[similars_by_topic.get(query, [])]
        matched_by_topic = df[df['topic'].isin(matched_topics)]

        # A review can be reached through both routes; keep its first occurrence only.
        matched = pd.concat([matched_by_text, matched_by_topic]).drop_duplicates(subset='review')
        results[query] = matched[['review', 'sentiment', 'game']]

    return results

In [6]:
import json

# Mapping of category name -> queries of that category. The file is opened in a
# `with` block so the handle is closed deterministically, and decoded as UTF-8
# (the JSON default) regardless of the platform locale.
with open('data/queries.json', encoding='utf-8') as queries_file:
    queries: dict = json.load(queries_file)

In [7]:
# Run the similarity search once per category; the result is nested as
# results[category][query] -> DataFrame of matching reviews.
results = {}
for category, category_queries in queries.items():
    results[category] = search_similars(category_queries)

In [8]:
# Flatten results[category][query] into one long frame, tagging every row with the
# category and query that retrieved it. A single flat concat keeps the same row order
# as concatenating per category first, but does not fail when a category has no queries.
results_df = pd.concat([
    citations.assign(category=category, query=query)
    for category, category_results in results.items()
    for query, citations in category_results.items()
]).astype({'query': 'category', 'category': 'category'})

results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25946 entries, 3819 to 95780
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   review     25946 non-null  object  
 1   sentiment  25946 non-null  int64   
 2   game       25946 non-null  category
 3   category   25946 non-null  category
 4   query      25946 non-null  category
dtypes: category(3), int64(1), object(1)
memory usage: 686.5+ KB


In [12]:
from itertools import combinations
from collections import defaultdict

# Query pairs that share fewer reviews than this are dropped.
MIN_SHARED_REVIEWS = 10

grouped = results_df.groupby('query', observed=True)

# For every unordered pair of queries, collect the reviews retrieved by both.
# The second query of each pair is stored as `query1` and the first as `query2`.
# After the merge, the columns of the first query's rows carry the `_x` suffix.
shared_by_pair = [
    {
        'query1': query2,
        'query2': query1,
        'shared_reviews': pd.merge(reviews1, reviews2, on='review', how='inner'),
    }
    for (query1, reviews1), (query2, reviews2) in combinations(grouped, 2)
]

# Explicit columns keep the frame well-formed even when there is no pair at all.
cooccurrence = pd.DataFrame(shared_by_pair, columns=['query1', 'query2', 'shared_reviews'])
cooccurrence['n'] = cooccurrence['shared_reviews'].apply(len)

# `.copy()` makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
cooccurrence = cooccurrence[cooccurrence['n'] >= MIN_SHARED_REVIEWS].copy()

# Mean sentiment of the shared reviews.
cooccurrence['mean'] = cooccurrence['shared_reviews'].apply(
    lambda shared: shared['sentiment_x'].mean()
)
# Up to three games with the highest mean sentiment among the shared reviews.
cooccurrence['top-3'] = cooccurrence['shared_reviews'].apply(
    lambda shared: shared.groupby('game_x', observed=True)['sentiment_x'].mean().nlargest(3).index.tolist()
)

cooccurrence

Unnamed: 0,query1,query2,shared_reviews,n,mean,top-3
33,Graphics and Aesthetics,Atmosphere and Ambiance,...,260,1.292308,"[Bird, Equilinox, Endling]"
40,Relaxation and Calmness,Atmosphere and Ambiance,...,59,1.542373,"[Equilinox, Cloud Gardens, Distant Bloom]"
58,Map customization,Character customization,...,54,1.0,"[Bee Simulator, Timberborn, Wolf Quest:...]"
91,Enjoyment and Fun,Educational Value,...,14,1.785714,"[Wolf Quest:..., Wolf Quest: Classic]"
102,Realism,Educational Value,...,15,1.6,"[Wolf Quest:..., Bee Simulator]"
133,Graphics and Aesthetics,Enjoyment and Fun,...,17,1.705882,"[Wolf Quest: Classic, Timberborn, Wolf Quest:...]"
137,Multiplayer,Enjoyment and Fun,...,18,1.444444,"[Wolf Quest: Classic, Eco, Wolf Quest:...]"
139,Realism,Enjoyment and Fun,...,34,1.823529,"[Wolf Quest:..., Wolf Quest: Classic, Bee Simu..."
140,Relaxation and Calmness,Enjoyment and Fun,...,1078,1.434137,"[Wolf Quest:..., Equilinox, APICO]"
180,Game pacing,Game length,...,46,-0.065217,"[Endling, Paperbark, Fishy 3D]"


In [32]:
# One row per query pair, one column per rank. A pair can have fewer than three
# games; each list is padded with None to exactly three entries so the constructor
# does not fail when no pair reaches three games.
top_games = [
    (ranked_games + [None] * 3)[:3]
    for ranked_games in cooccurrence['top-3']
]

pd.DataFrame(
    data=top_games,
    index=pd.MultiIndex.from_frame(cooccurrence[['query1', 'query2']]),
    columns=['1', '2', '3'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3
query1,query2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Graphics and Aesthetics,Atmosphere and Ambiance,Bird,Equilinox,Endling
Relaxation and Calmness,Atmosphere and Ambiance,Equilinox,Cloud Gardens,Distant Bloom
Map customization,Character customization,Bee Simulator,Timberborn,Wolf Quest:...
Enjoyment and Fun,Educational Value,Wolf Quest:...,Wolf Quest: Classic,
Realism,Educational Value,Wolf Quest:...,Bee Simulator,
Graphics and Aesthetics,Enjoyment and Fun,Wolf Quest: Classic,Timberborn,Wolf Quest:...
Multiplayer,Enjoyment and Fun,Wolf Quest: Classic,Eco,Wolf Quest:...
Realism,Enjoyment and Fun,Wolf Quest:...,Wolf Quest: Classic,Bee Simulator
Relaxation and Calmness,Enjoyment and Fun,Wolf Quest:...,Equilinox,APICO
Game pacing,Game length,Endling,Paperbark,Fishy 3D


In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

fig, axs = plt.subplots(1, 2, figsize=(18, 9))
count_ax, mean_ax = axs

# Left panel: how many reviews each pair of queries has in common.
shared_counts = cooccurrence.pivot(index='query1', columns='query2', values='n')
sns.heatmap(shared_counts, annot=True, cmap='Spectral_r', fmt='.0f', ax=count_ax, center=0)
count_ax.set_title('Number of co-occurrenced citations')

# Right panel: mean sentiment of those shared reviews.
shared_sentiment = cooccurrence.pivot(index='query1', columns='query2', values='mean')
sns.heatmap(
    shared_sentiment,
    annot=True,
    fmt='.2f',
    robust=True,
    cmap='Spectral_r',
    ax=mean_ax,
    center=0,
)
mean_ax.set_title('Mean sentiment of co-occurrenced citations')

# The tick labels already name the queries, so the axis labels are blanked.
for panel in (count_ax, mean_ax):
    panel.set_ylabel('')
    panel.set_xlabel('')

os.makedirs('output/img', exist_ok=True)
fig.tight_layout()
fig.savefig('output/img/cooccurrences.pdf', format='pdf')
plt.show()


In [11]:
def get_table(results, agg_fn, min_support=10):
    """Build a table with one column per (category, query) pair.

    Parameters
    ----------
    results : dict
        Nested mapping ``{category: {query: citations}}``.
    agg_fn : callable
        Called with each ``citations`` value; must return a ``pd.Series``.
        The Series become the columns of the table.
    min_support : int, default 10
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    pd.DataFrame
        Columns are a MultiIndex with levels ``category`` and ``query``. The row
        index is the union of the indexes of all aggregated Series; cells for
        labels missing from a given Series are NaN.
    """
    level_names = ['category', 'query']
    columns = {
        (category, query): agg_fn(citations)
        for category, category_results in results.items()
        for query, citations in category_results.items()
    }

    if not columns:
        # Nothing to aggregate: return an empty table with the expected column levels.
        return pd.DataFrame(columns=pd.MultiIndex.from_arrays([[], []], names=level_names))

    # Concatenating side by side aligns on the union of all indexes, so labels that
    # are absent from the first Series are not lost.
    return pd.concat(
        list(columns.values()),
        axis=1,
        keys=list(columns.keys()),
        names=level_names,
    )

In [None]:
# Number of retrieved citations per game, one column per (category, query).
citation_counts = get_table(results, lambda citations: citations['game'].value_counts())
citation_counts.sort_index()

In [None]:
def mean(x: pd.DataFrame, threshold=10):
    """Mean sentiment per game, blanked out where there are too few reviews.

    Parameters
    ----------
    x : pd.DataFrame
        Must contain the columns ``game`` and ``sentiment``.
    threshold : int, default 10
        Minimum number of non-null sentiment values a game needs; games with
        fewer get NaN instead of a mean.

    Returns
    -------
    pd.Series
        Named ``mean`` and indexed by game. With a categorical ``game`` column
        every category is present (``observed=False``), unobserved ones as NaN.
    """
    per_game = x.groupby('game', observed=False)['sentiment'].agg(['mean', 'count'])
    # Vectorised masking instead of a row-wise apply: keep the mean only where the
    # game has at least `threshold` reviews.
    return per_game['mean'].where(per_game['count'] >= threshold)
    

# Mean sentiment per game (rows) and query (columns); the category level is dropped
# so that columns are labelled by query only.
t = get_table(results, mean).sort_index()
t.columns = t.columns.droplevel(0)

# Discard queries and games that have no value at all.
t = t.dropna(axis=1, how='all').dropna(axis=0, how='all')

# Sort columns by non-null values
queries_by_coverage = t.count().sort_values(ascending=True).index
t = t[queries_by_coverage]

# Sort index by non-null values
games_by_coverage = t.count(axis=1).sort_values(ascending=False).index
t = t.loc[games_by_coverage]

# The table is transposed for plotting: games on the x axis, queries on the y axis.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(t.T, annot=True, cmap='Spectral_r', fmt='.1f', ax=ax)
ax.set_title('Mean sentiment by game and query')
ax.set_xlabel('Game')
ax.set_ylabel('Query')
plt.show()