## 3. Purely Louvain Algorithm Implementation

In [1]:
import numpy as np
import pandas as pd

emb_norm = np.load("../data/embeddings_FINAL.npy") 
df = pd.read_pickle("../data/games_df_FINAL.pkl")

print(emb_norm.shape)
print(df.shape)

(10476, 384)
(10476, 11)


### 3.1 Construct Louvain's Starting Graph with weighted edges based in cosine similarity

In [2]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import time

# Cosine Sim
t0 = time.time()
# print("Calculating cosine sim...")
sim_matrix = cosine_similarity(emb_norm)
print("Cosine similarity done in:", time.time() - t0, "sec")


# Constructing Graph
threshold = 0.8  # Do not add all edges

t0 = time.time()
G = nx.Graph()

# Add all nodes (all videogames)
for i in range(len(df)):
    G.add_node(i)

# print("Building Edges...")
n = sim_matrix.shape[0]
count_edges = 0
for i in range(n):
    for j in range(i+1, n):
        sim = sim_matrix[i, j]
        if sim > threshold:
            G.add_edge(i, j, weight=float(sim))
            count_edges += 1
print("Graph Construction done in:", time.time() - t0, "sec")

print("Vertices:", G.number_of_nodes())
print("Edges:", G.number_of_edges()) 

# 0.8 threshold ->   2390 edges 40.49s
# 0.65 threshold ->  27442 edges 39.5s
# 0.60 threshold ->  63942 edges 38.6s
# 0.40 threshold ->  3010936 edges 72.2s


Cosine similarity done in: 2.5568110942840576 sec
Graph Construction done in: 20.258572578430176 sec
Vertices: 10476
Edges: 2390


### 3.2 Execute Louvain, Save & Analysis

In [4]:
import community as community_louvain

partition = community_louvain.best_partition(G, weight='weight')
modularity = community_louvain.modularity(partition, G)

print("Modularity:", modularity)
print("Nº Clusters:", len(set(partition.values())))

# With the dataset with DLCs: modularity of 0.2888, 23 clusters
# With cos 0.4: 0.2844 mod, 20 clusters
# With cos 0.6: 0.672 mod,  clusters 4265
# With cos 0.65: 0.730 mod,  clusters 6653
# With cos 0.8: 0.925 mod,  clusters 9627

Modularity: 0.9251854436091717
Nº Clusters: 9627


In [5]:
import pickle

with open("partition_louvain_puro.pkl", "wb") as f:
    pickle.dump(partition, f)

with open("graph_louvain_puro.pkl", "wb") as f:
    pickle.dump(G, f)

print("Saved Louvain outputs.")

Saved Louvain outputs.


#### 3.2.1 Louvain's Evaluation

In [6]:
from collections import Counter

cluster_sizes = Counter(partition.values())
print("Cluster size distribution:")
for cid, size in cluster_sizes.most_common():
    print(f"Cluster {cid}: {size} games")


Cluster size distribution:
Cluster 167: 65 games
Cluster 190: 55 games
Cluster 69: 46 games
Cluster 2277: 30 games
Cluster 786: 28 games
Cluster 413: 22 games
Cluster 2028: 22 games
Cluster 3311: 22 games
Cluster 263: 21 games
Cluster 354: 21 games
Cluster 1657: 19 games
Cluster 185: 17 games
Cluster 196: 17 games
Cluster 169: 16 games
Cluster 1606: 16 games
Cluster 2854: 12 games
Cluster 130: 10 games
Cluster 4985: 10 games
Cluster 570: 9 games
Cluster 1410: 9 games
Cluster 685: 8 games
Cluster 2270: 8 games
Cluster 4731: 7 games
Cluster 426: 6 games
Cluster 1489: 6 games
Cluster 119: 5 games
Cluster 176: 5 games
Cluster 258: 5 games
Cluster 265: 5 games
Cluster 959: 5 games
Cluster 1258: 5 games
Cluster 2151: 5 games
Cluster 2324: 5 games
Cluster 2601: 5 games
Cluster 2981: 5 games
Cluster 3577: 5 games
Cluster 70: 4 games
Cluster 208: 4 games
Cluster 328: 4 games
Cluster 433: 4 games
Cluster 765: 4 games
Cluster 1039: 4 games
Cluster 1046: 4 games
Cluster 1121: 4 games
Cluster 1713:

In [16]:
def average_intra_cluster_similarity(sim_matrix, partition):
    clusters = {}
    for node, cid in partition.items():
        clusters.setdefault(cid, []).append(node)

    intra_sims = {}

    for cid, nodes in clusters.items():
        if len(nodes) < 2:
            intra_sims[cid] = 0
            continue

        sims = []
        for i in range(len(nodes)):
            for j in range(i + 1, len(nodes)):
                sims.append(sim_matrix[nodes[i], nodes[j]])

        intra_sims[cid] = np.mean(sims)

    return intra_sims

intra = average_intra_cluster_similarity(sim_matrix, partition)

print("\nAverage intra-cluster similarity:")
for cid, sim in sorted(intra.items(), key=lambda x: -x[1])[:10]:
    print(f"Cluster {cid}: {sim:.4f}")



Average intra-cluster similarity:
Cluster 5984: 1.0000
Cluster 4830: 0.9995
Cluster 6742: 0.9930
Cluster 4998: 0.9877
Cluster 5034: 0.9876
Cluster 1490: 0.9840
Cluster 5753: 0.9819
Cluster 4840: 0.9806
Cluster 4407: 0.9774
Cluster 3625: 0.9715


### Visual analysis for Louvains algorithm

In [7]:
def inspect_cluster(cluster_id, df, partition, n=50):
    games = [i for i, c in partition.items() if c == cluster_id]
    return df.iloc[games].head(n)[["game_id", "name", "genres", "summary", ]]


In [8]:
inspect_cluster(69, df, partition)

Unnamed: 0,game_id,name,genres,summary
69,343605,Hentai Beach,"Indie, Puzzle",Hentai Beach - our warm resort with hot beauties.
229,161575,Hentai Fantasy,"Indie, Strategy",Enjoy the beauty girls picture and relaxing mu...
1251,197305,Naughty Waifu,Indie,Move all parts to form the complete image.
1317,265767,Hentai Vivian,Indie,What could be more beautiful than hentai?
1819,286722,Hentai Karada,"Indie, Strategy",Enjoy the beauty girls picture and relaxing mu...
1887,286726,Hentai Sakyubus,"Indie, Strategy",Enjoy the beauty girls picture and relaxing mu...
2038,286723,Hentai Hentai,"Indie, Strategy",Enjoy the beauty girls picture and relaxing mu...
2638,173853,Neko Hentai Girl,"Adventure, Indie",Only in this game you get AAA-quality nude pic...
3816,156637,Hentai Balloons 2,Indie,Gameplay: Move All Parts to Form the Complete ...
3829,177629,Hentai Clouds,Puzzle,Move the puzzle pieces to complete the drawing.


### Louvains algorithm implementation with different thresholds and evaluation

In [10]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score, davies_bouldin_score
import community as community_louvain
# import umap
import matplotlib.pyplot as plt
import time

emb_norm = np.load("../data/embeddings_FINAL.npy") 
df = pd.read_pickle("../data/games_df_FINAL.pkl")

print("Embeddings shape:", emb_norm.shape)
print("Dataframe shape:", df.shape)

# Compute cosine similarity
t0 = time.time()
sim_matrix = cosine_similarity(emb_norm)
print("Cosine similarity done in:", time.time() - t0, "sec")

# Thresholds to test
thresholds = [0.4, 0.6, 0.65, 0.8]
results = []

for threshold in thresholds:
    print("\n=== Threshold:", threshold, "===")
    t0 = time.time()
    
    # Construct Graph
    G = nx.Graph()
    for i in range(len(df)):
        G.add_node(i)
    
    n = sim_matrix.shape[0]
    count_edges = 0
    for i in range(n):
        for j in range(i+1, n):
            sim = sim_matrix[i, j]
            if sim > threshold:
                G.add_edge(i, j, weight=float(sim))
                count_edges += 1
    print("Graph construction done in:", time.time() - t0, "sec")
    print("Vertices:", G.number_of_nodes())
    print("Edges:", G.number_of_edges())
    
    # Louvain partition
    partition = community_louvain.best_partition(G, weight='weight')
    modularity = community_louvain.modularity(partition, G)
    n_clusters = len(set(partition.values()))
    
    print("Modularity:", modularity)
    print("Number of clusters:", n_clusters)
    
    # Metrics
    labels = np.array([partition[i] for i in range(len(df))])
    try:
        silhouette = silhouette_score(emb_norm, labels)
        db_score = davies_bouldin_score(emb_norm, labels)
    except:
        silhouette = np.nan
        db_score = np.nan
    print("Silhouette Score:", silhouette)
    print("Davies-Bouldin Score:", db_score)
    
    results.append({
        "threshold": threshold,
        "edges": G.number_of_edges(),
        "modularity": modularity,
        "n_clusters": n_clusters,
        "silhouette": silhouette,
        "davies_bouldin": db_score
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\n=== Summary Table ===")
print(results_df)

# # Visualization: UMAP projection of communities
# # Choose the threshold you want to visualize
# best_threshold = 0.65
# best_partition = None
# for res in results:
#     if res["threshold"] == best_threshold:
#         best_partition = community_louvain.best_partition(
#             nx.Graph([(i, j, {'weight': sim_matrix[i,j]}) for i in range(len(df)) for j in range(i+1, len(df)) if sim_matrix[i,j] > best_threshold]), 
#             weight='weight'
#         )
# labels = np.array([best_partition[i] for i in range(len(df))])

# reducer = umap.UMAP(n_components=2, random_state=42)
# embedding_2d = reducer.fit_transform(emb_norm)

# plt.figure(figsize=(10,7))
# scatter = plt.scatter(embedding_2d[:,0], embedding_2d[:,1], c=labels, cmap='tab20', s=15)
# plt.title(f"UMAP projection of games colored by Louvain communities (threshold={best_threshold})")
# plt.colorbar(scatter, label='Community ID')
# plt.show()


Embeddings shape: (10476, 384)
Dataframe shape: (10476, 11)
Cosine similarity done in: 0.9444174766540527 sec

=== Threshold: 0.4 ===
Graph construction done in: 26.740947723388672 sec
Vertices: 10476
Edges: 3010936
Modularity: 0.28449779297898115
Number of clusters: 20
Silhouette Score: 0.017398551106452942
Davies-Bouldin Score: 1.9665367925952038

=== Threshold: 0.6 ===
Graph construction done in: 19.83684277534485 sec
Vertices: 10476
Edges: 63942
Modularity: 0.672569627962403
Number of clusters: 4265
Silhouette Score: -0.047659993171691895
Davies-Bouldin Score: 0.9933494381088941

=== Threshold: 0.65 ===
Graph construction done in: 23.994199514389038 sec
Vertices: 10476
Edges: 27442
Modularity: 0.7302331155083333
Number of clusters: 6651
Silhouette Score: 0.0005958016845397651
Davies-Bouldin Score: 0.9316109504403466

=== Threshold: 0.8 ===
Graph construction done in: 20.519299268722534 sec
Vertices: 10476
Edges: 2390
Modularity: 0.9251854436091719
Number of clusters: 9627
Silhouett

### 3.4 Recommendation Sytem with Louvains Graph

In [None]:
# Añadir aqui el mejor modelo de todos y construir el recomendador con ello

In [None]:
_partition_cache = None
_graph_cache = None

def recommend_pure_louvain(game_id, embeddings_norm, df, topk=10):

    global _partition_cache, _graph_cache

    # Load partition
    if _partition_cache is None:
        print("Loading Louvain partition...")
        with open("partition_louvain_puro.pkl", "rb") as f:
            _partition_cache = pickle.load(f)

    # Load graph
    if _graph_cache is None:
        print("Loading Louvain graph...")
        with open("graph_louvain_puro.pkl", "rb") as f:
            _graph_cache = pickle.load(f)

    partition = _partition_cache
    G = _graph_cache
    cluster_id = partition[game_id]
    same_cluster = [
        j for j, c in partition.items()
        if c == cluster_id and j != game_id
    ]

    if len(same_cluster) == 0:
        print("Game is isolated; no recommendations.")
        return []

    sims = [
        (j, float(np.dot(embeddings_norm[game_id], embeddings_norm[j])))
        for j in same_cluster
    ]

    sims.sort(key=lambda x: x[1], reverse=True)
    return sims[:topk]

### 3.5 Example of Recommendation by Purely Louvain

In [None]:
game_id = 42
print("Base game:", df.iloc[game_id]["name"])
recs = recommend_pure_louvain(game_id, emb_norm, df)
for idx, sim in recs:
    print(f"{df.iloc[idx]['name']} – sim={sim:.3f}")