In [13]:
import pandas as pd
import pickle
import networkx as nx
import sys
import os

# Structural

In [14]:

sys.path.append(os.path.abspath('..'))
from src.features import get_pagerank, get_approx_betweenness, get_clustering_coefficient, get_harmonic_centrality

from src import load_or_compute

# 1. Load the graph from the pickle file
# Make sure the path is correct relative to where you run this script
with open("../data/processed/amazon_graph.pickle", "rb") as f:
    G_loaded = pickle.load(f)

recompute = False

In [15]:

df_handpagerank = load_or_compute("../data/processed/pagerank_scores.csv", get_pagerank , recompute , G = G_loaded, force_cpu = False)
# Now all functions (build_index_map, calculate_pagerank, etc.) are available in memory
#prscores = get_prscores(G_loaded)
display(sum(df_handpagerank["pagerank"].values))
#df_handpagerank = pd.DataFrame(list(prscores.items()), columns=['ASIN', 'MyHandPageRank'])
display(df_handpagerank.head(20))
# Save to CSV
#df_handpagerank.to_csv("../data/processed/handpagerank_scores.csv", index=False)

File found.


np.float64(0.9854412932845537)

Unnamed: 0_level_0,pagerank
ASIN,Unnamed: 1_level_1
0827229534,5.332977e-06
0738700797,9.438284e-06
0842328327,6.925232e-07
1577943082,1.400741e-06
0486220125,4.67182e-07
B00000AU3R,5.403461e-06
0231118597,3.781384e-06
0375709363,4.67182e-07
0871318237,4.67182e-07
1590770218,5.545402e-06


In [16]:


df_bet = load_or_compute("../data/processed/bet_scores.csv",get_approx_betweenness,recompute,G=G_loaded,k=1000) 
display(df_bet.head(5))
# Save to CSV



# Now you can use the scores
print(f"Computed bet cent for {len(df_bet)} nodes.")

File found.


Unnamed: 0_level_0,bet
ASIN,Unnamed: 1_level_1
827229534,0.0
738700797,0.0
842328327,0.0
1577943082,0.0
486220125,0.0


Computed bet cent for 334843 nodes.


In [17]:


df_clus = load_or_compute("../data/processed/clus_scores.csv",get_clustering_coefficient,recompute,G = G_loaded)
display(df_clus.head(5))

# Now you can use the scores
print(f"Computed cc for {len(df_bet)} nodes.")

File found.


Unnamed: 0_level_0,clus
ASIN,Unnamed: 1_level_1
827229534,0.0
738700797,0.027501
842328327,0.0
1577943082,0.0
486220125,0.0


Computed cc for 334843 nodes.


In [18]:
# Compute Harmonic scores and save them to a csv file
df_harmonic_scores = load_or_compute("../data/processed/harm_scores.csv",get_harmonic_centrality,False,G= G_loaded, version="GPU")
#df_harmonic_scores.to_csv("../data/processed/harm_scores.csv", index=False)

display(df_harmonic_scores.head(5))
print(f"Computed hc for {len(df_harmonic_scores)} nodes.")

File found.


Unnamed: 0_level_0,HarmonicCentrality
ASIN,Unnamed: 1_level_1
827229534,10890.978
738700797,9056.977
842328327,2.840921
1577943082,5.111011
486220125,0.0


Computed hc for 334843 nodes.


### Now we work on df


In [19]:

#df_pagerank = pd.DataFrame.from_dict(pagerank_scores, orient='index', columns=['PageRank'])
#df_clus = pd.DataFrame.from_dict(clus_scores, orient='index', columns=['ClusteringCoeff'])
#df_bet = pd.DataFrame.from_dict(bet_scores, orient='index', columns=['Betweenness'])


data_frames = [df_handpagerank,df_clus,df_bet,df_harmonic_scores]

df_final = pd.concat(data_frames,axis=1)



We now add the salestrank score

In [20]:

salesrank_dict = nx.get_node_attributes(G_loaded, 'salesrank')

# Convert to DataFrame
df_salesrank = pd.DataFrame.from_dict(salesrank_dict, orient='index', columns=['salesrank'])
df_salesrank.index.name = 'ASIN'

# Join to df_final
df_final = df_final.join(df_salesrank)

# Check the result
display(df_final.head())

Unnamed: 0_level_0,pagerank,clus,bet,HarmonicCentrality,salesrank
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
827229534,5.332977e-06,0.0,0.0,10890.978,396585.0
738700797,9.438284e-06,0.027501,0.0,9056.977,168596.0
842328327,6.925232e-07,0.0,0.0,2.840921,631289.0
1577943082,1.400741e-06,0.0,0.0,5.111011,455160.0
486220125,4.67182e-07,0.0,0.0,0.0,188784.0


and then the review score

In [21]:
df_review = pd.read_csv("../data/processed/review_scores.csv")
df_review.set_index("ASIN", inplace=True)
df_review.head()

Unnamed: 0_level_0,rw_score,num_reviews
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1
827229534,5.0,2
738700797,4.136364,12
486287785,5.0,1
842328327,4.0,1
486220125,4.550296,17


In [27]:
df_final = pd.merge(
    df_final, 
    df_review[['rw_score']], 
    on='ASIN', 
    how='left'
)

In [28]:
df_final.head()

Unnamed: 0_level_0,pagerank,clus,bet,HarmonicCentrality,salesrank,rw_score_x,rw_score_y
ASIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
827229534,5.332977e-06,0.0,0.0,10890.978,396585.0,5.0,5.0
738700797,9.438284e-06,0.027501,0.0,9056.977,168596.0,4.136364,4.136364
842328327,6.925232e-07,0.0,0.0,2.840921,631289.0,4.0,4.0
1577943082,1.400741e-06,0.0,0.0,5.111011,455160.0,,
486220125,4.67182e-07,0.0,0.0,0.0,188784.0,4.550296,4.550296


In [None]:
df_final.to_csv("../data/processed/structural_metrics.csv")

## Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

features = ['pagerank','Betweenness','ClusteringCoefficient','HarmonicCentrality','salesrank','rw_score']
X=df_final[features]

#non ci dovrebbero essere NaN ma per sicurezza
X=X.fillna(0)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
X.head()

Standardize

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4)
clusters = kmeans.fit_predict(X_scaled)

df_final['cluster_kmeans']= clusters

#questo stampa quanti nodi per ogni gruppo
print(df_final['cluster_kmeans'].value_counts())

#maybe we can save it

confronto con gruppi 

In [None]:
#aggiungo il gruppo 
groups_dict = nx.get_node_attributes(G_loaded, 'group')

# 2. Converti in un DataFrame (assicurandoti che l'indice sia l'ASIN)
df_groups = pd.DataFrame.from_dict(groups_dict, orient='index', columns=['group'])
df_groups.index.name = 'ASIN'

# 3. Unisci al tuo df_final
# join Ã¨ intelligente: allinea automaticamente gli indici (ASIN)
df_final = df_final.join(df_groups)

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

crosstab = pd.crosstab(df_final['cluster_kmeans'], df_final['group'])

print(crosstab)

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(crosstab, annot=True, fmt='d',cmap='YlGnBu')
plt.xlabel('Categoria Originale')
plt.ylabel('Cluster Assegnato')
plt.show()

# Embeddings 

TODO: call load or compute or a similar function
load the embeddings in a df

In [None]:
embedding_file_path = "../data/processed/embeddings.csv" 
df_embeddings = pd.read_csv(embedding_file_path, index_col=0)

print("embedding dimension:", df_embeddings.shape)
display(df_embeddings.head())


In [None]:
from sklearn.preprocessing import StandardScaler, normalize

X = df_embeddings.values

X_norm = normalize(X) 

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_norm)

df_embeddings['cluster_kmeans'] = clusters

print("\nConteggio Cluster:")
print(df_embeddings['cluster_kmeans'].value_counts())



In [None]:
df_final = df_final.join(df_embeddings['cluster_kmeans'], rsuffix='_emb')
df_final.head()

In [None]:
crosstab = pd.crosstab(df_final['cluster_kmeans_emb'], df_final['group'])

print(crosstab)

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(crosstab, annot=True, fmt='d',cmap='YlGnBu')
plt.xlabel('Categoria Originale')
plt.ylabel('Cluster Assegnato')
plt.show()

## Hybrid


In [None]:
data_frames = [df_handpagerank,df_clus,df_bet,df_harmonic_scores,df_salesrank]

df_struct = pd.concat(data_frames,axis=1)
df_struct.head()

In [None]:
scaler = StandardScaler()
X_struct = scaler.fit_transform(df_final[['pagerank', 'Betweenness', 'ClusteringCoefficient', 'HarmonicCentrality', 'salesrank']])

In [None]:
#ho ricaricaricato gli embeddings ma si puo usare il df di prima 
df_emb = pd.read_csv("../data/processed/embeddings.csv", index_col=0)

In [None]:
df_hybrid_raw = df_struct.join(df_emb, how='inner', lsuffix='_struct', rsuffix='_emb')

# Separiamo le colonne per poter applicare normalizzazioni diverse
cols_struct = df_struct.columns
cols_emb = df_emb.columns

# Se hai colonne 'non-feature' (come 'group' o 'title'), rimuovile dalle liste
cols_struct = [c for c in cols_struct if c not in ['group', 'title', 'ASIN']]
# cols_emb dovrebbe contenere solo numeri (0, 1, ... 127)

print(f"Feature Strutturali: {len(cols_struct)}")
print(f"Dimensioni Embedding: {len(cols_emb)}")
print(f"Totale Nodi Allineati: {len(df_hybrid_raw)}")



# Opzione A: MinMax Scaling (Tutto tra 0 e 1) - Spesso preferito per ibridi
#scaler = MinMaxScaler()

# Opzione B: StandardScaler (Media 0, Var 1)
scaler = StandardScaler()

# Applichiamo lo scaler all'intero dataset concatenato
# (Oppure puoi scalare le due parti separatamente se vuoi pesi diversi)
X_hybrid = scaler.fit_transform(df_hybrid_raw[cols_struct + list(cols_emb)].fillna(0))


df_hybrid_features = pd.DataFrame(
    X_hybrid, 
    index=df_hybrid_raw.index, 
    columns=list(cols_struct) + list(cols_emb)
)

display(df_hybrid_features.head())



In [None]:

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans.fit_predict(df_hybrid_features)

# Aggiungiamo i cluster al df originale per analisi
#df_results = df_hybrid_raw.copy()
df_final['cluster_hybrid'] = clusters

# Se hai la colonna 'group' (ground truth), analizza i risultati
if 'group' in df_final.columns:
    print("\nConfronto con Ground Truth:")
    print(pd.crosstab(df_final['cluster_hybrid'], df_final['group']))




In [None]:
df_final.head()

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(crosstab, annot=True, fmt='d',cmap='YlGnBu')
plt.xlabel('Categoria Originale')
plt.ylabel('Cluster Assegnato')
plt.show()