In [None]:
import os
import gzip
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
import pandas as pd
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentIntensityAnalyzer

import netwulf as nw
import networkx as nx
from networkx.readwrite import json_graph

import community as community_louvain
import matplotlib.pyplot as plt



In [4]:
with open("network.json", "r") as f:
    graph_data = json.load(f)

# Convert the data back into a NetworkX graph object
Graph = json_graph.node_link_graph(graph_data)

The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


In [5]:
num_nodes = Graph.number_of_nodes()
num_edges = Graph.number_of_edges()

#Solve for p using formula:
Lmax = num_nodes * (num_nodes - 1) / 2  
p = num_edges / Lmax  
print(f"Calculated probability p: {p}")

#Find k using formula
average_degree = (2 * num_edges) / num_nodes
print(f"Average degree: {average_degree}")

Calculated probability p: 0.004498466456455661
Average degree: 154.4368519165793


In [6]:
critical_point = 1/num_nodes
print(critical_point)
p > critical_point

2.9127344751252476e-05


True

In [7]:
np.log(num_nodes)

10.44383314283811

In [9]:
#list of connected components:
connected_components = list(nx.connected_components(Graph))
#Finding the biggest set, from the list:
largest_component = max(connected_components, key=len)
giant_component = Graph.subgraph(largest_component)

import random
#Algortihm that approximates the shortest path lengths:
def sample_shortest_paths(graph, num_samples):
    nodes = list(graph.nodes())
    total_path_length = 0
    count = 0
    
    for _ in range(num_samples):
        node1, node2 = random.sample(nodes, 2)
        try:
            path_length = nx.shortest_path_length(graph, source=node1, target=node2)
            total_path_length += path_length
            count += 1
        except nx.NetworkXNoPath:
            continue  

    if count > 0:
        return total_path_length / count
    else:
        return float('inf') 

In [11]:
approx_average_path = sample_shortest_paths(giant_component, 10000)
print("Approximated average shortest path", approx_average_path)

Approximated average shortest path 2.7099


In [12]:
def average_dist_approximation(graph):
    N = graph.number_of_nodes()
    avg_degree = sum(dict(graph.degree()).values()) / N
    avg_shortest_path = np.log(N) / np.log(avg_degree)
    return avg_shortest_path

approx_shortest_path_real = average_dist_approximation(giant_component)

print(f"Approx distance in the Soc Sci Network: {approx_shortest_path_real}")

real_clustering = nx.average_clustering(Graph)
print(f"Global Clustering Coefficient of Real Network: {real_clustering}")

Approx distance in the Soc Sci Network: 2.0713891994765508
Global Clustering Coefficient of Real Network: 0.26457522757771434


In [16]:
partition = community_louvain.best_partition(Graph)

In [35]:
communities = defaultdict(list)
for node, comm_id in partition.items():
    communities[comm_id].append(node)

for comm_id, members in communities.items():
    print(f"Community {comm_id}: {members}")

Community 11: ['B0002J58ME', 'B001F76OKC', 'B00147FVKW', 'B0042FDCMW', 'B00B5LIE4C', 'B003X2O79W', 'B00BSB9NDK', 'B006IRQU5S', 'B001OSC4FG', 'B00612FR1I', 'B003ULW71Q', 'B000U0C9UE', 'B00012FNBI', 'B000BR4UQ2', 'B000I2JCB8', 'B00012FNB8', 'B003AFPWI2', 'B07P67V245', 'B0089PCXAS', 'B0009WFFPO', 'B00R8HWMEU', 'B00KE77JJ4', 'B005BX3O0O', 'B00049QPII', 'B00FRPK4GI', 'B00HUAGZKW', 'B00247XURC', 'B00KASQQTQ', 'B000RL21OA', 'B003HC9JJ6', 'B00429C1QQ', 'B08MV1RJXG', 'B08M2M6WK2', 'B07K7SSSPM', 'B07KGJX6N7', 'B07KN9TM7K', 'B07KNB6D5F', 'B07G3KRYZK', 'B07DS8M656', 'B076ZS4LLX', 'B0765925D8', 'B00B2TWDWG', 'B00AZ2N906', 'B00IIOLQXU', 'B00QX5FAQA', 'B01GKKI38Y', 'B01BKTG026', 'B002LYD2M6', 'B0044CM7YC', 'B00008DDIW', 'B00FMO7QIS', 'B0001ZJQ72', 'B00PRABXFO', 'B0009WIY4S', 'B00FXCIS90', 'B00OZOIIYM', 'B00E8A0PNK', 'B0061S2T6S', 'B00R0HUEWK', 'B07DB1LKTH', 'B01M6ATZSG', 'B06Y12RK1V', 'B0012YN35M', 'B00007D00L', 'B00467182C', 'B00W85TAHE', 'B005GYXNT0', 'B000WC38CS', 'B00005N8AM', 'B0006BHP9M', 'B000

In [22]:
df = pd.read_csv("Merged_Reviews_and_Metadata.csv")

  df = pd.read_csv("Merged_Reviews_and_Metadata.csv")


In [None]:

community_titles = {}
asin_to_title = dict(zip(df['asin'], df['title_y']))

for comm_id, members in communities.items():
    # Skip single communities
    if len(members) < 2:
        continue

    titles = []
    for asin in members:
        if asin in asin_to_title:
            title = asin_to_title[asin]
            # Skip if the title is NaN
            if pd.isna(title):
                continue
            titles.append(title)

    # Only add the community if it has valid titles
    if titles:
        community_titles[comm_id] = titles

# Print out the movie titles for each community
for comm_id, titles in community_titles.items():
    print(f"Community {comm_id}:")
    for title in titles:
        print(f"  > {title}")
    print()

Community 11:
  > 10 Minute Solution: Pilates
  > Rodney Yee's Yoga for Beginners (Packaging May Vary)
  > Lots & Lots of Trucks Volume 1 - Hard Workin Trucks
  > Jane Fonda: Prime Time – Fit & Strong
  > Beachbody Shaun T's Rockin' Body DVD Workout, Dance Workout DVDs, Exercise Videos, Dancing Fitness Guide for Beginners, Seniors, Easy To Follow, Low Impact, 5 Workouts Included
  > Total Immersion Swimming: Perpetual Motion Freestyle in Ten Lessons
  > Kundalini Yoga: Green Energy of the Heart - All-In-One Workout (ALL LEVELS) by Ana Brett & Ravi Singh
  > Tricycle: Pick Your Ride Cycle with Mindy Mylrea
  > Cathe Friedrich Low Impact Cycle Max Indoor Cycling DVD - Use Cathe low impact Indoor cycling workouts for lower body sculpting, fat burning, weight loss and aerobic conditioning
  > Viniyoga Therapy for the Low Back, Sacrum & Hips with Gary Kraftsow
  > Yoga Total Tune-UP
  > Yoga Bliss Hips - with the New **Matrix** Menu Option - Ana Brett & Ravi Singh
  > Yoga Cleanse - Lighten