In [1]:
# Import Required Packages

import networkx as nx       # I used NetworkX to construct and analyze the network graph, including centrality and diameter.
import gzip                 # I had to use gzip because the dataset was compressed as a .gz file.
import json                 # I used json to convert each review from raw text into a readable Python dictionary.
import pandas as pd         # I used pandas to organize the review data into a structured table.

In [2]:
# Load the Amazon Beauty Review Dataset
#
# The file is gzip-compressed JSON Lines: each non-empty line is one review
# encoded as a JSON object. It is opened in text mode ("rt") so that every
# line arrives as a str that json.loads can parse directly.
#
# Blank / whitespace-only lines (for example a trailing newline at the end of
# the file) are skipped, because json.loads raises JSONDecodeError on an
# empty string and would otherwise abort the whole load.
with gzip.open("all-beauty-5.json.gz", "rt", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Turn the list of parsed reviews into a DataFrame so that later cells can
# build the network from named columns.
df = pd.DataFrame(data)

In [3]:
# Build the reviewer-product network.
#
# Every row of `df` is one review, so it contributes one undirected edge
# between the row's "reviewerID" value and its "asin" value. Reviewers and
# products therefore end up as the two kinds of node in the same graph.
edge_columns = {"source": "reviewerID", "target": "asin"}
G = nx.from_pandas_edgelist(df, **edge_columns)

# Report the size of the resulting network.
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()
print("Total Nodes:", node_total)
print("Total Edges:", edge_total)

Total Nodes: 1076
Total Edges: 4092


In [4]:
# Isolate the Largest Connected Component
#
# The graph may consist of several disconnected pieces. The analysis below is
# restricted to the piece with the most nodes.
components = nx.connected_components(G)
largest_cc = max(components, key=len)

# Subgraph of G induced by the nodes of that component.
G_cc = G.subgraph(largest_cc)

# The diameter is the longest shortest-path distance between any two nodes
# of the component.
component_diameter = nx.diameter(G_cc)
print("Graph Diameter (Largest Component):", component_diameter)

Graph Diameter (Largest Component): 10


In [5]:
# Degree centrality scores each node by its number of direct connections.
# In this network:
#   - a product's connections are the reviewers who reviewed it;
#   - a reviewer's connections are the products they reviewed.
degree_centrality = nx.degree_centrality(G_cc)

# Rank the nodes from highest to lowest score. The sort is stable, so nodes
# with equal scores keep the order in which they appear in the dictionary.
ranked_nodes = sorted(
    degree_centrality,
    key=degree_centrality.get,
    reverse=True,
)

# Show the six highest-scoring nodes, rounded to four decimals.
print("\nTop 6 Nodes by Degree Centrality:")
for node in ranked_nodes[:6]:
    score = degree_centrality[node]
    print(node, round(score, 4))


Top 6 Nodes by Degree Centrality:
B0012Y0ZG2 0.8174
B000URXP6E 0.7959
B00006L9LC 0.4579
B001OHV1H4 0.4579
B0009RF9DW 0.3614
B000FI4S1E 0.3614


In [6]:
# Prepare Data for Visualization (Gephi Export)
# Create node classification table (Reviewer vs Product)
#
# One row per node of the largest component, with a "Type" column so the two
# kinds of node can be coloured differently in Gephi.
nodes_df = pd.DataFrame({
    "Id": list(G_cc.nodes())
})

# A node is a product exactly when its id appears in the "asin" column the
# graph was built from. This replaces the earlier "starts with B" heuristic,
# which mislabels any product whose ASIN does not begin with "B" (and any
# reviewer id that does), and which fails on non-string ids.
is_product = nodes_df["Id"].isin(df["asin"])
nodes_df["Type"] = is_product.map({True: "Product", False: "Reviewer"})

# Write the node table for import into Gephi (no index column).
nodes_df.to_csv("beauty-nodes.csv", index=False)

In [7]:
# Export the edge list so that each reviewer-product relationship can be
# visualized in Gephi. Each row holds the two endpoints of one edge of the
# largest component.
edge_pairs = list(G_cc.edges())
edges_df = pd.DataFrame(edge_pairs, columns=["Source", "Target"])

# Write the edge table without the index column.
edges_df.to_csv("beauty-network.csv", index=False)