# Exploring Amazon dataset
Link: https://amazon-reviews-2023.github.io/

In [None]:
import json
import pandas as pd
from tqdm import tqdm
import networkx as nx
from itertools import combinations, chain
from collections import Counter

We start with the "Movies and TV" category.

## Extracting the dataset 

In [2]:
file_review = "Movies_and_TV.jsonl" # e.g., "All_Beauty.jsonl", downloaded from the `review` link above
# Peek at the first record only, to see which fields a review carries.
# The encoding is pinned so the read does not depend on the platform's default codec.
with open(file_review, 'r', encoding='utf-8') as fp:
    for line in fp:
        print(json.loads(line.strip()))
        break


{'rating': 5.0, 'title': 'Five Stars', 'text': "Amazon, please buy the show! I'm hooked!", 'images': [], 'asin': 'B013488XFS', 'parent_asin': 'B013488XFS', 'user_id': 'AGGZ357AO26RQZVRLGU4D4N52DZQ', 'timestamp': 1440385637000, 'helpful_vote': 0, 'verified_purchase': True}


In [3]:
# Function taken from https://github.com/hyp1231/AmazonReviews2023/blob/main/benchmark_scripts/kcore_filtering.py and edited to fit our needs
def load_ratings(file):
    """Read a review JSONL file into a list of interaction tuples.

    Parameters
    ----------
    file : str or path-like
        Path to a file holding one JSON object per line.

    Returns
    -------
    list of tuple
        One ``(user_id, parent_asin, rating, timestamp, text)`` tuple per
        parsed line, in file order. Note the order: the user comes first and
        the item second. ``rating`` is converted to ``float`` and
        ``timestamp`` to ``int``.

    Lines that raise ``ValueError`` while being parsed or converted (for
    example invalid JSON) are printed and skipped.
    """
    inters = []
    # Pin the encoding so non-ASCII review text is read the same way on every platform
    with open(file, 'r', encoding='utf-8') as fp:
        for line in tqdm(fp, desc='Load ratings'):
            try:
                dp = json.loads(line.strip())
                item, user, rating, time, text = dp['parent_asin'], dp['user_id'], dp['rating'], dp['timestamp'], dp['text']
                inters.append((user, item, float(rating), int(time), text))
            except ValueError:
                # Show the offending line and carry on with the rest of the file
                print(line)
    return inters

In [4]:
# Parse the whole review file into a list of tuples (see load_ratings for the tuple layout)
file_review = "Movies_and_TV.jsonl"

df_list = load_ratings(file_review)

Load ratings: 17328314it [02:26, 118424.10it/s]


In [5]:
# NOTE(review): load_ratings yields tuples ordered (user, item, rating, timestamp, text),
# so with these labels the "parent_asin" column holds user IDs and "user_id" holds item IDs.
# Later cells rely on this labelling, so swapping the first two names here requires
# updating those cells as well.
df_review = pd.DataFrame(df_list,columns=["parent_asin","user_id","rating","timestamp","text"])
df_review.head()

Unnamed: 0,parent_asin,user_id,rating,timestamp,text
0,AGGZ357AO26RQZVRLGU4D4N52DZQ,B013488XFS,5.0,1440385637000,"Amazon, please buy the show! I'm hooked!"
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00CB6VTDS,5.0,1461100610000,My Kiddos LOVE this show!!
2,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B096Z8Z3R6,3.0,1646271834582,Annabella Sciorra did her character justice wi...
3,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B09M14D9FZ,4.0,1645937761864,...there should be more of a range of characte...
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B001H1SVZC,5.0,1590639227074,"...isn't always how you expect it to be, but w..."


In [6]:
# (rows, columns) of the review table
df_review.shape

(17328314, 5)

In [7]:
file_product = "meta_Movies_and_TV.jsonl"
# Peek at the first metadata record only, to see which fields a product carries.
# The encoding is pinned so the read does not depend on the platform's default codec.
with open(file_product, 'r', encoding='utf-8') as fp:
    for line in fp:
        print(json.loads(line.strip()))
        break


{'main_category': 'Prime Video', 'title': 'Glee', 'subtitle': 'UnentitledUnentitled', 'average_rating': 4.7, 'rating_number': 2004, 'features': ['IMDb 6.8', '2013', '22 episodes', 'X-Ray', 'TV-14'], 'description': ['Entering its fourth season, this year the members of New Directions compete amongst themselves to be the "new Rachel" and hold auditions to find new students. Meanwhile, the graduating class leaves the comforts of McKinley where Rachel struggles to please her demanding NYADA teacher (Kate Hudson) and Kurt second-guesses his decision to stay in Lima. Four newcomers also join the musical comedy.'], 'price': 22.39, 'images': [{'360w': 'https://images-na.ssl-images-amazon.com/images/S/pv-target-images/8251ee0b9f888d262cd817a5f1aee0b29ffed56a4535af898b827292f881e169._RI_SX360_FMwebp_.jpg', '480w': 'https://images-na.ssl-images-amazon.com/images/S/pv-target-images/8251ee0b9f888d262cd817a5f1aee0b29ffed56a4535af898b827292f881e169._RI_SX480_FMwebp_.jpg', '720w': 'https://images-na.s

In [8]:
def load_meta(file):
    """Read a product-metadata JSONL file into a list of item tuples.

    Parameters
    ----------
    file : str or path-like
        Path to a file holding one JSON object per line.

    Returns
    -------
    list of tuple
        One ``(parent_asin, main_category, average_rating, rating_number,
        title)`` tuple per kept line, in file order. ``average_rating`` is
        converted to ``float`` and ``rating_number`` to ``int``.

    Records whose ``average_rating`` or ``rating_number`` is ``None`` are
    dropped. Lines that raise ``ValueError`` while being parsed or converted
    (for example invalid JSON) are printed and skipped.
    """
    inters = []
    # Pin the encoding so non-ASCII titles are read the same way on every platform
    with open(file, 'r', encoding='utf-8') as fp:
        for line in tqdm(fp, desc='Load metadata'):
            try:
                dp = json.loads(line.strip())
                item, category, avg_rating, n_ratings, title = dp['parent_asin'], dp['main_category'], dp['average_rating'], dp['rating_number'], dp['title']
                # Items without ratings cannot be converted to numbers below, so leave them out
                if avg_rating is None or n_ratings is None:
                    continue
                inters.append((item, category, float(avg_rating), int(n_ratings), title))
            except ValueError:
                # Show the offending line and carry on with the rest of the file
                print(line)
    return inters

In [9]:
# Parse the whole metadata file into a list of tuples (see load_meta for the tuple layout)
file_product = "meta_Movies_and_TV.jsonl"
df_list2 = load_meta(file_product)

Load ratings: 748224it [00:17, 43269.06it/s]


In [10]:
# Column names follow the order of the tuples built in load_meta
df_product = pd.DataFrame(df_list2,columns=["parent_asin","main_category","average_rating","rating_number","title"])
df_product.head()

Unnamed: 0,parent_asin,main_category,average_rating,rating_number,title
0,B00ABWKL3I,Prime Video,4.7,2004,Glee
1,B09WDLJ4HP,Prime Video,3.0,6,One Perfect Wedding
2,B00AHN851G,Movies & TV,5.0,7,How to Make Animatronic Characters - Organic M...
3,B01G9ILXXE,Prime Video,4.3,35,Ode to Joy: Beethoven's Symphony No. 9
4,B009SIYXDA,Prime Video,4.7,360,Ben 10: Alien Force (Classic)


In [11]:
# (rows, columns) of the product table; items without ratings were already dropped by load_meta
df_product.shape

(747978, 5)

In [12]:
# Inner-join each review with the metadata of the reviewed item.
# NOTE(review): the left key is 'user_id' because, in df_review, that column actually
# holds the item IDs while 'parent_asin' holds the reviewer IDs. In the result,
# 'parent_asin_x' is therefore the reviewer and 'parent_asin_y' the item.
df_merged = df_review.merge(df_product, left_on='user_id', right_on='parent_asin', how='inner')
df_merged.head()

Unnamed: 0,parent_asin_x,user_id,rating,timestamp,text,parent_asin_y,main_category,average_rating,rating_number,title
0,AGGZ357AO26RQZVRLGU4D4N52DZQ,B013488XFS,5.0,1440385637000,"Amazon, please buy the show! I'm hooked!",B013488XFS,Prime Video,4.6,56658,Sneaky Pete
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00CB6VTDS,5.0,1461100610000,My Kiddos LOVE this show!!,B00CB6VTDS,Prime Video,4.8,6403,Creative Galaxy
2,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B096Z8Z3R6,3.0,1646271834582,Annabella Sciorra did her character justice wi...,B096Z8Z3R6,Prime Video,3.9,182,
3,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B09M14D9FZ,4.0,1645937761864,...there should be more of a range of characte...,B09M14D9FZ,Prime Video,4.8,533,
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B001H1SVZC,5.0,1590639227074,"...isn't always how you expect it to be, but w...",B001H1SVZC,Prime Video,4.5,389,


In [13]:
# (rows, columns) after the inner join
df_merged.shape

(17327788, 10)

In [14]:
# Number of distinct items left after the merge ('parent_asin_y' comes from df_product)
df_merged["parent_asin_y"].nunique()

747533

In [17]:
# Number of distinct reviewers: despite its name, 'parent_asin_x' holds the reviewer IDs
df_merged["parent_asin_x"].nunique()

6503267

Testing how building item pairs from each reviewer's reviews could work, starting with a single reviewer.

In [18]:
# All merged rows of one reviewer ('parent_asin_x' holds the reviewer IDs, despite its name)
df_merged[df_merged["parent_asin_x"] == "AG2L7H23R5LLKDKLBEF2Q3L2MVDA"]

Unnamed: 0,parent_asin_x,user_id,rating,timestamp,text,parent_asin_y,main_category,average_rating,rating_number,title
2,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B096Z8Z3R6,3.0,1646271834582,Annabella Sciorra did her character justice wi...,B096Z8Z3R6,Prime Video,3.9,182,
3,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B09M14D9FZ,4.0,1645937761864,...there should be more of a range of characte...,B09M14D9FZ,Prime Video,4.8,533,
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B001H1SVZC,5.0,1590639227074,"...isn't always how you expect it to be, but w...",B001H1SVZC,Prime Video,4.5,389,
5,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B06WVW16WY,5.0,1586999747540,As you learn about the very unique characters ...,B06WVW16WY,Prime Video,4.8,1966,


In [28]:
# Distinct reviewer IDs (stored in the 'parent_asin_x' column)
users = set(df_merged["parent_asin_x"])

In [None]:
# Count, for every unordered pair of items, how many reviewers reviewed both.
# NOTE: in df_merged the reviewer IDs sit in "parent_asin_x" and the item IDs in "user_id".
# On the full dataset this produces a very large number of distinct pairs, so it needs
# a lot of memory.
coauthor_count = Counter()  # created once, so counts accumulate across reviewers
# A single groupby pass replaces scanning the whole frame once per reviewer
for _, movies in df_merged.groupby("parent_asin_x")["user_id"]:
    # Drop repeat reviews of the same item and sort, so each pair has one canonical (a, b) form
    unique_movies = sorted(set(movies))
    if len(unique_movies) <= 1:
        continue
    # Passing the iterator of pairs makes the Counter count whole pairs;
    # passing one pair at a time would count its two item IDs separately.
    coauthor_count.update(combinations(unique_movies, 2))

# One (item_a, item_b, number_of_shared_reviewers) triple per pair, built once at the end
edgelist = [(a, b, count) for (a, b), count in coauthor_count.items()]

[('B00003CXQ7', 'B000056WRH')]
[('B001XUKFAQ', 'B00OCEDVW4')]
[('B004LQEYTC', 'B005VA7M2I'), ('B004LQEYTC', 'B009NE5B1M'), ('B004LQEYTC', 'B009S5VIQI'), ('B004LQEYTC', 'B0190TBYTW'), ('B004LQEYTC', 'B000W5R3QC'), ('B005VA7M2I', 'B009NE5B1M'), ('B005VA7M2I', 'B009S5VIQI'), ('B005VA7M2I', 'B0190TBYTW'), ('B005VA7M2I', 'B000W5R3QC'), ('B009NE5B1M', 'B009S5VIQI'), ('B009NE5B1M', 'B0190TBYTW'), ('B009NE5B1M', 'B000W5R3QC'), ('B009S5VIQI', 'B0190TBYTW'), ('B009S5VIQI', 'B000W5R3QC'), ('B0190TBYTW', 'B000W5R3QC')]
[('B08WLZ6G6M', 'B0748MXPXR')]
[('B00CVB6UXW', '6305729328'), ('B00CVB6UXW', 'B00D49YEHO'), ('B00CVB6UXW', 'B008RJ6TTC'), ('B00CVB6UXW', 'B005CGI4EG'), ('B00CVB6UXW', 'B00AEFYSEA'), ('6305729328', 'B00D49YEHO'), ('6305729328', 'B008RJ6TTC'), ('6305729328', 'B005CGI4EG'), ('6305729328', 'B00AEFYSEA'), ('B00D49YEHO', 'B008RJ6TTC'), ('B00D49YEHO', 'B005CGI4EG'), ('B00D49YEHO', 'B00AEFYSEA'), ('B008RJ6TTC', 'B005CGI4EG'), ('B008RJ6TTC', 'B00AEFYSEA'), ('B005CGI4EG', 'B00AEFYSEA')]
[('B0

KeyboardInterrupt: 

In [None]:
df_merged.groupby("parent_asin_x")["user_id"].count() # number of reviewed items (IDs in "user_id") for each reviewer (IDs in "parent_asin_x")


parent_asin_x
AE2222FRPDMNOMYOMCWIANTXP7UQ     9
AE22236AFRRSMQIKGG7TPTB75QEA    41
AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ     2
AE2224GG4AXIQTTWVJXEU5ME7EIA     1
AE2225ACYIEFQ2MRLRSP2TBJ5PMA     1
                                ..
AHZZZYCUOTRYW4ZQFIFAEZGBOY4A     1
AHZZZZ76NI5YF4RP5TKCQRGRQAGA     1
AHZZZZPE45DYV2WZ2MYXZRHWSEKA     4
AHZZZZRW22XZ3MGJCPO7GBZWJ3HA     1
AHZZZZUZCRIUWGMXKFAJO3T5S45A     1
Name: user_id, Length: 6503267, dtype: int64

In [37]:
# Collect, for every reviewer ID in 'parent_asin_x', the list of values found in 'user_id'
groups = df_merged.groupby("parent_asin_x")["user_id"].agg(list)
groups

KeyboardInterrupt: 

In [None]:
# Keep only the rows of reviewers who have more than one review.
# transform("count") writes each group's size back onto every row of that group, so the
# mask shares df_merged's row index. The plain groupby count is indexed by reviewer ID
# instead and cannot be used as a row mask.
reviews_per_reviewer = df_merged.groupby("parent_asin_x")["user_id"].transform("count")
df = df_merged[reviews_per_reviewer > 1]

In [None]:
# Group by reviewer ('parent_asin_x' holds the reviewer IDs) and collect each reviewer's
# sorted item IDs (stored in 'user_id'). Repeat reviews of the same item are dropped first,
# so that pairing later never yields an item paired with itself or counts a pair twice
# for one reviewer. Only reviewers with >= 2 distinct items are kept.
grouped = (
    df_merged.groupby('parent_asin_x')['user_id']
    .apply(lambda g: sorted(set(g)))
    .loc[lambda x: x.str.len() >= 2]
)

In [None]:
# Lazily stream every 2-item combination of every entry in `grouped` into one Counter
all_pairs = (pair for movies in grouped for pair in combinations(movies, 2))
coauthor_count = Counter(all_pairs)

# Convert to edgelist format (correct unpacking)


MemoryError: 

In [43]:
# Flatten every ((a, b), count) entry into an (a, b, count) weighted-edge triple
edgelist = []
for (a, b), count in coauthor_count.items():
    edgelist.append((a, b, count))

In [45]:
# Build an undirected graph from the (a, b, count) triples; count is passed as the edge weight
Graph = nx.Graph()
Graph.add_weighted_edges_from(edgelist)

In [47]:
# (number of nodes, number of edges) of the resulting graph
Graph.number_of_nodes(), Graph.number_of_edges()

(686859, 187974782)