# Exploring Amazon dataset
Link: https://amazon-reviews-2023.github.io/

In [None]:
import json
import pandas as pd
from tqdm import tqdm
import itertools
import networkx as nx

Starting with "Movies and TV" 

## Extracting the dataset 

In [7]:
file_review = "Movies_and_TV.jsonl" # e.g., "All_Beauty.jsonl", downloaded from the `review` link above
with open(file_review, 'r') as fp:
    for line in fp:
        print(json.loads(line.strip()))
        break


{'rating': 5.0, 'title': 'Five Stars', 'text': "Amazon, please buy the show! I'm hooked!", 'images': [], 'asin': 'B013488XFS', 'parent_asin': 'B013488XFS', 'user_id': 'AGGZ357AO26RQZVRLGU4D4N52DZQ', 'timestamp': 1440385637000, 'helpful_vote': 0, 'verified_purchase': True}


In [None]:
# Function taken from https://github.com/hyp1231/AmazonReviews2023/blob/main/benchmark_scripts/kcore_filtering.py and editted to fit our needs
def load_ratings(file):
    inters = []
    with open(file, 'r') as fp:
        for line in tqdm(fp, desc='Load ratings'):
            try:
                dp = json.loads(line.strip())
                item, user, rating, time, text = dp['parent_asin'], dp['user_id'], dp['rating'], dp['timestamp'], dp['text']
                inters.append((user, item, float(rating), int(time), text))
            except ValueError:
                print(line)
    return inters

In [9]:
file_review = "Movies_and_TV.jsonl"

df_list = load_ratings(file_review)

Load ratings: 17328314it [01:52, 153715.51it/s]


In [10]:
df_review = pd.DataFrame(df_list,columns=["parent_asin","user_id","rating","timestamp","text"])
df_review.head()

Unnamed: 0,parent_asin,user_id,rating,timestamp,text
0,AGGZ357AO26RQZVRLGU4D4N52DZQ,B013488XFS,5.0,1440385637000,"Amazon, please buy the show! I'm hooked!"
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00CB6VTDS,5.0,1461100610000,My Kiddos LOVE this show!!
2,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B096Z8Z3R6,3.0,1646271834582,Annabella Sciorra did her character justice wi...
3,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B09M14D9FZ,4.0,1645937761864,...there should be more of a range of characte...
4,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B001H1SVZC,5.0,1590639227074,"...isn't always how you expect it to be, but w..."


In [None]:
file_product = "meta_Movies_and_TV.jsonl"
with open(file_product, 'r') as fp:
    for line in fp:
        print(json.loads(line.strip()))
        break


{'main_category': 'Prime Video', 'title': 'Glee', 'subtitle': 'UnentitledUnentitled', 'average_rating': 4.7, 'rating_number': 2004, 'features': ['IMDb 6.8', '2013', '22 episodes', 'X-Ray', 'TV-14'], 'description': ['Entering its fourth season, this year the members of New Directions compete amongst themselves to be the "new Rachel" and hold auditions to find new students. Meanwhile, the graduating class leaves the comforts of McKinley where Rachel struggles to please her demanding NYADA teacher (Kate Hudson) and Kurt second-guesses his decision to stay in Lima. Four newcomers also join the musical comedy.'], 'price': 22.39, 'images': [{'360w': 'https://images-na.ssl-images-amazon.com/images/S/pv-target-images/8251ee0b9f888d262cd817a5f1aee0b29ffed56a4535af898b827292f881e169._RI_SX360_FMwebp_.jpg', '480w': 'https://images-na.ssl-images-amazon.com/images/S/pv-target-images/8251ee0b9f888d262cd817a5f1aee0b29ffed56a4535af898b827292f881e169._RI_SX480_FMwebp_.jpg', '720w': 'https://images-na.s

In [None]:
def load_meta(file):
    inters = []
    with open(file, 'r') as fp:
        for line in tqdm(fp, desc='Load ratings'):
            try:
                dp = json.loads(line.strip())
                item, category, avg_rating, n_ratings, title = dp['parent_asin'], dp['main_category'], dp['average_rating'], dp['rating_number'], dp['title'] # changed to relevant types
                if type(avg_rating) == type(None) or type(n_ratings) == type(None): # Removing all items without ratings
                    continue
                inters.append((item, category, float(avg_rating), int(n_ratings), title))
            except ValueError:
                print(line)
    return inters

In [28]:
file_product = "meta_Movies_and_TV.jsonl"
df_list2 = load_meta(file_product)

Load ratings: 748224it [00:11, 65214.77it/s]


In [32]:
df_product = pd.DataFrame(df_list2,columns=["parent_asin","main_category","average_rating","rating_number","title"])
df_product.head()

Unnamed: 0,parent_asin,main_category,average_rating,rating_number,title
0,B00ABWKL3I,Prime Video,4.7,2004,Glee
1,B09WDLJ4HP,Prime Video,3.0,6,One Perfect Wedding
2,B00AHN851G,Movies & TV,5.0,7,How to Make Animatronic Characters - Organic M...
3,B01G9ILXXE,Prime Video,4.3,35,Ode to Joy: Beethoven's Symphony No. 9
4,B009SIYXDA,Prime Video,4.7,360,Ben 10: Alien Force (Classic)


In [29]:
df_product.shape

(747978, 5)

## Constructing a Graph

In [None]:
G = nx.Graph()
movies = df_product['parent_asin'].unique()
# movies
G.add_nodes_from(movies)


In [None]:
# Create a dictionary to map movies to their reviewers
movie_reviewers = df_review.groupby('parent_asin')['user_id']\
                           .apply(lambda x: list(set(x)))\
                           .to_dict()
# Dictionary with reviewer as key and a set of 
movie_reviewers

{'AE2222FRPDMNOMYOMCWIANTXP7UQ': ['B071GW3JJP',
  'B01J2K3E8I',
  'B005LAIGYQ',
  'B00BEIYHT2',
  'B07KD6NDMX',
  'B01CRFKWXK',
  'B0094K20FK',
  'B0091X4AP8',
  'B017S3OP7A'],
 'AE22236AFRRSMQIKGG7TPTB75QEA': ['B003F3NDWG',
  'B001CTDH76',
  'B000CDGVOE',
  'B000M2E340',
  'B000Y5JFNE',
  'B0000648XE',
  'B002VVHMS4',
  'B004NS710C',
  'B002JVWRDG',
  'B004QYXA7G',
  'B000E6EK3S',
  'B002NU67U2',
  'B003LZ5B7S',
  'B009AMANH4',
  'B003Q6CX5A',
  'B005LAII0I',
  'B004W5T2RG',
  'B0027CT65E',
  'B002NHBNWW',
  'B0006FO9B0',
  'B002WGMZ62',
  'B00164GDD2',
  'B0006B2A2O',
  'B0053ZOQMK',
  'B001KZIRM2',
  '0800195175',
  'B002RHL8DC',
  'B002VPE1B6',
  'B006M0MKHI',
  'B001DPHDI8',
  'B001XRLWPQ',
  'B0006IIPH6',
  'B002ZCY7QY',
  'B0031DDG90',
  'B004Z2TUDW',
  'B00003CXC5',
  'B002RHI4T8',
  'B002U386JM',
  'B00CB6SU5I',
  'B00000K3D4',
  'B0024NSFYY'],
 'AE2224D3S4GTKVFJ5V7ZRQJ7P4FQ': ['B00K0Y8HLK', 'B003A8FEVO'],
 'AE2224GG4AXIQTTWVJXEU5ME7EIA': ['B01J94A5GQ'],
 'AE2225ACYIEFQ2MRLRSP

In [None]:
movies_filtered = [movie for movie in movies if movie in movie_reviewers]

for movie1, movie2 in itertools.combinations(movies_filtered, 2):
    # Get the common reviewers by taking the intersection.
    common_reviewers = movie_reviewers[movie1].intersection(movie_reviewers[movie2])
    weight = len(common_reviewers)
    if weight > 0:
        G.add_edge(movie1, movie2, weight=weight)


Something wrong - I am working on it

In [50]:
G.number_of_nodes(), G.number_of_edges()

(747978, 0)