In [1]:
import os
import numpy as np
import pandas as pd
import csv
import re
import sys

In [5]:
# Path of the raw metadata dump. The components are passed separately because
# "\a" inside a normal string literal is the BEL control character, so the old
# literal "datasets\amazon-meta.txt" did not spell the intended file name, and
# a hard-coded backslash separator only works on Windows.
meta_text_path = os.path.join(os.getcwd(), "datasets", "amazon-meta.txt")

In [6]:
# Open the metadata dump as UTF-8 text. The handle is not closed in this cell;
# the following cell rewinds it with seek(0) and reads from it again.
# NOTE(review): nothing in the visible notebook ever closes this handle.
meta_file = open(meta_text_path, "r", encoding="utf8")
# Whole file as a list with one entry per line.
meta_lines = meta_file.readlines()

In [11]:
# Rewind to the start of the file before reading it again, this time as a
# single string that the regex-based parsing below slices into.
meta_file.seek(0)
meta_data = meta_file.read()

In [12]:
# (start, end) character offsets of every "Id:   <digits>" header in meta_data.
# The pattern is a raw string so that "\d" reaches the regex engine without
# Python reporting an invalid escape sequence.
all_ids = [m.span() for m in re.finditer(r"Id:   \d+", meta_data)]

In [19]:
def getIDnumber(l):
    """Extract the numeric id from an ``Id:   <digits>`` header line.

    Args:
        l: Text to search; the label must be followed by exactly three spaces.

    Returns:
        The digits as a string, or ``'-1'`` when the pattern is not present.
    """
    # Search once and reuse the match instead of running the regex twice.
    match = re.search(r"Id:   (\d+)", l)
    return match.group(1) if match else '-1'

def getASINnumber(l):
    """Extract the ASIN from an ``ASIN: <word>`` line.

    Args:
        l: Text to search; the label must be followed by a single space.

    Returns:
        The run of word characters after the label as a string, or ``'-1'``
        when the pattern is not present.
    """
    # Search once and reuse the match instead of running the regex twice.
    match = re.search(r"ASIN: (\w+)", l)
    return match.group(1) if match else '-1'

def getTitle(id_content, start, end):
    """Extract the product title from ``id_content[start:end]``.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``title:`` label begins.
        end: Offset at which the next label begins.

    Returns:
        The matched title with every comma replaced by a space, or ``'-1'``
        when no title is found. Because ``\\s`` is part of the accepted
        character class, the result can carry trailing whitespace and
        newlines; callers are expected to strip it.

    NOTE(review): the match stops at the first character outside the class
    (for example an apostrophe, a period or a slash), so such titles come back
    truncated. The class is left unchanged here because the CSV export writes
    the title unquoted.
    """
    pattern = re.compile(r"title: ([A-Za-z0-9:\s\_\-\;\,\?\!\(\)\&]+)")
    data = id_content[start: end].strip("\n")
    # Search once and reuse the match instead of running the regex twice.
    match = pattern.search(data)
    if match is None:
        return '-1'
    # Commas are removed so the title cannot split a CSV row.
    return match.group(1).replace(",", " ")

def getGroup(id_content, start, end):
    """Extract the product group from ``id_content[start:end]``.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``group:`` label begins.
        end: Offset at which the next label begins.

    Returns:
        The first run of word characters after ``group:`` and at least one
        whitespace character, or ``'-1'`` when there is no match.
    """
    pattern = re.compile(r"group:\s+(\w+)")
    data = id_content[start: end].strip("\n")
    # Search once and reuse the match instead of running the regex twice.
    match = pattern.search(data)
    return match.group(1) if match else '-1'

def getSalesrank(id_content, start, end):
    """Extract the sales rank from ``id_content[start:end]``.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``salesrank:`` label begins.
        end: Offset at which the next label begins.

    Returns:
        The run of word characters after ``salesrank: `` as a string, or
        ``'-1'`` when there is no match. A value with a leading minus sign
        does not match ``\\w+`` and therefore also yields ``'-1'``.
    """
    pattern = re.compile(r"salesrank: (\w+)")
    data = id_content[start: end].strip("\n")
    # Search once and reuse the match instead of running the regex twice.
    match = pattern.search(data)
    return match.group(1) if match else '-1'

def getSimilar(id_content, start, end):
    """List the ASINs on the ``similar:`` line of a product record.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``similar:`` label begins.
        end: Offset at which the next label begins.

    Returns:
        Every word token in the slice after the first two, which are the
        label itself and the count that follows it. An empty slice yields
        an empty list.
    """
    data = id_content[start: end].strip("\n")
    if not data:
        return []
    return re.findall(r"\w+", data)[2:]

def getCategories(id_content, start, end):
    """Summarise the ``categories:`` section of a product record.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``categories:`` label begins.
        end: Offset at which the next label begins.

    Returns:
        A ``(count, names)`` tuple. ``count`` is the integer that follows the
        label. ``names`` is the sorted list of distinct purely alphabetic
        word tokens after it; numeric ids are dropped, and multi-word
        category names are split into individual words by the ``\\w+``
        tokenisation.

    Raises:
        IndexError: If the slice holds fewer than two word tokens.
        ValueError: If the token after the label is not an integer.
    """
    data = id_content[start: end].strip("\n")
    tokens = re.findall(r"\w+", data)
    num_cats = int(tokens[1])
    # Sorted so the result is the same on every run; iterating a set of
    # strings directly gives an order that varies between interpreter runs.
    names = sorted({token for token in tokens[2:] if token.isalpha()})
    return num_cats, names

def getReviews(id_content, start, end, get_rating=True):
    """Parse the ``reviews:`` section of a product record.

    Args:
        id_content: Full text of one product record.
        start: Offset at which the ``reviews:`` label begins.
        end: Offset at which the section ends.
        get_rating: When true, return only the summary; otherwise return the
            individual review rows.

    Returns:
        With ``get_rating`` true: ``[parsed_review_count, downloaded, avg]``,
        where the count is the number of review lines parsed and the other
        two are the strings captured from the summary line.
        Otherwise: a list of
        ``[date, customer_id, rating, votes, helpful, downloaded, avg]`` rows,
        all strings.

    Raises:
        ValueError: If the summary line is missing from the slice.
        IndexError: If a non-blank review line has fewer word tokens than the
            expected layout.
    """
    data = id_content[start: end].strip("\n")
    summary = re.search(
        r"reviews: total: (\d+)  downloaded: (\d+)  avg rating: ([0-9\.]+)", data
    )
    if summary is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("reviews summary line not found")
    _total, down, avg = summary.groups()

    usr_revs = []
    # The first line is the summary; each remaining line is one review.
    for rev in data.split("\n")[1:]:
        cnts = re.findall(r'\w+', rev)
        if not cnts:
            # Blank or whitespace-only line: nothing to parse.
            continue
        # Tokens 0-2 are the date parts; the values sit at the odd positions
        # between their labels (customer, rating, votes, helpful).
        date = f"{cnts[0]}-{cnts[1]}-{cnts[2]}"
        customer_id, rating, votes, helpful = cnts[4], cnts[6], cnts[8], cnts[10]
        usr_revs.append([date, customer_id, rating, votes, helpful, down, avg])

    if get_rating:
        return [len(usr_revs), down, avg]
    return usr_revs



In [21]:
def parse_id(id_content):
    """Parse the text of a single product record.

    Args:
        id_content: Record text whose first three lines are read as the id
            line, the ASIN line and the line checked for the
            "discontinued product" marker.

    Returns:
        ``[False, id, asin]`` when the record is discontinued or one of the
        section labels is missing; otherwise
        ``[True, id, asin, title, group, salesrank, similar, categories, reviews]``.
    """
    lines = id_content.split("\n")
    id_number = getIDnumber(lines[0])
    asin_number = getASINnumber(lines[1])

    # Discontinued entries are returned early with only the id and ASIN.
    if "discontinued product" in lines[2]:
        return [False, id_number, asin_number]

    # Section labels in the order they are looked up; each section is the
    # slice from its own label to the next one.
    labels = ("title:", "group:", "salesrank:", "similar:", "categories:", "reviews:")
    try:
        offsets = [id_content.index(label) for label in labels]
    except ValueError as e:
        print(f"Issue with Id: {id_number}, error: {e}")
        return [False, id_number, asin_number]

    title_at, group_at, srank_at, simil_at, categ_at, revie_at = offsets

    return [
        True,
        id_number,
        asin_number,
        getTitle(id_content, title_at, group_at),
        getGroup(id_content, group_at, srank_at),
        getSalesrank(id_content, srank_at, simil_at),
        getSimilar(id_content, simil_at, categ_at),
        getCategories(id_content, categ_at, revie_at),
        getReviews(id_content, revie_at, len(id_content), get_rating=True),
    ]


In [22]:
# Number of "Id:" headers found, i.e. one past the largest valid index into all_ids.
last_idx = len(all_ids)

In [23]:
from collections import defaultdict

# ASIN -> ASINs listed on that product's "similar" line.
product_copurchase_adj_list = defaultdict(list)
# Id (string) -> ASIN for every parsed record, including the ones for which
# parse_id reports failure.
id_asin_map = defaultdict(str)

# Write into the "datasets" directory: the loading cells further down read
# products_data.csv from there, not from the working directory.
products_csv_path = os.path.join(os.getcwd(), "datasets", "products_data.csv")

with open(products_csv_path, "w") as outfile:
    outfile.write("id,asin,title,group,salesrank,review_cnt,downloads,rating\n")
    num_records = len(all_ids)
    # Visit every header, including the final one. The previous
    # range(len(all_ids) - 1) stopped one short, so the last record was never
    # exported and its end-of-file branch could not be reached.
    for id_idx in range(num_records):
        record_start, header_end = all_ids[id_idx]
        # A record runs up to the next header; the last one runs to end of file.
        if id_idx + 1 < num_records:
            record_end = all_ids[id_idx + 1][0]
        else:
            record_end = len(meta_data)
        id_text = meta_data[record_start:record_end]
        # Id taken from the matched "Id:   <n>" header, so the error message
        # below names the current record even when parse_id itself raises.
        record_id = meta_data[record_start:header_end].split()[-1]

        if id_idx % 1000 == 0:
            print(id_idx, end=" ")

        try:
            parsed_data = parse_id(id_text)
            id_asin_map[parsed_data[1]] = parsed_data[2]

            if parsed_data[0]:
                parsed_data[3] = parsed_data[3].strip()
                id_row = f"{parsed_data[1]},{parsed_data[2]},{parsed_data[3]},{parsed_data[4]},{parsed_data[5]},{parsed_data[8][0]},{parsed_data[8][1]},{parsed_data[8][2]}\n"
                outfile.write(id_row)
                product_copurchase_adj_list[parsed_data[2]].extend(parsed_data[6])
        except Exception as e:
            # Best effort: report the record and carry on with the next one.
            print(f"Error with Id: {record_id}, {e}")


0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 92000 93000 94000 95000 96000 97000 98000 99000 100000 101000 102000 103000 104000 105000 106000 107000 108000 109000 110000 111000 112000 113000 114000 115000 116000 117000 118000 119000 120000 121000 122000 123000 124000 125000 126000 127000 128000 129000 130000 131000 132000 133000 134000 135000 136000 137000 138000 139000 140000 141000 142000 143000 144000 145000 146000 147000 148000 149000 150000 151000 152000 153000 154000 155000 156000 157000 158000 

In [24]:
# Reverse lookup ASIN -> id built from id_asin_map. When several ids share an
# ASIN value, the one that comes last in iteration order is kept.
asin_id_map = {asin: product_id for product_id, asin in id_asin_map.items()}

In [25]:
# Write into the "datasets" directory: the loading cells further down read
# products_copurchases_links.csv from there, not from the working directory.
links_csv_path = os.path.join(os.getcwd(), "datasets", "products_copurchases_links.csv")

with open(links_csv_path, "w") as outfile:
    outfile.write("source,destination\n")
    num_errors = 0

    # One "source,destination" row of product ids per co-purchase link.
    for source, neighbors in product_copurchase_adj_list.items():
        try:
            source_id = asin_id_map.get(source)
            if source_id is None:
                # Source ASIN has no known id: skip all of its links.
                continue

            for neighbor in neighbors:
                neighbor_id = asin_id_map.get(neighbor)
                # Links to ASINs that have no known id are dropped.
                if neighbor_id is not None:
                    outfile.write(f"{source_id},{neighbor_id}\n")
        except Exception as e:
            # Best effort: count the failure and continue with the next source.
            print(f"Error processing source {source}: {e}")
            num_errors += 1

    print(f"{num_errors} errors encountered during processing.")
    print("Done")

0 errors encountered during processing.
Done


In [26]:
# Spot check: ASIN stored for product id '335158'.
# id_asin_map is a defaultdict(str), so subscripting an id that is not present
# would insert it with an empty string instead of raising KeyError.
id_asin_map['335158']

'B00000ADJO'

In [27]:
# Directory named "datasets" under the current working directory.
dataset_path = os.path.join(os.getcwd(), "datasets")
# NOTE(review): this prints an absolute local path into the saved notebook output.
print(dataset_path)

/Users/rana/Documents/School/Spring23/Social Media Analytics-CS5664/Homeworks/Project/datasets


In [28]:
# List the files currently present in the datasets directory.
os.listdir(dataset_path)

['com-amazon.all.dedup.cmty.txt',
 '.DS_Store',
 'amazon-meta.txt.gz',
 'amazon-meta.txt',
 'com-amazon.top5000.cmty.txt',
 'Amazon0302.txt',
 'com-amazon.ungraph.txt']

In [29]:
# Paths under dataset_path from which the product table and the co-purchase
# edge list are loaded in the cells below.
prod_path = os.path.join(dataset_path, "products_data.csv")
prod_graph_path = os.path.join(dataset_path, "products_copurchases_links.csv")

In [31]:
# Display the resolved product CSV path.
prod_path

'/Users/rana/Documents/School/Spring23/Social Media Analytics-CS5664/Homeworks/Project/datasets/products_data.csv'

In [33]:
# Display the resolved co-purchase links CSV path.
prod_graph_path

'/Users/rana/Documents/School/Spring23/Social Media Analytics-CS5664/Homeworks/Project/datasets/products_copurchases_links.csv'

In [36]:
# Load the product table from prod_path.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file; confirm this is the intended decoding for this CSV.
df = pd.read_csv(prod_path, encoding = 'unicode_escape')
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,asin,title,group,salesrank,review_cnt,downloads,rating
0,1,827229534,Patterns of Preaching: A Sermon Sampler,Book,396585,2,2,5.0
1,2,738700797,Candlemas: Feast of Flames,Book,168596,12,12,4.5
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652,1,1,5.0
3,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,Book,631289,1,1,4.0
4,5,1577943082,Prayers That Avail Much for Business: Executive,Book,455160,0,0,0.0


In [37]:
# Load the co-purchase edge list (columns: source, destination) from prod_graph_path.
network_df = pd.read_csv(prod_graph_path, encoding = 'unicode_escape')
# Preview the first five edges.
network_df.head(5)

Unnamed: 0,source,destination
0,1,161555
1,1,244916
2,1,118052
3,1,444232
4,1,500600


In [38]:
# (rows, columns) of the product table.
df.shape

(542683, 8)

In [39]:
# Distinct values of the "group" column.
# NOTE(review): the recorded output includes a one-off group 'A', which may
# point at a mis-parsed record — verify against the raw data.
df["group"].unique()

array(['Book', 'Music', 'DVD', 'Video', 'Toy', 'Software', 'Baby', 'CE',
       'Sports', 'A'], dtype=object)

In [40]:
# Number of products in each group, most frequent first.
df["group"].value_counts()

Book        393560
Music       103143
Video        26132
DVD          19828
Toy              8
Software         5
CE               4
A                1
Sports           1
Baby             1
Name: group, dtype: int64