The goal of this notebook is to parse amazon-meta.txt and generate products_data.csv and products_copurchases_links.csv.

In [68]:
import os
import numpy as np
import pandas as pd
import csv
import re
import sys

In [2]:
# meta_text_path = r"C:\Project_Files\VT\SecondSem\SMA\Project\datasets\amazon-meta.txt\amazon-meta.txt"

In [72]:
# Build the dataset path from separate components so the separator is correct
# on every OS; a literal "..\amazon-meta.txt" only resolves on Windows.
meta_text_path = os.path.join(os.getcwd(), os.pardir, "amazon-meta.txt")

In [3]:
# Open the raw metadata dump as UTF-8 text and materialise it line by line.
# Iterating the handle leaves the stream positioned at end-of-file.
meta_file = open(file=meta_text_path, mode="r", encoding="utf8")
meta_lines = list(meta_file)

In [4]:
# Rewind to the start of the stream and load the whole file as a single string.
meta_file.seek(0)
meta_data = meta_file.read()
# No later cell shown here reads from the handle, so release it instead of
# leaving the file open for the lifetime of the kernel.
meta_file.close()

In [5]:
# (start, end) character offsets in meta_data of every "Id:   <digits>" header.
# Raw string: "\d" in a plain literal is an invalid escape sequence.
all_ids = [(m.start(0), m.end(0)) for m in re.finditer(r"Id:   \d+", meta_data)]

In [6]:
def get_id_number(line):
    """Extract the product id from an ``Id:   <digits>`` line.

    Args:
        line: Text expected to contain ``Id:`` followed by three spaces and digits.

    Returns:
        The id digits as a string, or ``'-1'`` when no such field is found.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    m = re.search(r"Id:   (\d+)", line)
    return m.group(1) if m else '-1'

In [7]:
def get_asin_number(line):
    """Extract the ASIN from an ``ASIN: <token>`` line.

    Args:
        line: Text expected to contain ``ASIN: `` followed by a word token.

    Returns:
        The ASIN token as a string, or ``'-1'`` when no such field is found.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    m = re.search(r"ASIN: (\w+)", line)
    return m.group(1) if m else '-1'

In [71]:
def get_title(id_content, start, end):
    """Return the product title found in ``id_content[start:end]``.

    The previous whitelist character class stopped at the first character it
    did not list (``.``, ``'``, ``*``, ``/`` ...), silently truncating titles,
    and returned ``'-1'`` when a title started with such a character. The
    whole remainder of the ``title:`` line is captured instead.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``title: `` tag within ``id_content``.
        end: Offset where the title section ends (start of the next field).

    Returns:
        The title with commas replaced by spaces (so it can be placed in a
        comma-separated line) and surrounding whitespace removed, or ``'-1'``
        when the slice contains no ``title: `` tag.
    """
    data = id_content[start: end].strip("\n")
    # "." does not match a newline, so the capture stops at the end of the line.
    m = re.search(r"title: (.*)", data)
    if m:
        return m.group(1).replace(",", " ").strip()
    return '-1'

In [9]:
def get_group(id_content, start, end):
    """Return the product group found in ``id_content[start:end]``.

    The whole remainder of the ``group:`` line is captured, so a multi-word
    group name is returned intact instead of being cut at its first word.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``group: `` tag within ``id_content``.
        end: Offset where the group section ends (start of the next field).

    Returns:
        The group name without trailing whitespace, or ``'-1'`` when the slice
        has no ``group:`` tag followed by a value on the same line.
    """
    data = id_content[start: end].strip("\n")
    # Require a non-blank first character so an empty value is reported as missing.
    m = re.search(r"group:[ \t]+(\S.*)", data)
    if m:
        return m.group(1).rstrip()
    return '-1'

In [10]:
def get_salesrank(id_content, start, end):
    """Return the sales rank found in ``id_content[start:end]``.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``salesrank: `` tag within ``id_content``.
        end: Offset where the sales-rank section ends (start of the next field).

    Returns:
        The word token following ``salesrank: `` as a string, or ``'-1'`` when
        the slice has no such token.
    """
    data = id_content[start: end].strip("\n")
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    m = re.search(r"salesrank: (\w+)", data)
    return m.group(1) if m else '-1'

In [11]:
def get_similar(id_content, start, end):
    """Return the tokens listed after the count in the ``similar:`` section.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``similar: `` tag within ``id_content``.
        end: Offset where the section ends (start of the next field).

    Returns:
        A list of the word tokens in the slice, excluding the first two
        (the ``similar`` tag itself and the count that follows it).
    """
    data = id_content[start: end].strip("\n")
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    tokens = re.findall(r"(\w+)", data)
    return tokens[2:]

In [12]:
def get_categories(id_content, start, end):
    """Return the category count and the distinct category words of a record.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``categories: `` tag within ``id_content``.
        end: Offset where the section ends (start of the next field).

    Returns:
        A tuple ``(num_categories, words)``: the integer that follows the tag,
        and the distinct purely alphabetic word tokens that come after it.
        The words are sorted so the result is the same on every run (the
        iteration order of a set of strings is not stable across interpreter
        sessions).

    Raises:
        IndexError: If the slice holds fewer than two word tokens.
        ValueError: If the token after the tag is not an integer.
    """
    data = id_content[start: end].strip("\n")
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    tokens = re.findall(r"(\w+)", data)
    num_cats = tokens[1]
    # Numeric category ids and mixed alphanumeric tokens are dropped by isalpha().
    words = {token for token in tokens[2:] if token.isalpha()}
    return int(num_cats), sorted(words)

In [27]:
def get_reviews(id_content, start, end, get_rating=True):
    """Parse the ``reviews:`` section of a product record.

    Args:
        id_content: Full text of one product record.
        start: Offset of the ``reviews: `` tag within ``id_content``.
        end: Offset where the section ends.
        get_rating: When truthy, return only the summary; otherwise return
            one row per review line.

    Returns:
        If ``get_rating`` is truthy: ``[parsed_review_count, downloaded, avg]``
        where the last two are the strings captured from the header line.
        Otherwise a list of
        ``[date, customer_id, rating, votes, helpful, downloaded, avg]`` rows,
        all values being strings.

    Raises:
        ValueError: If the header line or a non-blank review line does not
            have the expected layout.
    """
    data = id_content[start: end].strip("\n")
    header = re.search(
        r"reviews: total: (\d+)  downloaded: (\d+)  avg rating: ([0-9.]+)", data
    )
    if header is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError("reviews header line not found or malformed")
    _total, down, avg = header.groups()

    usr_revs = []
    # The first line is the header; every following line is one review.
    for rev in data.split("\n")[1:]:
        if not rev.strip():
            # Tolerate whitespace-only lines instead of failing the whole record.
            continue
        cnts = re.findall(r"(\w+)", rev)
        if len(cnts) < 11:
            raise ValueError(f"malformed review line: {rev!r}")
        # Token layout: year, month, day, label, customer, label, rating,
        # label, votes, label, helpful.
        date = f"{cnts[0]}-{cnts[1]}-{cnts[2]}"
        customer_id = cnts[4]
        rating = cnts[6]
        votes = cnts[8]
        helpful = cnts[10]
        usr_revs.append([date, customer_id, rating, votes, helpful, down, avg])

    if get_rating:
        return [len(usr_revs), down, avg]
    return usr_revs

In [25]:
def parse_id(id_content):
    """Parse the text of one product record into a flat list.

    Args:
        id_content: Raw text of a single record. Line 0 is passed to
            ``get_id_number``, line 1 to ``get_asin_number``, and line 2 is
            checked for the "discontinued product" marker.

    Returns:
        ``[False, id_number, asin_number]`` when the product is discontinued
        or one of the expected fields is missing (a message is printed in the
        latter case); otherwise
        ``[True, id_number, asin_number, title, group, salesrank, similar,
        categories, reviews]`` with the last six values produced by the
        matching ``get_*`` helper (``reviews`` in its summary form).
    """
    lines = id_content.split("\n")
    id_number = get_id_number(lines[0])
    # Guard the indexes so a truncated record is reported, not an IndexError.
    asin_number = get_asin_number(lines[1]) if len(lines) > 1 else '-1'

    # check if product is discontinued
    if len(lines) > 2 and "discontinued product" in lines[2]:
        return [False, id_number, asin_number]

    tags = ["title", "group", "salesrank", "similar", "categories", "reviews"]
    starts = []
    search_from = 0
    for tag in tags:
        # Only accept a tag that opens its own line and comes after the
        # previous field; searching the whole text from offset 0 would match
        # tag-like text inside a title (e.g. "... group: ...").
        pattern = re.compile(rf"^[ \t]*({tag}: )", re.MULTILINE)
        m = pattern.search(id_content, search_from)
        if m is None:
            print(f"Issue with Id: {id_number}, error: missing '{tag}: ' field")
            return [False, id_number, asin_number]
        # Group 1 starts at the tag itself, i.e. after any leading indentation.
        starts.append(m.start(1))
        search_from = m.end()

    title_index, group_index, srank_index, simil_index, categ_index, revie_index = starts

    # Each field spans from its own tag up to the next tag.
    title = get_title(id_content, title_index, group_index)
    group = get_group(id_content, group_index, srank_index)
    salesrank = get_salesrank(id_content, srank_index, simil_index)
    similar = get_similar(id_content, simil_index, categ_index)
    categories = get_categories(id_content, categ_index, revie_index)
    reviews = get_reviews(id_content, revie_index, len(id_content), get_rating=True)

    return [True, id_number, asin_number, title, group, salesrank, similar, categories, reviews]
            

In [21]:
# Index of the final record header in all_ids (the record count minus one).
last_idx = len(all_ids) - 1

In [70]:
# Parse every product record, write one row per active product to
# products_data.csv, and collect the ASIN-level co-purchase adjacency list.
from collections import defaultdict

product_copurchase_adj_list = defaultdict(list)
id_asin_map = defaultdict(str)

# newline="" as recommended for the csv module; the input is read as UTF-8, so
# the output is written as UTF-8 too rather than the platform default.
with open("products_data.csv", "w", encoding="utf8", newline="") as outfile:
    # csv.writer quotes any field that needs it instead of emitting a broken row.
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(["id", "asin", "title", "group", "salesrank", "review_cnt", "downloads", "rating"])

    num_records = len(all_ids)
    # Iterate over ALL records: the last one runs to the end of the file.
    # (range(len(all_ids) - 1) silently dropped the final product.)
    for id_idx in range(num_records):
        record_start = all_ids[id_idx][0]
        if id_idx + 1 < num_records:
            record_end = all_ids[id_idx + 1][0]
        else:
            record_end = len(meta_data)
        id_text = meta_data[record_start:record_end]

        # Lightweight progress indicator.
        if id_idx % 1000 == 0:
            print(id_idx, end=" ")

        try:
            s2d = parse_id(id_text)
            # Recorded for every record, including discontinued products.
            id_asin_map[s2d[1]] = s2d[2]

            if s2d[0]:
                _, prod_id, asin, title, group, salesrank, similar, _categories, reviews = s2d
                review_cnt, downloads, rating = reviews
                writer.writerow(
                    [prod_id, asin, title.strip(), group, salesrank, review_cnt, downloads, rating]
                )
                if similar:
                    product_copurchase_adj_list[asin].extend(similar)
        except Exception as e:
            # Best effort: report the failing record and carry on with the rest.
            print(f"Record {id_idx}: {e}")

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 92000 93000 94000 95000 96000 97000 98000 99000 100000 101000 102000 103000 104000 105000 106000 107000 108000 109000 110000 111000 112000 113000 114000 115000 116000 117000 118000 119000 120000 121000 122000 123000 124000 125000 126000 127000 128000 129000 130000 131000 132000 133000 134000 135000 136000 137000 138000 139000 140000 141000 142000 143000 144000 145000 146000 147000 148000 149000 150000 151000 152000 153000 154000 155000 156000 157000 158000 

In [61]:
# Reverse lookup: ASIN -> product id. When several ids share one ASIN value,
# the id encountered last wins.
asin_id_map = dict(zip(id_asin_map.values(), id_asin_map.keys()))

In [65]:
# Translate the ASIN-level adjacency list into id-level edges and write them out.
with open("products_copurchases_links.csv", "w") as outfile:
    outfile.write("source,destination\n")
    err = 0  # number of source ASINs that have no known product id

    for src, neighbors in product_copurchase_adj_list.items():
        # Explicit lookup instead of a bare `except:`, which also hid I/O
        # errors and KeyboardInterrupt.
        src_id = asin_id_map.get(src)
        if src_id is None:
            err += 1
            continue
        for neigh in neighbors:
            # Neighbours that are not products in the dataset are skipped.
            neigh_id = asin_id_map.get(neigh)
            if neigh_id is not None:
                outfile.write(f"{src_id},{neigh_id}\n")

print(err)
print("Done")

0
Done


In [67]:
# Spot-check one id -> ASIN mapping. `.get` is used because indexing a
# defaultdict with an absent key would insert an empty entry as a side effect.
id_asin_map.get('335158', '')

'B00000ADJO'