In [1]:
import os
import gc
import json
import gzip
import pickle
import enchant
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [2]:
def read_amazon_dataset(filename, num_prods=3):
    """Collect products that carry at least one relation field.

    Reads a gzip-compressed file holding one JSON object per line and keeps
    the objects whose 'also_buy', 'also_view' or 'similar_item' value is
    truthy.

    Args:
        filename: path of the gzip-compressed JSON-lines file.
        num_prods: maximum number of products to keep; ``None`` reads the
            whole file.

    Returns:
        List of the kept product dicts, in file order.

    Raises:
        KeyError: if a parsed product lacks one of the relation keys that
            gets inspected.
    """
    kept = []
    with gzip.open(filename) as stream:
        for raw_line in stream:
            # Stop once the quota is filled; there is no quota for None.
            if num_prods is not None and len(kept) >= num_prods:
                break

            record = json.loads(raw_line.strip())
            has_relation = record['also_buy'] or record['also_view'] or record['similar_item']
            if has_relation:
                kept.append(record)
    return kept


In [3]:
# Root directory of the Amazon review data. The default is the location this
# notebook was originally run from; set the AMAZON_DATASET_HOME environment
# variable to run it against a copy stored elsewhere.
dataset_home = os.environ.get(
    'AMAZON_DATASET_HOME',
    '/home/scai/phd/aiz218323/scratch/XML/amazon-review-data/',
)

In [4]:
amazon_file = f'{dataset_home}/datasets/All_Amazon_Meta.json.gz'

data = read_amazon_dataset(amazon_file, num_prods=100)

In [5]:
data = read_amazon_dataset(amazon_file, num_prods=100_000)

## Helper function

In [7]:
def dict_head_random(dictionary, n=10):
    """Print up to ``n`` distinct, randomly chosen ``key : value`` pairs.

    Args:
        dictionary: mapping to sample from; may be empty.
        n: maximum number of entries to print. Fewer are printed when the
            mapping holds fewer than ``n`` entries.

    Returns:
        None. One line per sampled entry is written to stdout.
    """
    keys = list(dictionary.keys())
    # Sample positions rather than the keys themselves: handing the keys to
    # NumPy coerces them into an array (ints mixed with strings all become
    # strings, tuples become extra dimensions), and the coerced values are
    # not necessarily valid keys any more. A permutation also yields each
    # entry at most once and copes with an empty mapping.
    for position in np.random.permutation(len(keys))[:n]:
        key = keys[position]
        print(f'{key} : {dictionary[key]}')
        

## View data

Column headers : <br>
**category | 
tech1 |
description |
fit |
title |
also_buy |
image |
tech2 |
brand |
feature |
rank |
also_view |
details |
main_cat |
similar_item |
date |
price |
asin**

Columns under consideration : <br>
**also_buy | also_view | similar_item**

In [8]:
data[-10]

{'category': ['Clothing, Shoes & Jewelry', 'Men', 'Shoes', 'Oxfords'],
 'tech1': '',
 'description': ['Upper carefully crafted with calfskin leather. Fully cushioned footbed adds all-day comfort. Durable genuine leather outsole for a confident stride. Imported.',
  "In 1892, Milton Florsheim began producing shoes in a small factory located in Chicago, Illinois. The first pairs of Florsheim shoes made by Milton and his father, Sigmund, were a remarkable combination of style, comfort and high quality workmanship. This tradition continues today with the Florsheim family's commitment to producing trend-right, high quality footwear. Florsheim shoes are always available in a wide range of sizes and widths."],
 'fit': ' class="a-normal a-align-center a-spacing-small">\n                    \n                        <tr class="a-histogram-row">\n                            <td class="a-span3 a-nowrap">\n                                <span class="a-color-secondary">\n                          

In [156]:
# Print the index of every product whose 'also_view' list contains a repeated
# entry.
for i, d in enumerate(data):
    # cnt[j] is the number of times the j-th distinct value occurs in the list.
    also, cnt = np.unique(d['also_view'], return_counts=True)
    # np.where(...)[0] holds the positions with a count above one; it is
    # non-empty exactly when some value is repeated.
    if len(np.where(cnt > 1)[0]):
        print(i)

## Extract similar items

### Exploration

In [8]:
# Gather the non-empty 'similar_item' values of the loaded products; empty
# values are skipped.
prod_with_similar = []
for d in data:
    if d['similar_item']:
        prod_with_similar.append(d['similar_item'])

In [9]:
len(prod_with_similar)

2564

In [10]:
html_text = prod_with_similar[15]
print(html_text)

 class="a-bordered a-horizontal-stripes  a-spacing-extra-large a-size-base comparison_table">



            
            
            
            
            
            <tr class="comparison_table_image_row">
                <td class="comparison_table_first_col"></td>


                <th class="comparison_image_title_cell" role="columnheader">
                    <div class="a-row a-spacing-top-micro">
                        <center>
                             <img alt="Dogs Sterling Silver Loud Figural Dog WHISTLE Pendant" src="https://images-na.ssl-images-amazon.com/images/I/419XwGqfZIL._SL500_AC_SS350_.jpg" id="comparison_image">
                        </center>
                    </div>
                    <div class="a-row a-spacing-top-small">
                        <div id="comparison_title" class="a-section a-spacing-none">
                            <span aria-hidden="true" class="a-size-base a-color-base a-text-bold">
                                This item
 

In [11]:
soup = BeautifulSoup(html_text, 'html.parser')

In [12]:
header = soup.find('tr')

In [13]:
for th in header.find_all('th'):
    product_html = th.find('span')
    print(product_html)

<span aria-hidden="true" class="a-size-base a-color-base a-text-bold">
                                This item
                            </span>
<span class="a-size-base">.925 Sterling Silver Whistle Charm Pendant</span>
<span class="a-size-base">Sterling Silver Chihuahuas Dog Pendant</span>
<span class="a-size-base">Sterling Silver Chihuahuas Dog Pendant</span>


In [14]:
# Print the text of each header cell whose first <span> carries exactly one
# CSS class, 'a-size-base'. The multi-class span of the "This item" cell is
# filtered out by the length check.
# NOTE(review): th.find('span') presumably returns None for a cell without a
# <span>, in which case the subscript below would fail - confirm on more data.
for th in header.find_all('th'):
    product_html = th.find('span')
    if len(product_html['class']) == 1 and product_html['class'][0] == 'a-size-base':
        print(product_html.get_text())

.925 Sterling Silver Whistle Charm Pendant
Sterling Silver Chihuahuas Dog Pendant
Sterling Silver Chihuahuas Dog Pendant


### Code

In [8]:
def extract_similar_items(html_text):
    """Extract the titles of similar products from a comparison-table fragment.

    Only the first ``<tr>`` of the fragment is inspected. For each ``<th>`` in
    that row the first ``<span>`` is examined, and its text is kept when the
    span's class list is exactly ``['a-size-base']``.

    Args:
        html_text: HTML fragment to parse; may be empty.

    Returns:
        List of span texts in document order. Empty when the fragment has no
        ``<tr>`` or no matching span.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    header = soup.find('tr')
    if header is None:
        return []

    similar_products = []
    for th in header.find_all('th'):
        span = th.find('span')
        # A header cell without any <span> yields None; skip it instead of
        # failing on the attribute lookup.
        if span is None:
            continue
        # .get() returns None for a span that has no class attribute at all,
        # which simply does not match.
        if span.get('class') == ['a-size-base']:
            similar_products.append(span.get_text())

    return similar_products


In [16]:
extract_similar_items(html_text)

['.925 Sterling Silver Whistle Charm Pendant',
 'Sterling Silver Chihuahuas Dog Pendant',
 'Sterling Silver Chihuahuas Dog Pendant']

In [17]:
extract_similar_items('')

[]

## Amazon product graph

In [27]:
class AmazonGraph:
    """Adjacency-list product graph.

    Attributes:
        graph: dict mapping a product id to the list of products it links to.
    """

    def __init__(self):
        self.graph = {}

    def add_product(self, prod_id, products):
        """Set (or overwrite) the outgoing links of ``prod_id``."""
        self.graph[prod_id] = products

    def save_data(self, save_dir, tag=''):
        """Pickle the graph to disk and release the in-memory copy.

        The file is written to ``{save_dir}/amazon_graph{tag}.pickle``;
        ``save_dir`` is created if needed. Afterwards ``self.graph`` is an
        empty dict.
        """
        os.makedirs(save_dir, exist_ok=True)
        filename = f'{save_dir}/amazon_graph{tag}.pickle'
        with open(filename, 'wb') as f:
            pickle.dump(self.graph, f)

        # Rebind to an empty dict rather than deleting the attribute: the
        # saved data is released just the same, but later calls such as
        # add_product() or replace_duplicates() do not hit AttributeError.
        self.graph = {}
        gc.collect()

    def load_data(self, save_dir, tag=''):
        """Load a graph previously written by ``save_data``.

        Returns:
            True when the file existed and was loaded, False otherwise (an
            error line is printed in that case).
        """
        filename = f'{save_dir}/amazon_graph{tag}.pickle'
        if os.path.exists(filename):
            # NOTE: unpickling can execute arbitrary code; only load files
            # produced by this notebook or another trusted source.
            with open(filename, 'rb') as f:
                self.graph = pickle.load(f)
            return True
        print(f"ERROR:: Unable to load the graph at '{filename}'")
        return False

    def replace_duplicates(self, duplicates):
        """Collapse duplicate products onto their representative ids.

        Every node and every edge found in ``duplicates`` is replaced by the
        id it maps to. Nodes that end up with the same representative have
        their edge lists concatenated in iteration order; repeated edges are
        kept. An id that maps to itself stays in place.

        Args:
            duplicates: dict mapping a product id to its representative id.
                Ids absent from the mapping are left unchanged.
        """
        merged = {}
        for node, edges in self.graph.items():
            representative = duplicates.get(node, node)
            remapped = [duplicates.get(edge, edge) for edge in edges]
            if representative in merged:
                merged[representative].extend(remapped)
            else:
                merged[representative] = remapped
        self.graph = merged
    

In [28]:
def parse(path):
    """Lazily yield one decoded JSON value per line of a gzipped file.

    Args:
        path: path of a gzip-compressed file with one JSON document per line.

    Yields:
        The object decoded from each line, in file order.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, so the handle is not leaked.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)
        

In [38]:
class AmazonGraphContainer:
    """Bundle of the three product graphs plus per-product title and description.

    Attributes:
        graphs: dict with the keys 'similar', 'also_buy' and 'also_view',
            each holding an AmazonGraph.
        id_to_title: dict mapping a product id ('asin') to its 'title'.
        description: dict mapping a product id ('asin') to its 'description'.
    """

    def __init__(self):
        # Amazon graphs, one per relation type.
        self.graphs = {}
        self.graphs['similar'] = AmazonGraph()
        self.graphs['also_buy'] = AmazonGraph()
        self.graphs['also_view'] = AmazonGraph()

        # Map from prod_id to title.
        self.id_to_title = {}

        # Store product description.
        self.description = {}

    @staticmethod
    def _check_graph_type(graph_type):
        """Raise ValueError unless ``graph_type`` names one of the three graphs."""
        if graph_type not in ('similar', 'also_buy', 'also_view'):
            raise ValueError("graph_type should be in ['similar', 'also_buy', 'also_view']")

    def create_graph(self, filename, limit=None):
        """Populate the graphs, titles and descriptions from a metadata file.

        A product is recorded when it has a non-empty 'also_view' or
        'also_buy' list, or when at least one similar-item title can be
        extracted from its 'similar_item' HTML. Entries of the 'similar'
        graph are product titles at this stage, not ids.

        Args:
            filename: path handed to ``parse``.
            limit: maximum number of records to read from the file (records
                that end up being skipped count as well); ``None`` reads
                everything.
        """
        for i, product in enumerate(parse(filename)):
            # Check before processing so that exactly `limit` records are read.
            if limit is not None and i >= limit:
                break

            similar_items = extract_similar_items(product['similar_item'])
            if not (product['also_view'] or product['also_buy'] or len(similar_items)):
                continue

            product_id = product['asin']
            self.id_to_title[product_id] = product['title']

            if product['also_view']:
                self.graphs['also_view'].add_product(product_id, product['also_view'])
            if product['also_buy']:
                self.graphs['also_buy'].add_product(product_id, product['also_buy'])
            if len(similar_items):
                self.graphs['similar'].add_product(product_id, similar_items)

            self.description[product_id] = product['description']

    def replace_similar_graph_titles(self):
        """Convert the 'similar' graph from product titles to product ids.

        Titles are resolved through the inverse of ``id_to_title`` (when two
        products share a title, the one stored later wins). Titles without a
        known product are dropped, and nodes left without any edge are
        removed from the graph.

        Edges are always looked up as titles, so this is meant to be run once
        on a freshly created graph.
        """
        title_to_id = {title: prod_id for prod_id, title in self.id_to_title.items()}

        similar = self.graphs['similar']
        resolved = {}
        for node, title_edges in similar.graph.items():
            id_edges = [title_to_id[title] for title in title_edges if title in title_to_id]
            if id_edges:
                resolved[node] = id_edges
        similar.graph = resolved

    def replace_graph_duplicates(self, duplicates, graph_type='all'):
        """Apply ``replace_duplicates(duplicates)`` to one graph or to all of them.

        Args:
            duplicates: dict mapping a product id to its representative id.
            graph_type: 'all', or one of 'similar', 'also_buy', 'also_view'.

        Raises:
            KeyError: if ``graph_type`` is neither 'all' nor a known graph.
        """
        if graph_type == "all":
            graph_types = ['similar', 'also_buy', 'also_view']
        else:
            graph_types = [graph_type]

        for name in graph_types:
            self.graphs[name].replace_duplicates(duplicates)

    def save_graph(self, save_dir, tag='', graph_type='similar'):
        """Save one graph via its ``save_data`` using the tag ``_{graph_type}{tag}``.

        Raises:
            ValueError: if ``graph_type`` is not a known graph.
        """
        self._check_graph_type(graph_type)
        self.graphs[graph_type].save_data(save_dir, tag=f'_{graph_type}{tag}')

    def save_graphs(self, save_dir, tag=''):
        """Save all three graphs with the same ``tag``."""
        for graph_type in ['similar', 'also_view', 'also_buy']:
            self.save_graph(save_dir, tag=tag, graph_type=graph_type)

    def save_idtotitle(self, save_dir, tag=''):
        """Pickle ``id_to_title`` to ``{save_dir}/id_to_title{tag}.pickle``.

        ``save_dir`` is created if needed. Afterwards ``id_to_title`` is an
        empty dict.
        """
        os.makedirs(save_dir, exist_ok=True)
        map_file = f'{save_dir}/id_to_title{tag}.pickle'
        with open(map_file, 'wb') as f:
            pickle.dump(self.id_to_title, f)
        # Rebind instead of deleting: the data is released, the attribute stays.
        self.id_to_title = {}
        gc.collect()

    def save_description(self, save_dir, tag=''):
        """Pickle ``description`` to ``{save_dir}/description{tag}.pickle``.

        ``save_dir`` is created if needed. Afterwards ``description`` is an
        empty dict.
        """
        os.makedirs(save_dir, exist_ok=True)
        content_file = f'{save_dir}/description{tag}.pickle'
        with open(content_file, 'wb') as f:
            pickle.dump(self.description, f)
        # Rebind instead of deleting: the data is released, the attribute stays.
        self.description = {}
        gc.collect()

    def save_data(self, save_dir, tag=''):
        """Save the three graphs, the title map and the descriptions."""
        os.makedirs(save_dir, exist_ok=True)

        self.save_graphs(save_dir, tag)
        self.save_idtotitle(save_dir, tag)
        self.save_description(save_dir, tag)

    def load_graph(self, save_dir, tag='', graph_type='similar'):
        """Load one graph via its ``load_data`` using the tag ``_{graph_type}{tag}``.

        Raises:
            ValueError: if ``graph_type`` is not a known graph.
            RuntimeError: if the graph's ``load_data`` reports failure.
        """
        self._check_graph_type(graph_type)
        if not self.graphs[graph_type].load_data(save_dir, tag=f'_{graph_type}{tag}'):
            raise RuntimeError(f"Unable to load '{graph_type} graph'.")

    def load_graphs(self, save_dir, tag=''):
        """Load all three graphs with the same ``tag``."""
        for graph_type in ['similar', 'also_view', 'also_buy']:
            self.load_graph(save_dir, tag=tag, graph_type=graph_type)

    def load_idtotitle(self, save_dir, tag=''):
        """Load ``id_to_title`` from ``{save_dir}/id_to_title{tag}.pickle``.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        map_file = f'{save_dir}/id_to_title{tag}.pickle'
        if not os.path.exists(map_file):
            raise FileNotFoundError(f"Unable to load 'id_to_title' from '{map_file}'.")
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(map_file, 'rb') as f:
            self.id_to_title = pickle.load(f)

    def load_description(self, save_dir, tag=''):
        """Load ``description`` from ``{save_dir}/description{tag}.pickle``.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        content_file = f'{save_dir}/description{tag}.pickle'
        if not os.path.exists(content_file):
            raise FileNotFoundError(f"Unable to load 'description' from '{content_file}'.")
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(content_file, 'rb') as f:
            self.description = pickle.load(f)

    def load_data(self, save_dir, tag=''):
        """Load the three graphs, the title map and the descriptions."""
        self.load_graphs(save_dir, tag)
        self.load_idtotitle(save_dir, tag)
        self.load_description(save_dir, tag)
        

### Testing

In [114]:
save_dir = f'{dataset_home}/results'

In [118]:
amazon_graphs = AmazonGraphContainer()

In [119]:
amazon_graphs.create_graph(amazon_file, limit=1000)

In [117]:
amazon_graphs.save_data(save_dir)

In [120]:
# The container keeps its graphs in the `graphs` dict, keyed by relation type;
# it has no `also_view` attribute.
amazon_graphs.graphs['also_view'].graph

{'6342506256': ['B07CRJ95M7',
  'B008AHISU4',
  'B07B8F98W2',
  'B07DD98Q7R',
  'B0798KHL4Z',
  'B07CWV41SJ',
  'B07C74XYSC',
  'B079WTM1PX',
  'B07B4BJCYQ',
  'B00GAZMA2A',
  'B01JQU0Y8C',
  'B075MZ7YKV',
  'B006ZS3SL4',
  'B077VXP78B',
  'B07BDHB7Q3',
  'B07CR5R8W4',
  'B07BGXG1TD',
  'B07BZPGX98',
  'B01MT8MQBT',
  'B018MVYCMA',
  'B072BH2MB8',
  'B07BSLGQFG',
  'B00PZTVEMY',
  'B01MQ5YYHQ',
  'B01CGUMYK0',
  'B01A9CY8FQ',
  'B07D5T1Z1C',
  'B079BX5VX7',
  'B07CPXZPXH'],
 '6342509379': ['B07H2Z6S9J',
  'B077GQQKRV',
  'B072XTTTK9',
  'B002DMJOC8',
  'B07CBJQTF6',
  'B0156SZQ5O',
  'B07H2YZS5K',
  'B076B8J2TX',
  'B01CG5HYE6',
  'B071PFP967',
  'B06Y26PZ5R',
  'B06Y5N9YFF',
  'B012MKMMWO',
  'B0797PSJ4J',
  'B077TR2855',
  'B07CQJ8S9V',
  'B01E377QRU',
  'B07CMK5VRD',
  'B073P7PHCD',
  'B012KC7F2Q',
  'B017Y1GMR2',
  'B004EENYW4',
  'B0179AUNYQ',
  'B0757DW7XJ',
  'B019KYRQYO',
  'B0179ATZ9A',
  'B07CKDN5GS',
  'B01N5M3ZMG',
  'B01A3S8MLC',
  'B075KQPV28',
  'B07CQ4P1ZD',
  'B00CAKH5

#### __Loading__

In [121]:
graphs = AmazonGraphContainer()
graphs.load_data(save_dir)

In [122]:
# The container keeps its graphs in the `graphs` dict, keyed by relation type;
# it has no `also_view` attribute.
graphs.graphs['also_view'].graph

{'6342506256': ['B07CRJ95M7',
  'B008AHISU4',
  'B07B8F98W2',
  'B07DD98Q7R',
  'B0798KHL4Z',
  'B07CWV41SJ',
  'B07C74XYSC',
  'B079WTM1PX',
  'B07B4BJCYQ',
  'B00GAZMA2A',
  'B01JQU0Y8C',
  'B075MZ7YKV',
  'B006ZS3SL4',
  'B077VXP78B',
  'B07BDHB7Q3',
  'B07CR5R8W4',
  'B07BGXG1TD',
  'B07BZPGX98',
  'B01MT8MQBT',
  'B018MVYCMA',
  'B072BH2MB8',
  'B07BSLGQFG',
  'B00PZTVEMY',
  'B01MQ5YYHQ',
  'B01CGUMYK0',
  'B01A9CY8FQ',
  'B07D5T1Z1C',
  'B079BX5VX7',
  'B07CPXZPXH'],
 '6342509379': ['B07H2Z6S9J',
  'B077GQQKRV',
  'B072XTTTK9',
  'B002DMJOC8',
  'B07CBJQTF6',
  'B0156SZQ5O',
  'B07H2YZS5K',
  'B076B8J2TX',
  'B01CG5HYE6',
  'B071PFP967',
  'B06Y26PZ5R',
  'B06Y5N9YFF',
  'B012MKMMWO',
  'B0797PSJ4J',
  'B077TR2855',
  'B07CQJ8S9V',
  'B01E377QRU',
  'B07CMK5VRD',
  'B073P7PHCD',
  'B012KC7F2Q',
  'B017Y1GMR2',
  'B004EENYW4',
  'B0179AUNYQ',
  'B0757DW7XJ',
  'B019KYRQYO',
  'B0179ATZ9A',
  'B07CKDN5GS',
  'B01N5M3ZMG',
  'B01A3S8MLC',
  'B075KQPV28',
  'B07CQ4P1ZD',
  'B00CAKH5

In [123]:
amazon_graphs.description == graphs.description

True

In [124]:
amazon_graphs.description

{'6342506256': ["Gaok men's pants are all made from cotton materials,which provide you soft,cosy and breathable skin touch. they are in fashionable design with multi cargo pocket,loose style.they are perfect for sports or other leisure outdoor activities.Just choose one to yourself.<br> Fabric:100% cotton<br> Style:casual loose cargo pants<br> Package including:1*men's pants<br> <b>size(Unit:inch)</b><br> 26(Tag 28) Hip: 39.4 Waist: 27.3 Length: 21.7 <br> 27(Tag 29) Hip: 40.6 Waist: 28.6 Length: 21.7 <br> 28(Tag 30) Hip: 41.3 Waist: 29.9 Length: 22.4 <br> 29(Tag 31) Hip: 42.1 Waist: 31.2 Length: 22.8 <br> 30(Tag 32) Hip: 43.3 Waist: 32.5 Length: 22.8 <br> 32(Tag 34) Hip: 45.3 Waist: 35.1 Length: 23.6 <br> 34(Tag 36) Hip: 46.1 Waist: 36.4 Length: 24.4 <br> 36(Tag 38) Hip: 46.9 Waist: 37.7 Length: 24.8 <br>"],
 '6342509379': ['<b>pant size(Unit:inch)</b><br> W30(tag30) Waist: 30.0 Hip: 41.7 Length: 43.3 Thigh: 26.8 Leg opening: 16.5 <br> W32(tag32) Waist: 32.0 Hip: 43.7 Length: 43.7 Thig

In [125]:
graphs.description

{'6342506256': ["Gaok men's pants are all made from cotton materials,which provide you soft,cosy and breathable skin touch. they are in fashionable design with multi cargo pocket,loose style.they are perfect for sports or other leisure outdoor activities.Just choose one to yourself.<br> Fabric:100% cotton<br> Style:casual loose cargo pants<br> Package including:1*men's pants<br> <b>size(Unit:inch)</b><br> 26(Tag 28) Hip: 39.4 Waist: 27.3 Length: 21.7 <br> 27(Tag 29) Hip: 40.6 Waist: 28.6 Length: 21.7 <br> 28(Tag 30) Hip: 41.3 Waist: 29.9 Length: 22.4 <br> 29(Tag 31) Hip: 42.1 Waist: 31.2 Length: 22.8 <br> 30(Tag 32) Hip: 43.3 Waist: 32.5 Length: 22.8 <br> 32(Tag 34) Hip: 45.3 Waist: 35.1 Length: 23.6 <br> 34(Tag 36) Hip: 46.1 Waist: 36.4 Length: 24.4 <br> 36(Tag 38) Hip: 46.9 Waist: 37.7 Length: 24.8 <br>"],
 '6342509379': ['<b>pant size(Unit:inch)</b><br> W30(tag30) Waist: 30.0 Hip: 41.7 Length: 43.3 Thigh: 26.8 Leg opening: 16.5 <br> W32(tag32) Waist: 32.0 Hip: 43.7 Length: 43.7 Thig

### Create graph

In [13]:
save_dir = f'{dataset_home}/results'
amazon_graphs = AmazonGraphContainer()

#amazon_graphs.create_graph(amazon_file)
#amazon_graphs.save_data(save_dir)

### Visualize

In [14]:
amazon_graphs = AmazonGraphContainer()
amazon_graphs.load_idtotitle(save_dir)

In [15]:
print(f'Number of nodes : {len(amazon_graphs.id_to_title)}')

Number of nodes : 6638315


In [17]:
#amazon_graphs.load_graphs(save_dir)

In [16]:
amazon_graphs.load_graph(save_dir, graph_type='also_buy')
amazon_graphs.load_graph(save_dir, graph_type='also_view')
amazon_graphs.load_graph(save_dir, tag='_resolved', graph_type='similar')

In [18]:
len(amazon_graphs.graphs['similar'].graph), len(amazon_graphs.graphs['also_view'].graph), \
len(amazon_graphs.graphs['also_buy'].graph)

(1446517, 4526771, 3858772)

In [20]:
dict_head_random(amazon_graphs.graphs['also_view'].graph, n=3)

B003THGE6U : ['B00DQQ8PWK', 'B00IGZQGAY', 'B002ZTNWVS', 'B00IGZQT38', 'B014F69MVC', 'B00EE8K4XM', 'B00068YB88', 'B00K1JVSMY', 'B014F5JCEA', 'B00D6YO7IS', 'B003JH6IE8', 'B073ZNVD7L', 'B01A5TLGJ4', 'B000X21Z7W', 'B00FF8PGRY', 'B0084UIBOK', 'B0140LU3KQ', 'B0128J7CH4', 'B071XKV5G3', 'B00DQQ8SI6']
1449911471 : ['1452107084', '0399162313', '1537182285', '0692572074', '0692213287', '1599185830']
B012WTDGMA : ['B01BL7R3C8', 'B00U9TKVTC', 'B008TXWE18', 'B008PGMRHU', 'B0128OQG80', 'B015QN0WJO', 'B010TROKLI', 'B07HFJP7LX', 'B012WTD8JG', 'B00M74LG9Q']


In [26]:
dict_head_random(amazon_graphs.graphs['also_buy'].graph, n=3)

B00Z5YYTNK : ['B00Z5YYSN6', 'B00Z5YYTQC', 'B0039Z9HUW', 'B000PA09V0', 'B00Z5YXC60', 'B00Z5YX9A4', 'B00JRE163K', 'B00Z5YVH3A', 'B00Z5YVJ1A', 'B00Z5YVLZO', 'B00Z5YVG5Y', 'B00Z5YW554']
B00U5L4ZGY : ['B00TKNF3ZK', 'B00S1MX8GC', 'B01N5B08QH', '1680524267', 'B07BHT93YL', '1500812714', 'B01N4RFE6N', '0553520571', '1680524259', 'B009TJ3J60']
0762725958 : ['0312263686', '0762772875', '1935347578', '0771082568', '031263076X', '1771641770', '0816631425', 'B001HYKD8M', '0888012187']


In [25]:
dict_head_random(amazon_graphs.graphs['similar'].graph, n=3)

B004QL6BHU : ['B00XAYSQ7C', 'B00Y3H85K8']
B00KDMUOSI : ['B00KDMUMPS']
B00HL7VUFY : ['B00HN4IGUC']


#### Exploring similar products

In [74]:
# Reverse lookup from product title to product id. Built in this cell so that
# it does not rely on a variable left behind by some other cell.
title_to_id = {title: prod_id for prod_id, title in amazon_graphs.id_to_title.items()}

# num_products counts every edge of the 'similar' graph; num_absent counts the
# edges whose value is not a known product title. The unmatched values are
# collected in absent_similar_products (repeats included).
num_products, num_absent = 0, 0
absent_similar_products = list()

for node, edges in amazon_graphs.graphs['similar'].graph.items():
    num_products += len(edges)
    for edge in edges:
        if edge not in title_to_id:
            num_absent += 1
            absent_similar_products.append(edge)
            

In [75]:
num_products, num_absent

(8129048, 5085441)

In [76]:
absent_similar_products

['Hot Pink Princess Cone Hat Headband',
 'Girls Princess Tiara Cone Hat Headband, Lavender',
 "elope Inc. Kid's Birthday Cake Hat",
 'Forum Novelties 78762 Festive Happy Birthday Cone Hat Adults Kids Pom Balls Circus Fancy Accessory Party Supplies, One Size',
 'Princess Paradise Thomas The Tank Engine Ride-in Train Costume, Blue, Child',
 'Princess Paradise Percy Ride-in Train Costume, Green, Child',
 "Disney's Monsters Inc Costume - Mike Wazowski Costume - Child S Size",
 'Red Cedar Boot & Shoe Care Shine Box - Shine Box Only',
 'Kiwi Select Shoe Shine Care Kit Valet II Wooden Box w/ 8 pc Content - NO SHOE POLISH INCLUDED',
 'FootFitter Superior 7 Piece Shoe Shine Valet Set! - Complete Brush Set Valet Kit!',
 'Shoe Shine Kit with PU Leather Sleek Elegant Case, 7-Piece Travel Shoe Shine Brush kit',
 'Stone & Clark 8PC Shoe Polish & Care Kit, Leather Shoe Shine Kit with Brown Wax, Shoe Brushes for Polishing, Shine Cloth & Shoe Horn,Compact Shoe Cleaning Kit With Shoes Shine Brush & PU L

### Code

In [87]:
amazon_graphs.replace_similar_graph_titles()

In [96]:
dict_head_random(amazon_graphs.graphs['similar'].graph)

B000C847E0 : ['B000C7YRNM', 'B00M37E4VO', 'B00180R5EU', 'B001KR5KSE']
B000099O64 : ['B0009KKDZS', 'B001UKT3MU', 'B0009KM3J2', 'B0009KM3JM']
B007C52J32 : ['B00P8D8WE0', 'B00C2VT862']
B00S055872 : ['B00RWY8PMW', 'B00S055M9Q', 'B00ROSES9K']
B007WIBK1G : ['B007WIC7IG', 'B007WIC7IG']
B00NVO5TOY : ['B008GWQ27O', 'B00XOUCNH6', 'B017DYKCNA', 'B00TR8YL4W', 'B00MUTWEM6']
B00EV5A384 : ['B0013KTUQ6', 'B0000223O9', 'B00DPE9ZMW']
B00I2WV68I : ['B00187DT7K', 'B00PJQHRWE']
B00BM26A0O : ['B00NWSGSQW']
B00HZPXXIE : ['B00IF5TAFI']


In [97]:
amazon_graphs.save_graph(save_dir, tag='_resolved', graph_type='similar')

## Read duplicates

In [33]:
duplicates_file = f'{dataset_home}/datasets/duplicates.txt'

In [34]:
def load_duplicates_map(duplicate_file):
    """Read groups of duplicate product ids into an id -> representative map.

    Each non-blank line of the file is a whitespace-separated group of
    product ids. The first id of a group is its representative, and every id
    of the group (the representative included) is mapped to it.

    Args:
        duplicate_file: path of the text file to read.

    Returns:
        Dict mapping each product id to the representative id of its group.
    """
    duplicates = {}

    with open(duplicate_file) as file:
        for line in file:
            # split() without arguments discards the line ending and any
            # repeated blanks, so the last id is never truncated and a blank
            # line yields no ids at all.
            product_ids = line.split()
            if not product_ids:
                continue

            representative_id = product_ids[0]
            for product_id in product_ids:
                duplicates[product_id] = representative_id

    return duplicates


In [35]:
duplicates = load_duplicates_map(duplicates_file)

In [36]:
dict_head_random(duplicates)

B01GK1ERRO : B01GK1EL7K
B016YV66K6 : B00ADCB7C0
B00ISF8KTM : B00ISF8QN2
0823404404 : 0823404404
0394562186 : 0394562186
B015OGDWPO : B015OGDWNG
B01FKVMOX4 : B01FKVMSF8
B00LVQKT9O : B00LVQKT9O
B00MV8LVDY : B00MV8LFCG
B00UXMQ5X6 : B00UXMPEVK


## Remove duplicates

In [176]:
g = {1:[22, 33, 44], 2:[1, 2, 3], 3:[99, 23, 12], 4:[11, 3, 13], 5:[4, 5]}
d = {1:100, 4:400}

In [177]:
# Collapse the toy graph `g` onto the representatives given by `d`: node keys
# and edge values found in `d` are replaced by the id they map to, and nodes
# that share a representative have their edge lists concatenated.
# Building a fresh dict (instead of renaming and deleting keys in place) keeps
# a node whose id maps to itself and also remaps the edges of renamed nodes.
resolved = {}
for node, edges in g.items():
    representative = d.get(node, node)
    resolved.setdefault(representative, []).extend(d.get(edge, edge) for edge in edges)

g = resolved

In [182]:
g

{2: [100, 2, 3],
 3: [99, 23, 12],
 5: [400, 5],
 100: [22, 33, 44],
 400: [11, 3, 13]}

### Code

In [None]:
save_dir = f'{dataset_home}/results'
amazon_graphs = AmazonGraphContainer()

amazon_graphs.load_idtotitle(save_dir)

amazon_graphs.load_graph(save_dir, graph_type='also_buy')
amazon_graphs.load_graph(save_dir, graph_type='also_view')
amazon_graphs.load_graph(save_dir, tag='_resolved', graph_type='similar')

In [48]:
dict_head_random(amazon_graphs.graphs['also_view'].graph, n=3)

1902579224 : ['0764155091', '0823055744', '1846031990', '1861088302', '0760323992', 'B001TO578Q', '0890247064', 'B0775K3XKS', '1600582141', 'B002X6DTHK', '1929133863', '1846032636', '0823001644', 'B008N8F6X2', 'B00EKT30JK', '0890242666', 'B00NLQ019A', 'B004O7HTYU', '1785004751', '8460860051', 'B003TJA0S6', 'B000BROV02', 'B00WBT7PTW', 'B01MTXRUUT', '0890249555', 'B010TQCOEE', '0890247234', 'B00YT4319K', 'B001ATALCW', '1902579232', 'B000BQKFAI', '1942932073']
B012UQYU48 : ['B005DST7NU', 'B000HI57Z8', 'B004WMB18W', 'B013GYDF76', 'B017HNQNHG', 'B01N0XM0YL', 'B012UUCMG2', 'B07DKTVQ3L', 'B00FURVI6S', 'B0186P8UPM', 'B0799V1J1W', 'B079ZZ18GS', 'B01MS44GG9', 'B078X2Z39S', 'B0714NKXKZ', 'B01M66U02V', 'B0799VRS8V', 'B0051IDAMQ', 'B078X2J8PM', 'B010Y45EWK', 'B00KR2Q3E8', 'B002R7IK74', 'B07DLC17QH', 'B01N30MU3L', 'B01NH2X6Y2', 'B01JP8NPWC', 'B07BKB7LGY', 'B00RHGFK5K', 'B07F5NQ1YK', 'B07BHYXY8G', 'B07DTD3BFN', 'B0755Y4X34', 'B07DSSRG8P', 'B01MRXGT40', 'B07CSQZ6SD', 'B074BPYNWZ', 'B07H2CKCNG', 'B00TT

In [46]:
dict_head_random(amazon_graphs.graphs['also_buy'].graph, n=3)

B00K7ZN0UA : ['B00M55C0NS', 'B01D1VLNWS', 'B00YZU6CVC', 'B073Z9BF6F', 'B010Q57T02']
0324360746 : ['1412928176', '1259732800', '0557380510', '0136090192', '1285874323', '0078038189', '0077862597', '0133507645', '1483317536', '0312425074', '0415249147', '0765638657', '0787903248', '0133548198', '0964668424', '0977216101', '0133848809', '047061353X', '1337095494', '0470620749', '0471127884', '1138223689', '1111842167', '0135005108', '1506316573', '0786305452', '1305280261', '0155063170', '1118821378', '0691176884', '0470527978', '1594481717', '1133190456', '0077862422', '007772903X', '1412939798', '020134596X', '1305280601', '0470293713', '1576753441', '1412972841', '1576752712', '0321811534', '1250141222', '0077861779', '1259278212', '0060959622', '1904838235', '1305580222', '0134639715', '1285860381', '1285866347', '1285436334', '0415728738', '0205198260', '1285425189', '0078028981', '0133859819', '0077733711', '1422186431', '0077862511', '0470650265', '1285426797', '1591473535', '15063

In [None]:
amazon_graphs.replace_graph_duplicates(duplicates)

## Train-test split

In [None]:
def prune_map(mapping, idxs):
    """Re-index the keys of ``mapping`` that are selected by ``idxs``.

    Args:
        mapping: dict from key to value; it is inverted internally, so when
            several keys share a value the last one wins.
        idxs: iterable of values of ``mapping`` to keep, in the desired order.

    Returns:
        Dict mapping the key that owns each listed value to that value's
        position in ``idxs``.

    Raises:
        KeyError: if an element of ``idxs`` is not a value of ``mapping``.
    """
    key_of = {}
    for key, value in mapping.items():
        key_of[value] = key
    return {key_of[old_idx]: new_idx for new_idx, old_idx in enumerate(idxs)}

def split_count(num_samples, perc=0.7):
    """Return how many of ``num_samples`` items go to the training side.

    Args:
        num_samples: number of items to split.
        perc: fraction of the items assigned to training.

    Returns:
        ``ceil(num_samples * perc)``, reduced by one when that would take
        every item and more than one item is available. A single item is
        assigned at random: 1 with probability ``perc``, otherwise 0.
    """
    if num_samples == 1:
        # Use the requested fraction rather than a fixed 70/30 draw.
        return 1 if np.random.rand() >= 1 - perc else 0

    num_train = int(np.ceil(num_samples * perc))
    # Keep at least one item out of training when there is more than one.
    if num_train == num_samples and num_samples > 1:
        num_train -= 1
    return num_train


In [None]:
class AmazonSplit:
    """Turn a document -> label graph into a sparse matrix and split it into train/test.

    Attributes filled by `to_matrix` / `clean_matrix`:
        graph: scipy CSR matrix with one row per document and one column per label.
        labels: dict of label -> column index of `graph`.
        doc_to_rowindex: dict of document -> row index of `graph`.

    Attributes filled by `get_split_bylabel` / `get_split_byrandom` / `load_data`:
        train, test: CSR matrices sharing the same columns.
        trn_tst_labels: dict of label -> column index of `train` / `test`.
        train_doc_to_rowindex, test_doc_to_rowindex: dict of document -> row index.
    """

    def __init__(self, graph=None):
        """Optionally build the matrix right away from `graph` (see `to_matrix`)."""
        self.graph = None
        self.labels = None
        self.doc_to_rowindex = None

        if graph:
            _ = self.to_matrix(graph)

        self.train, self.test = None, None
        self.trn_tst_labels = None
        self.train_doc_to_rowindex, self.test_doc_to_rowindex = None, None

    def to_matrix(self, graph):
        """Build the CSR matrix and both index maps from a nested dict.

        Args:
            graph: dict of document -> dict of label -> count.

        Returns:
            Tuple (matrix, labels, doc_to_rowindex); the same objects are
            stored on the instance.
        """
        indptr = [0]
        indices = []
        data = []

        self.doc_to_rowindex = {}
        self.labels = {}
        for row, (doc, edge_count) in enumerate(graph.items()):
            self.doc_to_rowindex[doc] = row

            for link, cnt in edge_count.items():
                # First occurrence of a label allocates the next free column.
                indices.append(self.labels.setdefault(link, len(self.labels)))
                data.append(cnt)
            indptr.append(len(indices))

        self.graph = csr_matrix((data, indices, indptr), dtype=int)
        return self.graph, self.labels, self.doc_to_rowindex

    def clean_matrix(self, clean_type=0):
        """Prune `self.graph` in place.

        Args:
            clean_type: 0 drops labels stored in at most one row and then rows
                left without entries; 1 drops rows without entries; any other
                value drops columns without entries.
        """
        if clean_type == 0:
            self.graph, self.labels, self.doc_to_rowindex = self.remove_single_labels(
                self.graph, self.labels, self.doc_to_rowindex)
        elif clean_type == 1:
            pruned_rows = self.get_pruned_row(self.graph)
            self.graph, self.doc_to_rowindex = self.prune_graph_rows(
                self.graph, self.doc_to_rowindex, pruned_rows)
        else:
            pruned_cols = self.get_pruned_cols(self.graph)
            self.graph, self.labels = self.prune_graph_cols(
                self.graph, self.labels, pruned_cols)

    def get_split_idx(self, upper_threshold=10, perc=0.7):
        """Choose train/test row indices, splitting rows of rare labels label by label.

        Labels stored in fewer than `upper_threshold` rows are visited from the
        rarest upwards; the not-yet-assigned rows of each such label are divided
        with `split_count`. Rows never reached that way are then distributed at
        random so that train holds about `perc` of all rows.

        Args:
            upper_threshold: labels with at least this many rows are not split
                individually.
            perc: target fraction of rows assigned to train.

        Returns:
            Tuple (train_rowidx, test_rowidx) of lists of row indices.
        """
        train_rowidx = []
        test_rowidx = []

        # Coordinates of every non-zero entry, ordered by column so that all
        # rows carrying one label are contiguous.
        row_idxs, col_idxs = self.graph.nonzero()
        sort_idx = np.argsort(col_idxs)
        row_idxs = row_idxs[sort_idx]
        col_idxs = col_idxs[sort_idx]

        num_rows = self.graph.shape[0]
        row_inserted_flag = np.zeros(num_rows, dtype=bool)

        label_cnt = np.array(self.graph.getnnz(axis=0)).reshape(-1)
        uni_label_cnt = np.unique(label_cnt)  # sorted ascending: rarest labels first

        cnt = 0  # number of rows already assigned to a split
        for lcnt in uni_label_cnt:
            if cnt == num_rows or lcnt >= upper_threshold:
                break

            pos_ptr, col_ptr = 0, 0
            pos_idxs = np.where(label_cnt == lcnt)[0]
            pos_idxs.sort()

            # Merge-style scan over the sorted entry columns and the sorted
            # columns that have exactly `lcnt` rows.
            while col_ptr < len(col_idxs) and pos_ptr < len(pos_idxs) and cnt < num_rows:
                if pos_idxs[pos_ptr] != col_idxs[col_ptr]:
                    col_ptr += 1
                    continue

                # Gather the still-unassigned rows carrying the current label.
                sample_row_idxs = []
                while col_ptr < len(col_idxs) and pos_ptr < len(pos_idxs) and \
                        cnt < num_rows and pos_idxs[pos_ptr] == col_idxs[col_ptr]:
                    rn = row_idxs[col_ptr]
                    if not row_inserted_flag[rn]:
                        sample_row_idxs.append(rn)
                        row_inserted_flag[rn] = True
                        cnt += 1
                    col_ptr += 1
                pos_ptr += 1

                # Use the caller's `perc` rather than a fixed 0.7.
                num_train = split_count(len(sample_row_idxs), perc=perc)
                sample_row_idxs = list(np.random.permutation(sample_row_idxs))
                train_rowidx.extend(sample_row_idxs[:num_train])
                test_rowidx.extend(sample_row_idxs[num_train:])

        remaining_rows = np.where(~row_inserted_flag)[0]
        # Top train up to the overall target. Clamp at zero: a negative value
        # used as a slice bound would push almost every remaining row into
        # train exactly when train is already over its target.
        num_train = max(split_count(num_rows, perc=perc) - len(train_rowidx), 0)

        remaining_rows = list(np.random.permutation(remaining_rows))
        train_rowidx.extend(remaining_rows[:num_train])
        test_rowidx.extend(remaining_rows[num_train:])

        return train_rowidx, test_rowidx

    def _apply_split(self, train_idx, test_idx):
        """Materialise train/test for the given row indices and prune both.

        Keeps only the labels that occur in both splits, then drops documents
        left without any label. Results are stored on the instance.
        """
        self.train = self.graph[train_idx, :]
        self.test = self.graph[test_idx, :]

        rowindex_to_doc = {row_idx: doc for doc, row_idx in self.doc_to_rowindex.items()}
        self.train_doc_to_rowindex = {rowindex_to_doc[idx]: i for i, idx in enumerate(train_idx)}
        self.test_doc_to_rowindex = {rowindex_to_doc[idx]: i for i, idx in enumerate(test_idx)}

        # Columns: keep labels present in train AND test.
        train_cols = self.get_pruned_cols(self.train)
        test_cols = self.get_pruned_cols(self.test)
        pruned_cols = np.intersect1d(train_cols, test_cols)
        self.train = self.train[:, pruned_cols]
        self.test = self.test[:, pruned_cols]
        self.trn_tst_labels = prune_map(self.labels, pruned_cols)

        # Rows: drop documents whose labels were all pruned away.
        pruned_rows = self.get_pruned_row(self.train)
        self.train, self.train_doc_to_rowindex = self.prune_graph_rows(
            self.train, self.train_doc_to_rowindex, pruned_rows)
        pruned_rows = self.get_pruned_row(self.test)
        self.test, self.test_doc_to_rowindex = self.prune_graph_rows(
            self.test, self.test_doc_to_rowindex, pruned_rows)

    def get_split_bylabel(self, upper_threshold=10, perc=0.7):
        """Split `self.graph` into train/test using `get_split_idx`, then prune."""
        train_idx, test_idx = self.get_split_idx(upper_threshold, perc)
        self._apply_split(train_idx, test_idx)

    def get_random_split_idx(self, perc=0.7):
        """Return (train_idx, test_idx) from a random permutation of all rows.

        The first int(perc * n_rows) permuted indices go to train.
        """
        n_docs = self.graph.shape[0]
        n_trn = int(perc * n_docs)
        rand_idx = np.random.permutation(n_docs)
        return rand_idx[:n_trn], rand_idx[n_trn:]

    def get_pruned_cols(self, graph, count=0):
        """Return the indices of the columns to KEEP: more than `count` stored entries."""
        label_cnt = np.array(graph.getnnz(axis=0)).reshape(-1)
        pruned_cols = np.where(label_cnt > count)[0]
        return pruned_cols

    def get_pruned_row(self, graph, count=0):
        """Return the indices of the rows to KEEP: more than `count` stored entries."""
        pruned_rows = np.where(np.array(graph.getnnz(axis=1)).reshape(-1) > count)[0]
        return pruned_rows

    def prune_graph_cols(self, graph, labels, pruned_cols):
        """Restrict `graph` to `pruned_cols` and re-index `labels` to match."""
        graph = graph[:, pruned_cols]
        labels = prune_map(labels, pruned_cols)
        return graph, labels

    def prune_graph_rows(self, graph, doc_to_rowindex, pruned_rows):
        """Restrict `graph` to `pruned_rows` and re-index `doc_to_rowindex` to match."""
        graph = graph[pruned_rows, :]
        doc_to_rowindex = prune_map(doc_to_rowindex, pruned_rows)
        return graph, doc_to_rowindex

    def remove_single_labels(self, graph, labels, doc_to_rowindex):
        """Drop labels stored in at most one row, then rows left without entries.

        Returns:
            Tuple (graph, labels, doc_to_rowindex) after pruning.
        """
        pruned_cols = self.get_pruned_cols(graph, count=1)
        graph, labels = self.prune_graph_cols(graph, labels, pruned_cols)

        pruned_rows = self.get_pruned_row(graph)
        graph, doc_to_rowindex = self.prune_graph_rows(graph, doc_to_rowindex, pruned_rows)

        return graph, labels, doc_to_rowindex

    def get_split_byrandom(self, perc=0.7):
        """Split `self.graph` into train/test uniformly at random, then prune."""
        train_idx, test_idx = self.get_random_split_idx(perc)
        self._apply_split(train_idx, test_idx)

    def save_data(self, save_dir, tag='category'):
        """Pickle the train and test splits to `{save_dir}/{tag}_train.pkl` / `_test.pkl`.

        Each file holds a tuple (labels, doc_to_rowindex, matrix).
        """
        train_file = f'{save_dir}/{tag}_train.pkl'
        with open(train_file, 'wb') as fout:
            train = (self.trn_tst_labels, self.train_doc_to_rowindex, self.train)
            pickle.dump(train, fout)

        test_file = f'{save_dir}/{tag}_test.pkl'
        with open(test_file, 'wb') as fout:
            test = (self.trn_tst_labels, self.test_doc_to_rowindex, self.test)
            pickle.dump(test, fout)

    def load_data(self, save_dir, tag='category'):
        """Load the splits written by `save_data`.

        The label map is taken from the train file; the copy stored in the
        test file is ignored.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # this notebook wrote itself.
        train_file = f'{save_dir}/{tag}_train.pkl'
        with open(train_file, 'rb') as fin:
            train = pickle.load(fin)
            self.trn_tst_labels, self.train_doc_to_rowindex, self.train = train

        test_file = f'{save_dir}/{tag}_test.pkl'
        with open(test_file, 'rb') as fin:
            test = pickle.load(fin)
            _, self.test_doc_to_rowindex, self.test = test
            

In [None]:
combined_dir = f'{dataset_home}/AmazonProducts/combined'

# The splitter class defined in this notebook is AmazonSplit.
# NOTE(review): `classification_graph` is not defined in the visible cells; AmazonSplit.to_matrix
# iterates it as {doc: {label: count}} -- confirm it is built upstream in that shape.
data_splitter = AmazonSplit(classification_graph)

# Drop labels seen in at most one document, then split so rare labels are divided label by label.
data_splitter.clean_matrix()
data_splitter.get_split_bylabel(upper_threshold=10)

In [None]:
# Write the train/test pickles under `combined_dir`, the output directory defined with the splitter.
# NOTE(review): the 'wiki_category' file prefix looks carried over from another dataset -- confirm.
data_splitter.save_data(combined_dir, tag='wiki_category')

In [None]:
# NOTE(review): `save_dir` is not assigned in the visible cells (the directory defined with the
# splitter is `combined_dir`) -- confirm this name exists on a fresh kernel before relying on it.
save_dir