## imports

In [96]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import itertools
import pandas as pd
import _pickle as cPickle
from pandas import DataFrame
from pandas import Series
import os

## function definitions

In [84]:
class Loader(object):
    """
    Abstract base for data loader wrappers; concrete loaders override load_data.
    """
    def load_data(self, file_name):
        """
        Read the given file into memory.

        :param file_name: name of the file to load
        :return: pandas object
        :raises NotImplementedError: always, because this base class has no implementation
        """
        raise NotImplementedError()

In [85]:
import gzip
class JSONLoader(Loader):
    """
    Loader for gzip-compressed files that store one record per line.

    Every line of the file is evaluated as a Python expression and the
    resulting records are collected into a pandas DataFrame.
    """
    def __init__(self, read_mode='rb'):
        """
        :param read_mode: mode string handed to gzip.open when reading the file
        """
        self.read_mode = read_mode

    def load_data(self, file_name):
        """
        :param file_name: complete path to open
        :return: pandas dataframe with one row per line of the file, indexed 0..n-1
        """
        # Key every record by its position in the file so that from_dict
        # (orient='index') turns each record into one row.
        records = dict(enumerate(self._parse(file_name)))
        return pd.DataFrame.from_dict(records, orient='index')

    def _parse(self, file_name):
        """
        Lazily yield one evaluated record per line of the gzip file.

        :param file_name: complete path to open
        :return: generator over the evaluated lines
        """
        # The context manager closes the gzip handle once the generator is
        # exhausted or discarded; previously the handle was never closed.
        with gzip.open(file_name, self.read_mode) as g:
            for l in g:
                # SECURITY: eval() executes whatever code the line contains.
                # Only use this loader on files from a trusted source.
                yield eval(l)

In [86]:
def flatten(l):
    """
    Concatenate the items of every sub-iterable of l into one flat list.

    :param l: iterable of iterables
    :return: list holding the items of all sub-iterables, in order
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [87]:
def get_cat(l):
    """
    Return the second-to-last element of l.

    :param l: indexable sequence with at least two elements
    :return: the element at position -2
    :raises IndexError: if l has fewer than two elements
    """
    second_to_last = -2
    return l[second_to_last]

## data loading

In [88]:
# product_path is the location of the gzip-compressed metadata file (.json.gz) to load.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
product_path = '/media/sarthak/HDD/TUM/courses/sem_3/practical DM/datasets/meta_Electronics.json.gz'
# Alternative location of the same file on a Windows machine:
#product_path = 'D:\\TUM\\courses\\sem_3\\practical DM\\datasets\\meta_Electronics.json.gz'
# Read all records of the file into one DataFrame (one row per line of the file).
loader = JSONLoader()
product = loader.load_data(product_path)

In [89]:
# Preview the first rows of the loaded product metadata.
product.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [90]:
# Step 1: merge each product's nested category lists into a single flat list.
product['categories'] = product['categories'].apply(flatten)
# Step 2: keep only the second-to-last entry of that flat list as the category label.
# Careful: both steps overwrite the column, so run this cell once per data load.
product['categories'] = product['categories'].apply(get_cat)

# Kept for reference: this one-liner built the asin -> category lookup in well under a second.
#sin_cat_dict = Series(product.categories.values,index=product.asin).to_dict()

In [91]:
# Count how many products carry each category label.
product.categories.value_counts()

Laptop & Netbook Computer Accessories    43296
Computers & Accessories                  28347
Cases & Sleeves                          27839
Bags & Cases                             23998
Cables & Accessories                     23147
Audio & Video Accessories                21925
MP3 Player Accessories                   21040
Computer Components                      19591
Cables & Interconnects                   13345
Data Storage                             12923
Touch Screen Tablet Accessories          12878
Electronics                              11002
Batteries                                 8707
Digital Cameras                           8157
Batteries, Chargers & Accessories         7613
eBook Readers & Accessories               6880
Speakers                                  6804
Video Cables                              6744
Digital Camera Accessories                6661
Accessories                               5620
Memory Cards                              5185
MP3 Players &

## creating dataset

In [92]:
# Minimum number of products a category needs in order to be kept;
# change the threshold to experiment
threshold = 7000
# percent[i] is the share (in %) of all products that belong to cats[i]
percent = []
cats = []
# number of categories that passed the threshold
counter = 0
counts = product.categories.value_counts()
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for key, val in counts.items():
    if val >= threshold:
        counter += 1
        percent.append((val / product.shape[0]) * 100)
        cats.append(key)

In [103]:
# Keep only the products whose category is one of the selected categories in cats,
# then show the per-category product counts of that subset.
product_cat_subset = product[product.categories.isin(cats)]
product_cat_subset.categories.value_counts()

Laptop & Netbook Computer Accessories    43296
Computers & Accessories                  28347
Cases & Sleeves                          27839
Bags & Cases                             23998
Cables & Accessories                     23147
Audio & Video Accessories                21925
MP3 Player Accessories                   21040
Computer Components                      19591
Cables & Interconnects                   13345
Data Storage                             12923
Touch Screen Tablet Accessories          12878
Electronics                              11002
Batteries                                 8707
Digital Cameras                           8157
Batteries, Chargers & Accessories         7613
Name: categories, dtype: int64

In [94]:
# Number of categories that reached the threshold and were kept.
len(cats)

15

In [95]:
# Percentage of all products covered by the kept categories
# (sum of the per-category shares collected in percent).
sum(percent)

56.96713743185412

## Downloading

In [97]:
# Set project_path to the root of the repository on your machine.
# Make sure the datasets folder is listed in .gitignore so downloaded images are not committed.
# NOTE(review): this is an absolute, machine-specific path.
project_path = '/media/sarthak/HDD/TUM/courses/sem_3/DLCV/project'
# Folder under the project root that receives one sub-folder per category.
datasets_path = os.path.join(project_path, 'datasets')

In [98]:
# Create the datasets folder (and any missing parents) if it does not exist yet.
# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(datasets_path, exist_ok=True)

In [None]:
import wget
import random
import time
# switch off the download_cutoff_activate if you want to download all images in the category.
download_cutoff_activate = True

# change the download cutoff if required, minimum it should be 7k
download_cutoff = 10000

# seed for the url sampling, so that a re-run picks the same images
download_seed = 0
url_sampler = random.Random(download_seed)

# (category, url, error) for every download that raised, so failures stay
# visible instead of being silently discarded
failed_downloads = []

for cat in cats:
    product_cat_subset_subset = product_cat_subset[product_cat_subset.categories == cat]

    # one folder per category below datasets_path
    cat_path = os.path.join(datasets_path, cat)
    os.makedirs(cat_path, exist_ok=True)

    # drop missing urls and duplicates; unique() keeps first-seen order, which
    # (unlike a set) makes the seeded sampling below repeatable
    imurls = product_cat_subset_subset.imUrl.dropna().unique().tolist()
    if download_cutoff_activate and len(imurls) > download_cutoff:
        # randomly sample download_cutoff urls from categories that contain
        # more images than that, to reduce download time
        imurls = url_sampler.sample(imurls, download_cutoff)

        # sanity check that sampling worked correctly
        assert len(imurls) == download_cutoff

    print('number of urls to be downloaded for category: ' + cat + ' is: ' + str(len(imurls)))
    failed_in_cat = 0
    for idx, url in enumerate(imurls):
        # logging
        if idx % 1000 == 0:
            print('images downloaded: ' + str(idx))
        try:
            # download the image into cat_path; passing out= avoids os.chdir,
            # which would change the kernel's working directory for later cells
            wget.download(url, out=cat_path)
        except Exception as e:
            # best effort: keep downloading, but remember what failed and why
            failed_in_cat += 1
            failed_downloads.append((cat, url, repr(e)))
    print('number of failed downloads for category: ' + cat + ' is: ' + str(failed_in_cat))

number of urls to be downloaded for category: Laptop & Netbook Computer Accessories is: 10000
images downloaded: 0
