In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from ast import literal_eval
tqdm.pandas()

In [2]:
# Import the metadata. We'll only import the columns we need.
columns=['category', 'title', 'also_buy', 'also_view',   'asin','price']
meta_chunks=pd.read_json('../Data/amazon_meta.json',chunksize=10_000,lines=True)
meta_df=pd.concat([chunk[columns] for chunk in meta_chunks])

In [3]:
# Import the ranks.
ranks_df=pd.read_parquet("../Data/meta_ranks.parquet")

In [4]:
# Merge ranks with meta_df
meta_df=meta_df.join(ranks_df.item_rank)

In [5]:
# Import matches
matches_df=pd.read_csv("../Data/amazon_df_labels.csv")


In [6]:
# Merge matches with meta_df
meta_df=pd.concat([meta_df,matches_df[['match','incident_indices']]],axis=1)

In [7]:
# Remove price columns starting .a-box-inner

def filter_junk(price):
    if price=='':
        return None
    if len(price)>=12:
        if price[0:12]=='.a-box-inner':
            return None
    return price

meta_df.price=meta_df.price.apply(filter_junk)


In [8]:
# Add missing_price feature
# Might indicate item is longer for sale?
meta_df['missing_price']=meta_df.price.isna()

In [9]:
# Replace category with top-level subcategory
def extract_subcategory(cat):
    if len(cat)>0:
        return cat[1]
    return None

meta_df['category']=meta_df.category.progress_apply(extract_subcategory)

100%|██████████| 633883/633883 [00:00<00:00, 640603.19it/s]


In [10]:
# Combine also_buy and also_view into single list
def combine(entry):
    output=entry.also_buy+entry.also_view
    output=list(set(output)) # Remove duplicates
    if output==[]:
        return None
    return output
    
meta_df['similar']=meta_df.progress_apply(combine,axis=1)

100%|██████████| 633883/633883 [00:18<00:00, 33741.07it/s]


In [11]:
columns=['asin', 'title', 'category' ,'missing_price', 'price',
       'item_rank',  'similar','match','incident_indices']
meta_df=meta_df[columns]

In [49]:
asin_duplicate=asin_count[meta_df.asin.value_counts()>1].index

In [73]:
# Change meta_df.similar from list to tuple in order to work with duplicates
def to_tuple(lis):
    if lis==None:
        return None
    return tuple(lis)

meta_df.similar=meta_df.similar.apply(to_tuple)

In [72]:
# Remove duplicate rows 
meta_df=meta_df.drop_duplicates()

# Verify asins are now unique
assert(meta_df.asin.duplicated().sum()==0)

In [74]:
meta_df.to_parquet("../Data/meta_df.parquet")

In [75]:
meta_df

Unnamed: 0,asin,title,category,missing_price,price,item_rank,similar,match,incident_indices
0,0000191639,Dr. Suess 19163 Dr. Seuss Puzzle 3 Pack Bundle,Puzzles,True,,2230717.0,,0,[]
1,0004950763,Pathfinder: Book of Beasts - Legendary Foes,,True,,2294535.0,,0,[]
2,0005069491,Nursery Rhymes Felt Book,,True,,2871983.0,,0,[]
3,0004983289,Dutch Blitz Card Game,Games,False,$24.95,376337.0,"(B00G7S4V54, B003GZH0SG, B002TBFVV6, B016715XX...",0,[]
4,0006466222,Magic Pen Painting (Marvel Super Heroes),,False,$4.92,934954.0,"(B009SB8Z1I, B009SB7TI8)",0,[]
...,...,...,...,...,...,...,...,...,...
633878,B01HJDFWDK,Geilienergy 6V 2000mAh NiMH RX Battery Packs w...,Hobbies,False,$10.99,350404.0,"(B07192B34X, B0015H2V72, B073F92G2S, B0015KLJE...",0,[]
633879,B01HJDGVFS,Micord Baby Float Toddler Swimming Inflatable ...,Sports & Outdoor Play,True,,652169.0,,0,[]
633880,B01HJDUNRU,"Premium Swimming Pool Float Hammock, Inflatabl...",Sports & Outdoor Play,False,$22.99,253066.0,"(B000OV0X4S, B071P48Y92, B01IBPOO9S, B073WMLBG...",0,[]
633881,B01HJFAGJI,Lewo Wooden Baby Toddler Toys Circle First Bea...,Learning & Education,False,$19.99,36248.0,,0,[]
