In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from ast import literal_eval
import os
tqdm.pandas()

In [3]:
# Load the product metadata, keeping only the columns used downstream.
# On first run the raw JSON-lines dump is converted to a pickle so later runs can load it directly.
columns=['asin','category', 'title', 'price','also_buy', 'also_view']
if not os.path.exists("../Data/metadata_raw.pkl"):
    print("Converting json file to pkl")
    pd.read_json("../Data/amazon_meta.json",lines=True).to_pickle("../Data/metadata_raw.pkl")
combined_df=pd.read_pickle("../Data/metadata_raw.pkl")[columns]

##### Cleaning Metadata

In [4]:
# Remove price values that are empty or start with .a-box-inner (scraped CSS junk)

def filter_junk(price):
    """Return ``price`` unchanged unless it is missing, empty, or junk.

    Parameters
    ----------
    price : str or None
        Raw price field of one metadata row.

    Returns
    -------
    str or None
        ``None`` when ``price`` is not a string (e.g. ``None`` or ``NaN``), is the
        empty string, or starts with ``'.a-box-inner'``; otherwise ``price`` itself.
    """
    # Non-string values (None, NaN) have no len(); treat them as a missing price
    # instead of raising TypeError.
    if not isinstance(price, str):
        return None
    if price=='' or price.startswith('.a-box-inner'):
        return None
    return price

# Blank out junk/empty prices so that isna() can detect them afterwards
combined_df['price']=combined_df['price'].apply(filter_junk)

In [5]:
# Boolean feature: True where the (cleaned) price is missing.
# Might indicate item is no longer for sale/out of stock?
combined_df['missing_price']=combined_df['price'].isnull()

In [6]:
# The raw price column is no longer needed once missing_price has been derived
combined_df=combined_df.drop('price',axis='columns')

In [7]:
# Replace category with top-level subcategory
def extract_subcategory(cat):
    """Return the second entry of the category list ``cat``, or ``None`` if it has none.

    Parameters
    ----------
    cat : sequence
        Category path of one product.

    Returns
    -------
    object or None
        ``cat[1]`` when ``cat`` has at least two entries, otherwise ``None``.
    """
    # Index 1 only exists when there are at least two entries; a one-element
    # list must yield None rather than raise IndexError.
    if len(cat)>1:
        return cat[1]
    return None

# Overwrite the category path with its extracted subcategory (tqdm progress bar shown)
combined_df['category']=combined_df['category'].progress_apply(extract_subcategory)

100%|██████████| 633883/633883 [00:00<00:00, 1854122.11it/s]


In [8]:
# Combine also_buy and also_view into a single list
def combine(entry):
    """Merge ``entry.also_buy`` and ``entry.also_view`` into one duplicate-free list.

    Parameters
    ----------
    entry : row-like
        Object exposing list attributes ``also_buy`` and ``also_view``.

    Returns
    -------
    list or None
        The distinct items in first-seen order (``also_buy`` first), or ``None``
        when both lists are empty.
    """
    # dict.fromkeys removes duplicates while keeping insertion order. Going through
    # set() would give an order that depends on string hash randomisation, which in
    # turn affects how ties are broken when the most frequent category is picked.
    output=list(dict.fromkeys(entry.also_buy+entry.also_view))
    if not output:
        return None
    return output
    
# Row-wise: build the combined list of related asins for every product
combined_df['similar']=combined_df.progress_apply(combine,axis='columns')

100%|██████████| 633883/633883 [00:07<00:00, 82600.39it/s] 


In [9]:
# We now fill in missing categories
# We create a lookup table (asin -> category) of all items that already have a category.
# Zipping the two columns avoids iterrows(), which builds a Series object for every row.
known_category_rows=combined_df[combined_df.category.notna()]
lookup_cat=dict(zip(known_category_rows.asin,known_category_rows.category))

In [10]:
# The following function predicts the category using the categories of similar (also_view or also_buy) products.
# We pick the most frequently occurring such category.
from collections import Counter

def most_frequent(lst):
    """Return the element that occurs most often in ``lst``.

    Raises ``IndexError`` when ``lst`` is empty.
    """
    ranked=Counter(lst).most_common(1)
    winner,_count=ranked[0]
    return winner

def predict_category_similar_prods(entry,i=0):
    """Return the category of ``entry``, inferring it from similar products if missing.

    Parameters
    ----------
    entry : row-like
        Object exposing ``category`` (scalar or missing) and ``similar``
        (list of asins or ``None``).
    i : int, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    object or None
        ``entry.category`` when it is present. Otherwise the most frequent category
        (looked up in the module-level ``lookup_cat``) among ``entry.similar``, or
        ``None`` when there are no similar products or none has a known category.
    """
    # pd.isna treats both None and NaN as missing, in line with the notna()/dropna()
    # checks applied to this column elsewhere in the notebook.
    if not pd.isna(entry.category):
        return entry.category
    similar_prods=entry.similar
    if similar_prods is None:
        return None
    # Keep only the similar products whose category is already known
    similar_categories=[lookup_cat[prod] for prod in similar_prods if prod in lookup_cat]
    if similar_categories:
        return most_frequent(similar_categories)
    return None

In [11]:
# Row-wise: keep existing categories and infer missing ones from similar products where possible
combined_df['category']=combined_df.apply(predict_category_similar_prods,axis='columns')

In [12]:
# The list-valued (non-hashable) columns have served their purpose; remove them
combined_df=combined_df.drop(['also_buy','also_view','similar'],axis='columns')

In [13]:
# Drop duplicate rows.
# No subset is given, so rows are compared across all remaining columns.
combined_df=combined_df.drop_duplicates()

In [14]:
# Verify asins are unique.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert combined_df.asin.is_unique, "combined_df contains duplicated asins"

In [15]:
# Import the ranks.
ranks_df=pd.read_parquet("../Data/meta_ranks.parquet")
ranks_df=ranks_df.drop_duplicates()

# Verify no duplicated asins.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert ranks_df.asin.is_unique, "ranks_df contains duplicated asins"

In [16]:
# Merge ranks with combined_df on asin.
# No `how` is passed, so pandas' default inner join applies: only asins present in both frames are kept.
combined_df=combined_df.merge(ranks_df,on='asin')

In [17]:
# Drop items with a missing rank or a missing category
combined_df=combined_df.dropna(axis=0,subset=['category','item_rank'])

##### Add matches and components

In [18]:
# Import matches with components.
# The first CSV column is read as the index (index_col=0).
# NOTE(review): presumably this file supplies the match / incident_indices / component_no
# columns that appear after the merge below — confirm against the CSV.
matches_df=pd.read_csv("../Data/amazon_df_labels_with_comps.csv",index_col=0)

In [19]:
# Merge matches with combined_df on asin (default inner join: asins missing from either frame are dropped)
combined_df=combined_df.merge(matches_df,on='asin')

In [20]:
# Import undropped asins after cleaning review_data.
review_cleaned_asins=pd.read_csv("../Data/asin_labels_clean_review_df.csv")

# Check no duplicate asin.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert review_cleaned_asins.asin.is_unique, "review_cleaned_asins contains duplicated asins"

In [21]:
# Drop entries from combined_df that don't appear in review_cleaned_asins.
# Done as an inner merge against the asin column only, so no extra columns are added.
combined_df=combined_df.merge(review_cleaned_asins[['asin']],on='asin')

##### Add reviews features

In [22]:
# Load Reviews_df (from reviews_features.ipynb)
reviews_features_df=pd.read_parquet("final_reviews.parquet")

# Verify no duplicated asins.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert reviews_features_df.asin.is_unique, "reviews_features_df contains duplicated asins"

In [23]:
# Merge meta and review datasets on asin (default inner join: products without review features are dropped)
combined_df=combined_df.merge(reviews_features_df,on='asin')

In [24]:
# Display combined_df without the num_of_rating column, which is identical to reviews_per_product.
# NOTE: the result of drop() is not assigned, so combined_df itself still contains num_of_rating;
# the column is only removed in the column-dropping section near the end of the notebook.
combined_df.drop(columns=['num_of_rating'])

Unnamed: 0,asin,category,title,missing_price,item_rank,match,incident_indices,component_no,avg_rating,min_rating,...,avg_verified_reviewers,min_date,max_date,product_lifespan,num_bots_per_asin,unique_reviewer_count,avg_reviews_per_day,reviews_per_product,avg_review_length_words,avg_review_length_chars
0,0000191639,Puzzles,Dr. Suess 19163 Dr. Seuss Puzzle 3 Pack Bundle,True,2230717.0,0,[],449,5.000000,5,...,1.000000,2013-12-26,2013-12-26,0 days,0,1,3.000000,1,23.000000,125.000000
1,0004983289,Games,Dutch Blitz Card Game,False,376337.0,0,[],452,4.800000,5,...,1.000000,2016-12-10,2018-03-28,473 days,0,5,0.801198,5,15.400000,107.200000
2,0020232233,Grown-Up Toys,Dungeons &amp; Dragons - &quot;Storm Kings Thu...,False,178217.0,0,[],454,4.130435,5,...,0.782609,2016-09-12,2018-04-06,571 days,0,23,0.642518,23,59.260870,329.608696
3,0096737581,Arts & Crafts,NUM NOMS figures Storage Case Organizer - hold...,True,989767.0,0,[],455,2.333333,5,...,0.666667,2016-12-03,2017-07-08,217 days,0,3,0.671922,3,22.666667,126.000000
4,014002316X,Toy Remote Control & Play Vehicles,UDI U806 Infrared Remote Control Helicopter W/...,True,3687991.0,0,[],456,1.000000,1,...,1.000000,2015-09-25,2015-09-25,0 days,0,1,1.000000,1,12.000000,52.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547713,B01HJC53OE,Action Figures & Statues,Marvel Funko Pop Black Suit Spider-Man #79 (Gl...,True,482069.0,0,[],623284,4.500000,5,...,1.000000,2017-07-23,2018-03-09,229 days,0,4,1.010479,4,21.750000,109.750000
547714,B01HJDFWDK,Hobbies,Geilienergy 6V 2000mAh NiMH RX Battery Packs w...,False,350404.0,0,[],623285,4.400000,5,...,0.966667,2016-10-06,2018-10-04,728 days,0,30,0.615794,30,16.333333,86.133333
547715,B01HJDGVFS,Sports & Outdoor Play,Micord Baby Float Toddler Swimming Inflatable ...,True,652169.0,0,[],623286,4.200000,5,...,1.000000,2016-08-26,2018-08-13,717 days,0,5,0.603825,5,11.600000,57.400000
547716,B01HJDUNRU,Sports & Outdoor Play,"Premium Swimming Pool Float Hammock, Inflatabl...",False,253066.0,0,[],623287,4.222222,5,...,0.944444,2016-08-13,2018-09-05,753 days,0,36,0.760222,36,38.944444,204.194444


##### Add summary and review embeddings

In [25]:
# Load summary embeddings
summary_embeddings=pd.read_pickle("../Data/agg_summary_embeddings.pkl")

# Verify no duplicated asins.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert summary_embeddings.asin.is_unique, "summary_embeddings contains duplicated asins"

In [26]:
# Left-join the summary embeddings so every product is kept even without an embedding.
# merge() on a column returns a freshly numbered index, so set_axis() restores combined_df's
# own index labels; the row count is unchanged because summary_embeddings.asin has no duplicates.
combined_df=combined_df.merge(summary_embeddings,on='asin',how='left').set_axis(combined_df.index)

In [27]:
# Load review embeddings
review_embeddings=pd.read_pickle("../Data/reviewtext_features_df.pkl")

# Verify no duplicated asins.
# Series.is_unique is a plain bool, so the assert fails with AssertionError whenever any asin repeats.
assert review_embeddings.asin.is_unique, "review_embeddings contains duplicated asins"

In [28]:
# Left-join the review-text embeddings so every product is kept even without an embedding.
# merge() on a column returns a freshly numbered index, so set_axis() restores combined_df's
# own index labels; the row count is unchanged because review_embeddings.asin has no duplicates.
combined_df=combined_df.merge(review_embeddings,on='asin',how='left').set_axis(combined_df.index)

##### Randomly dropping items labelled zero

To obtain a more manageable data set, we randomly drop items whose ``match`` column is zero.

In [29]:
# Count how many products carry each label before subsampling
num_zeros,num_ones=((combined_df.match==label).sum() for label in (0,1))
print(f"There are {num_zeros} products labelled 0.")
print(f"There are {num_ones} products labelled 1.")

There are 546348 products labelled 0.
There are 1370 products labelled 1.


In [30]:
# Number of 0-labelled items to keep in the final dataset.
TARGET=200_000

# Sanity check: TARGET must be smaller than the number of available zeros
assert num_zeros>TARGET

In [31]:
# indices of items labelled zero
zero_indices=combined_df[combined_df.match==0].index
# indices of items labelled one
one_indices=combined_df[combined_df.match==1].index

# random subset of TARGET items labelled zero (fixed seed for reproducibility)
rng=np.random.default_rng(seed=1067)
random_subset=rng.choice(zero_indices,TARGET,replace=False)

# total list of indices: the sampled zeros followed by all ones.
# np.concatenate works on every NumPy release; np.concat only exists as an alias from NumPy 2.0.
indices=np.concatenate([random_subset,one_indices])

In [32]:
# Drop rows of combined_df with index not in indices.
# .loc with a list of labels returns the rows in the order of that list, i.e. the
# sampled zeros first, followed by the ones.
combined_df=combined_df.loc[indices]

In [33]:
# Sanity check: label counts after subsampling (expect TARGET zeros and all of the ones)
print(combined_df.match.value_counts())

match
0    200000
1      1370
Name: count, dtype: int64


##### Stratified train-test split preserving groups

In [34]:
from custom_ttsplit import StratifiedGroupSplit

In [35]:
# Split combined_df into train and test frames with the project helper StratifiedGroupSplit,
# passing 'match' and 'component_no' as the label/group columns, a 0.2 test size and a fixed seed.
# NOTE(review): the exact stratification/grouping semantics live in custom_ttsplit (not shown here);
# the cells below check the resulting ratios and that no component_no is shared between the splits.
df_train,df_test=StratifiedGroupSplit(combined_df,'match','component_no',test_size=0.2,random_state=1066)

In [36]:
# Fraction of all rows that ended up in the test set
len(df_test)/len(combined_df)

0.2

In [37]:
# Share of rows labelled 1 before splitting
len(combined_df[combined_df.match==1])/len(combined_df)

0.0068033967323831756

In [38]:
# Share of rows labelled 1 in the test set
len(df_test[df_test.match==1])/len(df_test)

0.0068033967323831756

In [39]:
# Share of rows labelled 1 in the training set
len(df_train[df_train.match==1])/len(df_train)

0.0068033967323831756

These numbers are all very close.

In [40]:
# No component_no may appear in both the training and the test set
comps_in_train=set(df_train.component_no.unique())
comps_in_test=set(df_test.component_no.unique())
assert comps_in_train.isdisjoint(comps_in_test)

In [41]:
# Do not change these files unless the train-test split is changed
# df_train[['asin']].to_parquet("../Data/asins_in_splits/train_asins.parquet",compression='gzip')
# df_test[['asin']].to_parquet("../Data/asins_in_splits/test_asins.parquet",compression='gzip')

In [42]:
# Guard against accidental changes to the train-test split saved on 13 Jun:
# the freshly computed splits must match the stored asins row for row.
saved_train=pd.read_parquet("../Data/asins_in_splits/train_asins.parquet")
saved_test=pd.read_parquet("../Data/asins_in_splits/test_asins.parquet")
assert len(saved_train)==len(df_train)
assert not (saved_train.asin!=df_train.asin).any()
assert len(saved_test)==len(df_test)
assert not (saved_test.asin!=df_test.asin).any()

##### Save to compressed parquet

##### Additional train_final,validation split

In [45]:
# Split df_train once more, into the final training frame and a validation frame, using the same
# helper and label/group columns as the train-test split but a different seed.
# NOTE(review): the split semantics are defined in custom_ttsplit (not shown here); the cells
# below check the resulting ratios and that no component_no is shared between the two frames.
df_train_final,df_validation=StratifiedGroupSplit(df_train,'match','component_no',test_size=0.2,random_state=1043)

In [46]:
# Fraction of the training rows that ended up in the validation set
len(df_validation)/len(df_train)

0.19999875850424592

In [47]:
# Share of rows labelled 1 in df_train, i.e. before the train_final/validation split
len(df_train[df_train.match==1])/len(df_train)

0.0068033967323831756

In [48]:
# Share of rows labelled 1 in the validation set
len(df_validation[df_validation.match==1])/len(df_validation)

0.00679723144728266

In [49]:
# Share of rows labelled 1 in the train_final set
len(df_train_final[df_train_final.match==1])/len(df_train_final)

0.006804938041698674

These numbers are all very close.

In [50]:
# No component_no may appear in both the train_final and the validation set
comps_in_valid=set(df_validation.component_no.unique())
comps_in_train_final=set(df_train_final.component_no.unique())
assert comps_in_train_final.isdisjoint(comps_in_valid)

In [51]:
# # Do not change these files unless the train-test split is changed
# df_train_final[['asin']].to_parquet("../Data/asins_in_splits/train_final_asins.parquet",compression='gzip')
# df_validation[['asin']].to_parquet("../Data/asins_in_splits/validation_asins.parquet",compression='gzip')

In [52]:
# Guard against accidental changes to the train_final/validation split saved on 15 Jun:
# the freshly computed splits must match the stored asins row for row.
saved_train_final=pd.read_parquet("../Data/asins_in_splits/train_final_asins.parquet")
saved_valid=pd.read_parquet("../Data/asins_in_splits/validation_asins.parquet")
assert len(saved_train_final)==len(df_train_final)
assert not (saved_train_final.asin!=df_train_final.asin).any()
assert len(saved_valid)==len(df_validation)
assert not (saved_valid.asin!=df_validation.asin).any()

##### Drop unneeded columns

In [60]:
# Column labels of combined_df. This rebinds `columns`, which held the list of metadata
# columns when the data was loaded at the top of the notebook.
columns=combined_df.columns

In [63]:
# Show the first 30 column labels (the non-embedding features plus the first few embed_* columns)
columns[:30]

Index(['asin', 'category', 'title', 'missing_price', 'item_rank', 'match',
       'incident_indices', 'component_no', 'avg_rating', 'min_rating',
       'num_of_rating', 'percent_positive', 'percent_negative',
       'avg_verified_reviewers', 'min_date', 'max_date', 'product_lifespan',
       'num_bots_per_asin', 'unique_reviewer_count', 'avg_reviews_per_day',
       'reviews_per_product', 'avg_review_length_words',
       'avg_review_length_chars', 'embed_0', 'embed_1', 'embed_2', 'embed_3',
       'embed_4', 'embed_5', 'embed_6'],
      dtype='object')

In [77]:
# Name the columns to drop explicitly and derive their positions from the names:
# hard-coded positions would silently select different columns if the column order changed.
columns_to_drop=['asin','title','incident_indices','component_no','num_of_rating','avg_review_length_chars']
indices_to_drop=[columns.get_loc(col) for col in columns_to_drop]
print("Columns to drop:")
print(columns[indices_to_drop])

Columns to drop:
Index(['asin', 'title', 'incident_indices', 'component_no', 'num_of_rating',
       'avg_review_length_chars'],
      dtype='object')


In [81]:
# Keep every column of combined_df whose position is not listed in indices_to_drop.
# NOTE(review): df_test, df_train_final and df_validation were created from combined_df before
# this cell and are the frames written to parquet below, so this selection does not affect them —
# confirm whether the saved files were meant to have these columns removed as well.
combined_df=combined_df[[col for i,col in enumerate(columns) if i not in indices_to_drop ]]

##### Save to parquet files

In [82]:
# df_train.to_parquet("../Data/train_v.parquet", compression='gzip')

In [83]:
# Write the test split to a gzip-compressed parquet file
df_test.to_parquet("../Data/test_v2.parquet", compression='gzip')

In [84]:
# Write the final training split to a gzip-compressed parquet file
df_train_final.to_parquet("../Data/train_final_v2.parquet", compression='gzip')

In [85]:
# Write the validation split to a gzip-compressed parquet file
df_validation.to_parquet("../Data/validation_v2.parquet", compression='gzip')

In [None]:
# # Save md5sums of files
# import hashlib

# def calculate_md5(filepath):
#     md5_hash = hashlib.md5()
#     with open(filepath, "rb") as file:
#         # Read the file in chunks to handle large files
#         for chunk in iter(lambda: file.read(4096), b""):
#             md5_hash.update(chunk)
#     return md5_hash.hexdigest()

# os.chdir("../Data/")
# file_list=["test_v2.parquet","train_final_v2.parquet","validation_v2.parquet"]
# output_file = "../Data/md5_checksums.txt"
# with open(output_file, "w") as f:
#     for file_path in file_list:
#         md5_value = calculate_md5(file_path)
#         f.write(f"{md5_value}  {file_path}\n")
# os.chdir("../feature_extractions")