In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from ast import literal_eval
import os
tqdm.pandas()

In [2]:
# Load the product metadata, keeping only the columns this notebook uses.
columns=['asin','category', 'title', 'price','also_buy', 'also_view']

# First run only: convert the line-delimited json export into a pickle cache.
if not os.path.exists("../Data/metadata_raw.pkl"):
    print("Converting json file to pkl")
    # Chained so the full raw frame is never kept bound to a name.
    pd.read_json("../Data/amazon_meta.json",lines=True).to_pickle('../Data/metadata_raw.pkl')

# Always read back from the pickle, whether it was just written or already existed.
combined_df=pd.read_pickle("../Data/metadata_raw.pkl")[columns]

##### Cleaning Metadata

In [3]:
# Treat junk price values (blank, or text starting with '.a-box-inner') as missing

def filter_junk(price):
    """Return ``price`` unchanged unless it is a junk value, in which case return None.

    A price is junk when it is the empty string or a string starting with
    '.a-box-inner'.

    Parameters
    ----------
    price : object
        One value of the ``price`` column.

    Returns
    -------
    object
        None for junk strings; otherwise ``price`` itself. Non-string values
        (for example None or NaN) are returned as they are, so values that
        are already missing stay missing instead of raising on ``len()``.
    """
    if not isinstance(price,str):
        # None/NaN have no length and no prefix to test; pass them through.
        return price
    if price=='' or price.startswith('.a-box-inner'):
        return None
    return price

# Replace junk price strings with None, element by element.
combined_df['price']=combined_df['price'].apply(filter_junk)

In [4]:
# Add missing_price feature: True where the cleaned price is null.
# Might indicate item is no longer for sale/out of stock?
combined_df['missing_price']=combined_df['price'].isnull()

In [5]:
# Remove the price column itself; only the missing_price flag is kept.
combined_df=combined_df.drop('price',axis=1)

In [6]:
# Replace category with top-level subcategory
def extract_subcategory(cat):
    """Return the second entry of the category path ``cat``, or None if it has none.

    Parameters
    ----------
    cat : sequence or None
        Category path of one product; the element at position 1 is the one
        kept. (Position 0 is presumably the root department -- confirm
        against the raw metadata.)

    Returns
    -------
    object or None
        ``cat[1]`` when ``cat`` has at least two entries, otherwise None.
        Missing values (None or float NaN) also give None.
    """
    if cat is None or isinstance(cat,float):
        # None/NaN: nothing to index into.
        return None
    # At least two entries are needed for cat[1]; the previous guard (len>0)
    # raised IndexError on one-element paths.
    if len(cat)>1:
        return cat[1]
    return None

# Overwrite each category path with its extracted subcategory (with a progress bar).
combined_df['category']=combined_df['category'].progress_apply(extract_subcategory)

100%|██████████| 633883/633883 [00:00<00:00, 747588.41it/s]


In [7]:
# Combine also_buy and also_view into a single list
def combine(entry):
    """Merge a row's ``also_buy`` and ``also_view`` lists into one duplicate-free list.

    Parameters
    ----------
    entry : object
        Anything exposing list attributes ``also_buy`` and ``also_view``
        (a DataFrame row when used with ``apply(axis=1)``).

    Returns
    -------
    list or None
        The distinct items in first-seen order (also_buy first, then
        also_view), or None when both lists are empty.
    """
    merged=entry.also_buy+entry.also_view
    # dict.fromkeys removes duplicates while keeping first-seen order.
    # list(set(...)) ordered strings by their per-process randomised hash, so
    # the list order (and any tie-break that depends on it) could change
    # between kernel restarts.
    unique_items=list(dict.fromkeys(merged))
    return unique_items or None
    
# Row-wise apply (axis=1): combine needs both also_buy and also_view of the same row.
# progress_apply is the tqdm-wrapped apply, so a progress bar is printed.
combined_df['similar']=combined_df.progress_apply(combine,axis=1)

  0%|          | 0/633883 [00:00<?, ?it/s]

100%|██████████| 633883/633883 [00:27<00:00, 23294.42it/s]


In [8]:
# We now fill in missing categories
# We create a lookup table (asin -> category) of all items that already have a category.
# The (asin, category) pairs are read straight from the two columns instead of
# iterrows(), which builds a full Series for every row. If an asin occurs more
# than once, the last row wins, exactly as with the row-by-row loop.
lookup_cat=dict(
    combined_df.loc[combined_df.category.notna(),['asin','category']]
    .itertuples(index=False,name=None)
)

In [9]:
# The following function predicts the category using the category of similar (also_view or also_buy) products.
# We pick the most frequently occurring such category
from collections import Counter

def most_frequent(lst):
    """Return the element of ``lst`` with the highest occurrence count.

    Ties are resolved however ``Counter.most_common`` resolves them.
    Raises IndexError when ``lst`` is empty.
    """
    ranked=Counter(lst).most_common(1)
    winner,_count=ranked[0]
    return winner

def predict_category_similar_prods(entry,i=0):
    """Return the row's category, inferring it from similar products when it is missing.

    Parameters
    ----------
    entry : row-like
        Must expose ``category`` and ``similar`` attributes.
    i : int, optional
        Not used by the function body; kept so existing calls still work.

    Returns
    -------
    object or None
        ``entry.category`` when it is already set. Otherwise the most frequent
        category, according to ``lookup_cat``, among the products listed in
        ``entry.similar``. None when there are no similar products or none of
        them is present in ``lookup_cat``.
    """
    # Already categorised: nothing to predict.
    if entry.category is not None:
        return entry.category

    # No similar products to borrow a category from.
    if entry.similar is None:
        return None

    # -1 marks "asin not in the lookup table"; such products are skipped.
    looked_up=(lookup_cat.get(prod,-1) for prod in entry.similar)
    known=[cat for cat in looked_up if cat!=-1]

    return most_frequent(known) if len(known)>0 else None

In [10]:
# Fill in as many missing category values as possible.
# Rows that already have a category keep it; the rest get a prediction or stay None.
combined_df['category']=combined_df.apply(predict_category_similar_prods,axis=1)

In [11]:
# Drop the list-valued (non-hashable) columns now that category has been filled in
combined_df=combined_df.drop(['also_buy','also_view','similar'],axis=1)

In [12]:
# Drop duplicate rows
# No subset is given, so rows are compared on every remaining column.
combined_df=combined_df.drop_duplicates()

In [13]:
# Verify asins are unique
# is_unique is a plain bool. Comparing duplicated().unique() with [False] produced
# an array, so a real duplicate raised ValueError (ambiguous truth value) instead
# of AssertionError, and an empty frame failed although it has no duplicates.
assert combined_df.asin.is_unique, "combined_df contains duplicated asins"

In [14]:
# Import the ranks and remove fully identical rows.
ranks_df=pd.read_parquet("../Data/meta_ranks.parquet")
ranks_df=ranks_df.drop_duplicates()

# Verify no duplicated asins
# is_unique gives a plain bool, so a violation raises AssertionError (the previous
# array comparison raised ValueError whenever duplicates were actually present).
assert ranks_df.asin.is_unique, "ranks_df contains duplicated asins"

In [15]:
# Merge ranks with combined_df (default inner join: products without a rank row are dropped)
combined_df=pd.merge(combined_df,ranks_df,on='asin')

In [16]:
# Drop items with a missing rank or a missing category
# Written as a normal method call that rebinds the name, rather than the unbound
# pd.DataFrame.dropna(..., inplace=True) form; the resulting rows are the same.
combined_df=combined_df.dropna(axis=0,subset=['category','item_rank'])

##### Add matches and components

In [17]:
# Import matches with components
# index_col=0: the first CSV column becomes the index instead of a data column.
# NOTE(review): presumably this file supplies the `match` and `component_no`
# columns used further down -- confirm against the CSV.
matches_df=pd.read_csv("../Data/amazon_df_labels_with_comps.csv",index_col=0)

In [18]:
# Merge matches with combined_df (default inner join on asin)
combined_df=pd.merge(combined_df,matches_df,on='asin')

In [19]:
# Import undropped asins after cleaning review_data.
review_cleaned_asins=pd.read_csv("../Data/asin_labels_clean_review_df.csv")

# Check no duplicate asin
# is_unique gives a plain bool, so a violation raises AssertionError (the previous
# array comparison raised ValueError whenever duplicates were actually present).
assert review_cleaned_asins.asin.is_unique, "review_cleaned_asins contains duplicated asins"

In [20]:
# Drop entries from combined_df that don't appear in review_cleaned_asins.
# Only the asin column is merged in, so the inner join filters rows without adding columns.
combined_df=pd.merge(combined_df,review_cleaned_asins[['asin']],how='inner',on='asin')

##### Add reviews features

In [21]:
# Load Reviews_df (from reviews_features.ipynb)
reviews_features_df=pd.read_parquet("final_reviews.parquet")

# Verify no duplicated asins
# is_unique gives a plain bool, so a violation raises AssertionError (the previous
# array comparison raised ValueError whenever duplicates were actually present).
assert reviews_features_df.asin.is_unique, "reviews_features_df contains duplicated asins"

In [22]:
# Merge meta and review datasets (default inner join on asin)
combined_df=pd.merge(combined_df,reviews_features_df,on='asin')

##### Add embeddings

In [23]:
# Load embeddings
embeddings=pd.read_pickle("../Data/agg_summary_embeddings.pkl")

# Verify no duplicated asins
# is_unique gives a plain bool, so a violation raises AssertionError (the previous
# array comparison raised ValueError whenever duplicates were actually present).
assert embeddings.asin.is_unique, "embeddings contains duplicated asins"

In [24]:
# Left merge: every product is kept even when it has no row in embeddings.
# set_axis re-applies the index combined_df had before this merge (the argument is
# evaluated before the name is rebound), so row labels are unchanged by the merge.
combined_df=combined_df.merge(embeddings,on='asin',how='left').set_axis(combined_df.index)

##### Randomly dropping items labelled zero

To obtain a more manageable data set, we randomly drop items whose ``match`` column is zero.

In [25]:
# Class balance of the match label before down-sampling
num_zeros=combined_df['match'].eq(0).sum()
num_ones=combined_df['match'].eq(1).sum()
print(
    f"There are {num_zeros} products labelled 0.",
    f"There are {num_ones} products labelled 1.",
    sep="\n",
)

There are 546348 products labelled 0.
There are 1370 products labelled 1.


In [26]:
# Number of 0-labelled items kept in the final dataset.
TARGET=200_000

# There must be more zeros available than we intend to keep.
assert num_zeros>TARGET

In [27]:
# indices of items labelled zero
zero_indices=combined_df[combined_df.match==0].index
# indices of items labelled one
one_indices=combined_df[combined_df.match==1].index

# random subset of TARGET items labelled zero (seeded generator, sampled without replacement)
rng=np.random.default_rng(seed=1067)
random_subset=rng.choice(zero_indices,TARGET,replace=False)

# total list of indices: the sampled zeros followed by every one
# np.concatenate is used because the np.concat alias only exists in NumPy >= 2.0
indices=np.concatenate([random_subset,one_indices])

In [28]:
# Drop rows of combined_df with index not in indices
# .loc with an array of labels also orders the rows as in `indices`:
# the sampled zeros first, then the ones.
combined_df=combined_df.loc[indices]

In [29]:
# Confirm the label counts after down-sampling
print(combined_df.match.value_counts())

match
0    200000
1      1370
Name: count, dtype: int64


##### Stratified train-test split preserving groups

In [30]:
from custom_ttsplit import StratifiedGroupSplit

In [31]:
# Split combined_df into train and test parts with test_size=0.2 and a fixed random_state.
# StratifiedGroupSplit comes from the project's custom_ttsplit module (not shown here).
# NOTE(review): presumably it stratifies on 'match' and keeps each 'component_no'
# group entirely on one side -- the cells below check the ratios and the group overlap.
df_train,df_test=StratifiedGroupSplit(combined_df,'match','component_no',test_size=0.2,random_state=1066)

In [32]:
# Check test ratio: fraction of all rows that landed in the test set
len(df_test)/len(combined_df)

0.2

In [33]:
# Check ratio of 1s in set before split
len(combined_df[combined_df['match']==1])/len(combined_df)

0.0068033967323831756

In [34]:
# Check ratio of 1s in test set
len(df_test[df_test['match']==1])/len(df_test)

0.0068033967323831756

In [35]:
# Check ratio of 1s in training set
len(df_train[df_train['match']==1])/len(df_train)

0.0068033967323831756

These numbers are all very close.

In [36]:
# Check there are no component overlaps: no component_no may occur in both splits
comps_in_test=set(df_test['component_no'].unique())
comps_in_train=set(df_train['component_no'].unique())
assert comps_in_train.isdisjoint(comps_in_test)

In [37]:
# Do not change these files unless the train-test split is changed
# df_train[['asin']].to_parquet("../Data/train_asins.parquet",compression='gzip')
# df_test[['asin']].to_parquet("../Data/test_asins.parquet",compression='gzip')

In [38]:
# Check we haven't changed the train-test split asins from the split on 13 Jun
saved_train=pd.read_parquet("../Data/train_asins.parquet")
saved_test=pd.read_parquet("../Data/test_asins.parquet")

# Training split: same number of rows and no asin that differs from the saved one
assert len(saved_train)==len(df_train)
assert not (saved_train.asin!=df_train.asin).any()

# Test split: same two checks
assert len(saved_test)==len(df_test)
assert not (saved_test.asin!=df_test.asin).any()

##### Save to compressed parquet

In [None]:
# Write the training split (all columns) as a gzip-compressed parquet file
df_train.to_parquet("../Data/train_v1.parquet", compression='gzip')

In [None]:
# Write the test split (all columns) as a gzip-compressed parquet file
df_test.to_parquet("../Data/test_v1.parquet", compression='gzip')

In [None]:
# # Save md5sums of files
# import hashlib

# def calculate_md5(filepath):
#     md5_hash = hashlib.md5()
#     with open(filepath, "rb") as file:
#         # Read the file in chunks to handle large files
#         for chunk in iter(lambda: file.read(4096), b""):
#             md5_hash.update(chunk)
#     return md5_hash.hexdigest()

# os.chdir("../Data/")
# file_list=["train_v1.parquet","test_v1.parquet"]
# output_file = "../Data/md5_checksums.txt"
# with open(output_file, "w") as f:
#     for file_path in file_list:
#         md5_value = calculate_md5(file_path)
#         f.write(f"{md5_value}  {file_path}\n")
# os.chdir("../feature_extractions")