In [None]:
import os

import pandas as pd
import tqdm

In [None]:
# Gzip-compressed CSV holding the image metadata table.
metadata_path = './data/abo-images-small/images/metadata/images.csv.gz'
# Directory containing the listings shards. The trailing slash matters: the
# loader builds each file name by concatenating this string with the shard name.
listings_path = './data/abo-listings/listings/metadata/'

In [None]:
# Load the image metadata table from the gzip-compressed CSV.
metadata_df = pd.read_csv(
    metadata_path,
    compression='gzip',
)

In [None]:
# Preview the first rows of the image metadata.
metadata_df.head()

In [None]:
# Read every listings shard and stack them into a single frame. The shards are
# named listings_0 ... listings_9 and listings_a ... listings_f (one file per
# hexadecimal digit), so one pass over the 16 suffixes covers all of them.
df_list = [
    pd.read_json(
        f'{listings_path}listings_{shard}.json.gz',
        lines=True,  # shards are read as newline-delimited JSON
        compression='gzip',
    )
    for shard in tqdm.tqdm('0123456789abcdef')
]

# ignore_index=True gives the combined frame one unique RangeIndex; without it
# each shard contributes its own 0..n-1 labels and the index contains duplicates.
listings_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Preview the first rows of the combined listings.
listings_df.head()

In [None]:
# List the columns available in the listings data.
listings_df.columns

In [None]:
# Keep one listing per main image, then attach that image's metadata row.
# merge() defaults to an inner join, so listings whose main_image_id has no
# matching image_id in metadata_df are dropped.
merged_df = (
    listings_df
    .drop_duplicates(subset='main_image_id')
    .merge(metadata_df, left_on='main_image_id', right_on='image_id')
)

In [None]:
# Spot-check the raw 'node' value of a single merged row. The row label is
# hard-coded, so this raises KeyError if label 54094 is not in the index.
merged_df.loc[54094, 'node']

In [None]:
# List the columns available after the merge.
merged_df.columns

In [None]:
# Keep only the four columns that the following cells work with.
filtered_df = merged_df.loc[:, ['item_name', 'country', 'node', 'path']]

In [None]:
# Keep only the rows whose 'country' value is exactly 'US'.
filtered_df = filtered_df.loc[filtered_df['country'].eq('US')]

In [None]:
# remove unnecessary metadata and store only string value title and categories

def extract_node_name(nodes):
    """Return the ``node_name`` of the first entry in ``nodes``.

    Parameters
    ----------
    nodes : object
        Expected to be a list of dicts; only the first element is inspected.
        Anything else (None, NaN, ``pd.NA``, an empty list, or a list whose
        first item is not a dict or has no ``'node_name'`` key) is treated as
        missing.

    Returns
    -------
    object
        ``nodes[0]['node_name']`` when it is available, otherwise ``pd.NA``.
    """
    # Check the type before the truthiness: bool(pd.NA) raises TypeError, so
    # testing isinstance first means missing values never reach a truth test
    # and no blanket exception handler is needed.
    if not isinstance(nodes, list) or not nodes:
        return pd.NA
    first = nodes[0]
    if isinstance(first, dict) and 'node_name' in first:
        return first['node_name']
    return pd.NA

def get_en_us_name(item_names):
    """Return the ``en_US`` value from a list of localized name entries.

    Parameters
    ----------
    item_names : object
        Expected to be a list (or tuple) of dicts carrying ``'language_tag'``
        and ``'value'`` keys. A value of any other type (None, NaN, ``pd.NA``)
        yields ``pd.NA`` instead of raising, and entries that are not dicts or
        lack ``'language_tag'`` are skipped.

    Returns
    -------
    object
        The ``'value'`` of the first entry tagged ``'en_US'``, or ``pd.NA``
        when no such entry exists.
    """
    # Missing cells arrive as non-iterable scalars (e.g. float NaN); bail out
    # early so apply() does not fail with "object is not iterable".
    if not isinstance(item_names, (list, tuple)):
        return pd.NA
    for entry in item_names:
        if isinstance(entry, dict) and entry.get('language_tag') == 'en_US':
            return entry.get('value', pd.NA)
    return pd.NA

# Build the derived columns with assign(), which returns a new DataFrame, so
# nothing is written into the boolean-filtered slice (column assignment on such
# a slice can raise SettingWithCopyWarning).
filtered_df = (
    filtered_df
    .assign(
        node_name=filtered_df['node'].apply(extract_node_name),
        product_name=filtered_df['item_name'].apply(get_en_us_name),
    )
    # dropna() discards a row when ANY column is missing, which includes rows
    # where either helper returned pd.NA.
    .dropna()
    # Split the '/'-separated node name into its non-empty segments (a leading
    # '/' would otherwise produce an empty first element).
    .assign(
        categories=lambda frame: frame['node_name'].str.split('/').apply(
            lambda segments: [segment for segment in segments if segment]
        )
    )
)

In [None]:
# Inspect the last rows, including the derived columns.
filtered_df.tail()

In [None]:
# Take an explicit copy of the three output columns so that later column
# assignments modify final_df itself rather than a slice of filtered_df
# (assigning into a slice can raise SettingWithCopyWarning).
final_df = filtered_df[['product_name', 'categories', 'path']].copy()

In [None]:
# Inspect the last rows of the selected output columns.
final_df.tail()

In [None]:
# Drop the first segment of every category list. An empty list becomes pd.NA
# so that a subsequent dropna() removes the row.
# NOTE: this cell is not idempotent - re-running it strips one more segment.
final_df['categories'] = final_df['categories'].apply(
    lambda cats: pd.NA if len(cats) == 0 else cats[1:]
)

In [None]:
# Remove every row that has a missing value in any column (the dropna defaults,
# spelled out explicitly).
final_df = final_df.dropna(axis=0, how='any')

In [None]:
# Inspect the cleaned frame.
final_df

In [None]:
# Length of the longest category list.
final_df['categories'].map(len).max()

In [None]:
# Right-pad each category list with 'UNK' up to 6 entries; lists that already
# hold 6 or more entries are left untouched.
final_df['categories'] = final_df['categories'].apply(
    lambda cats: cats if len(cats) >= 6 else cats + ['UNK'] * (6 - len(cats))
)

In [None]:
# Preview the frame after padding the category lists.
final_df.head()

In [None]:
# Length of the shortest category list after padding.
final_df['categories'].map(len).min()

In [None]:
# Spread the first six category levels into columns category_1 ... category_6.
for level in range(1, 7):
    final_df[f'category_{level}'] = final_df['categories'].apply(
        lambda cats, pos=level - 1: cats[pos]
    )

In [None]:
# Preview the frame with the per-level category columns added.
final_df.head()

In [None]:
# The list column is no longer needed once the per-level columns exist.
# Rebinding the result (instead of inplace=True) leaves the previous frame
# object untouched and keeps the step chainable.
final_df = final_df.drop(columns='categories')

In [None]:
# Preview the final layout that gets written to disk.
final_df.head()

In [None]:
# to_csv() does not create missing parent directories and raises if the folder
# is absent, so make sure the output directory exists first.
os.makedirs('dataset', exist_ok=True)
# The DataFrame index is written as an unnamed first column (pandas default).
final_df.to_csv('dataset/dataset.csv')