In [1]:
import pandas as pd
import os

## 1. Load Data

In [2]:
# All three dataset files sit in the sibling ``data`` directory.
data_dir = os.path.join('..', 'data')

examples_path = os.path.join(data_dir, 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join(data_dir, 'shopping_queries_dataset_products.parquet')
sources_path = os.path.join(data_dir, 'shopping_queries_dataset_sources.csv')

# Examples and products are stored as parquet; sources is a CSV file.
examples, products = (
    pd.read_parquet(parquet_path)
    for parquet_path in (examples_path, products_path)
)
sources = pd.read_csv(sources_path)

## 2. Clean and Split Data

Merge the examples with the products data, then restrict the result to English-only rows by keeping `product_locale == 'us'`.

In [3]:
# Both frames are joined on the (product_locale, product_id) pair.
join_keys = ['product_locale', 'product_id']

# Left-join the product columns onto every example row, then keep only the
# rows whose locale is 'us'. The chain binds the final, filtered frame once.
examples_products = (
    examples
    .merge(products, how='left', left_on=join_keys, right_on=join_keys)
    .loc[lambda merged: merged['product_locale'] == 'us']
)

Split the dataset into a reduced version and a large version: the reduced version is used for Task 1, and the large version is used for Tasks 2 and 3.

In [4]:
# Select the rows flagged for each dataset variant (flag column equal to 1).
reduced = examples_products.loc[examples_products['small_version'].eq(1)]
large = examples_products.loc[examples_products['large_version'].eq(1)]

## 3. Reduced Data EDA

In [5]:
# Report the (rows, columns) shape of the reduced subset.
shape_message = f'Reduced data shape: {reduced.shape}'
print(shape_message)

Reduced data shape: (601354, 14)


In [7]:
print('Columns and datatype info:')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
reduced.info()

Columns and datatype info:
<class 'pandas.core.frame.DataFrame'>
Index: 601354 entries, 16 to 2618569
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   example_id            601354 non-null  int64 
 1   query                 601354 non-null  object
 2   query_id              601354 non-null  int64 
 3   product_id            601354 non-null  object
 4   product_locale        601354 non-null  object
 5   esci_label            601354 non-null  object
 6   small_version         601354 non-null  int64 
 7   large_version         601354 non-null  int64 
 8   split                 601354 non-null  object
 9   product_title         601354 non-null  object
 10  product_description   301110 non-null  object
 11  product_bullet_point  531226 non-null  object
 12  product_brand         571704 non-null  object
 13  product_color         416407 non-null  object
dtypes: int64(4), object(10)
memory usage: 68.8+ 

In [8]:
# Count the missing values in each column of the reduced subset and print them.
null_report = f'Null values:\n{reduced.isnull().sum()}'
print(null_report)

Null values:
example_id                   0
query                        0
query_id                     0
product_id                   0
product_locale               0
esci_label                   0
small_version                0
large_version                0
split                        0
product_title                0
product_description     300244
product_bullet_point     70128
product_brand            29650
product_color           184947
dtype: int64
