In [1]:
import os
import hopsworks
import polars as pl

In [2]:
# The API key lives in a local text file so it never appears in the notebook.
# It is exported through the environment before logging in; trailing
# whitespace (e.g. the final newline) is stripped from the file contents.
with open('data/hopsworks-api-key.txt', 'r') as key_file:
    os.environ["HOPSWORKS_API_KEY"] = key_file.read().rstrip()

# Open the Hopsworks session; `project` is the handle used by the cells below.
project = hopsworks.login()

2025-01-05 15:09:17,065 INFO: Initializing external client
2025-01-05 15:09:17,066 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 15:09:19,144 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159324


## Get data

In [3]:
from data_collection_preprocessing.load_data import pipeline
from datetime import datetime

In [4]:
# Timestamp taken before the fetch starts; it tags the snapshot file name so
# each run writes its own parquet file.
current_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# Run the collection pipeline, then persist the raw result.
raw_df = pipeline(no_of_hits=2000)
snapshot_path = f'data/grailed_items_{current_time}.parquet'
raw_df.write_parquet(snapshot_path)

In [5]:
# Read back the snapshot written in the previous cell (same `current_time`
# tag), so the rest of the notebook works from the persisted file.
snapshot_file = f'data/grailed_items_{current_time}.parquet'
raw_df = pl.read_parquet(snapshot_file)
raw_df

id,sold_at,category_path,condition,description,size,color,sold_price,designers_title
i64,datetime[μs],str,str,str,str,str,i64,str
71647532,2025-01-05 14:08:23.859,"""tops.sweaters_knitwear""","""is_gently_used""","""FlameWood 🎁 🔥OPEN TO ALL OFF…","""s""","""red""",35,"""Allsaints Streetwear Vintage: …"
73021106,2025-01-05 14:08:04.773,"""tops.button_ups""","""is_gently_used""","""Gitman vintage cotton tweed in…","""l""","""brown check""",100,"""Gitman Bros. Vintage: Cotton t…"
68262295,2025-01-05 14:07:10.215,"""bottoms.denim""","""is_new""","""Shipped same or next business …","""32""","""indigo""",50,"""Richie Le Collection: Richie L…"
72992745,2025-01-05 14:06:46.201,"""accessories.bags_luggage""","""is_gently_used""","""🫣🫣Please check all Photos 🤗🤗al…","""one size""","""black red""",38,"""Freitag Japanese Brand Vintage…"
72984676,2025-01-05 14:05:30.309,"""tops.sweatshirts_hoodies""","""is_gently_used""","""Chrome Hearts Slo Ride Hoodie …","""xl""","""black""",645,"""Chrome Hearts: Chrome Hearts S…"
…,…,…,…,…,…,…,…,…
71251242,2025-01-05 04:38:16.677,"""tops.short_sleeve_shirts""","""is_new""","""Vengeance x Calido Scopic Coll…","""l""","""black""",100,"""Marvel Comics Streetwear Vinta…"
70156293,2025-01-05 04:37:40.605,"""accessories.hats""","""is_gently_used""","""composition: 65% wool + 35% vi…","""one size""","""green""",15,"""Japanese Brand Rolex Vintage: …"
72184683,2025-01-05 04:36:59.707,"""tops.jerseys""","""is_new""","""Nike New England Patriots Brad…","""xl""","""white/blue""",80,"""Jersey NFL Vintage: New Englan…"
71194299,2025-01-05 04:36:41.681,"""tops.short_sleeve_shirts""","""is_used""","""Condition good overall, has so…","""xxl""","""brown""",26,"""Sportswear Streetwear Vintage:…"


### Quick data exploration
Taken from https://graildient-descent.streamlit.app/eda, using 10,000 samples.
We could do the same using the larger dataset.

#### Numerical/quantitative features
Target variable (sold_price):
- Most sold items are priced between 35 and 135 USD (consider plotting the prices in bins). The distribution is far from normal, so we should look at outliers and maybe try a log transformation.

Number of photos:
- Another numerical feature, could be added
- Price increases until 13 photos, then inconsistent

#### Categorical/qualitative features
- high-cardinality: designer, color, subcategory, size
- low-cardinality: category, condition

Target encoding is a better fit for high-cardinality features, whereas low-cardinality features could be one-hot encoded.

- designer is a strong predictor
- we skip department, to focus on men's clothing only (it has better representation)
- there is good variation of sold prices in different subcategories => probably a good indicator
- could be interesting to use embeddings for color instead
- condition - perfect for ordinal encoding. The better the condition, the higher the average sold price.

##### Text
- title/description - should we do any pre-processing?
- we could look at sentiment and similar text analysis approaches

#### Images
- could be interesting to add embedding representation of title image

## Feature processing

In [6]:
from data_collection_preprocessing.embeddings import embed_text
from data_collection_preprocessing.load_data import item_condition_to_ordinal

In [7]:
# Apply embed_text to every designers_title value and keep the result as a
# list-of-float32 column next to the original text column.
title_embedding_expr = pl.col('designers_title').map_elements(
    embed_text,
    return_dtype=pl.List(pl.Float32),
)
df = raw_df.with_columns(title_embedding_expr.alias('designers_title_embedding'))

In [8]:
# Translate the textual condition into a number via item_condition_to_ordinal
# (stored as UInt32), then remove the original text column.
# NOTE: `df` is reassigned and `condition` is dropped, so this cell cannot be
# re-run on its own; re-run from the embedding cell that rebuilds `df`.
condition_rank_expr = (
    pl.col('condition')
    .map_elements(item_condition_to_ordinal, return_dtype=pl.UInt32)
    .alias('condition_ordinal')
)
df = df.with_columns(condition_rank_expr).drop(['condition'])

In [9]:
# Display the processed frame: the title embedding and ordinal condition have
# been added and the raw `condition` column removed.
df

id,sold_at,category_path,description,size,color,sold_price,designers_title,designers_title_embedding,condition_ordinal
i64,datetime[μs],str,str,str,str,i64,str,list[f32],u32
71647532,2025-01-05 14:08:23.859,"""tops.sweaters_knitwear""","""FlameWood 🎁 🔥OPEN TO ALL OFF…","""s""","""red""",35,"""Allsaints Streetwear Vintage: …","[-0.00976, 0.056274, … -0.034127]",2
73021106,2025-01-05 14:08:04.773,"""tops.button_ups""","""Gitman vintage cotton tweed in…","""l""","""brown check""",100,"""Gitman Bros. Vintage: Cotton t…","[-0.015262, 0.019917, … -0.035356]",2
68262295,2025-01-05 14:07:10.215,"""bottoms.denim""","""Shipped same or next business …","""32""","""indigo""",50,"""Richie Le Collection: Richie L…","[-0.013938, -0.017665, … -0.039873]",3
72992745,2025-01-05 14:06:46.201,"""accessories.bags_luggage""","""🫣🫣Please check all Photos 🤗🤗al…","""one size""","""black red""",38,"""Freitag Japanese Brand Vintage…","[0.00719, 0.054017, … -0.026897]",2
72984676,2025-01-05 14:05:30.309,"""tops.sweatshirts_hoodies""","""Chrome Hearts Slo Ride Hoodie …","""xl""","""black""",645,"""Chrome Hearts: Chrome Hearts S…","[-0.021588, 0.005552, … -0.005691]",2
…,…,…,…,…,…,…,…,…,…
71251242,2025-01-05 04:38:16.677,"""tops.short_sleeve_shirts""","""Vengeance x Calido Scopic Coll…","""l""","""black""",100,"""Marvel Comics Streetwear Vinta…","[-0.030552, 0.029717, … -0.031746]",3
70156293,2025-01-05 04:37:40.605,"""accessories.hats""","""composition: 65% wool + 35% vi…","""one size""","""green""",15,"""Japanese Brand Rolex Vintage: …","[-0.052672, 0.018519, … -0.060684]",2
72184683,2025-01-05 04:36:59.707,"""tops.jerseys""","""Nike New England Patriots Brad…","""xl""","""white/blue""",80,"""Jersey NFL Vintage: New Englan…","[-0.00336, -0.00741, … -0.0177]",3
71194299,2025-01-05 04:36:41.681,"""tops.short_sleeve_shirts""","""Condition good overall, has so…","""xxl""","""brown""",26,"""Sportswear Streetwear Vintage:…","[-0.024623, 0.027894, … -0.063672]",1


## Save data

In [10]:
# Feature store handle for the logged-in project; used below to look up the
# feature group.
fs = project.get_feature_store() 

In [11]:
# Look up (or register) version 2 of the `grailed_items` feature group.
# Rows are keyed by the listing `id`, and `sold_at` is the event time.
# TODO: attach a data-validation expectation suite; none is defined in this
# notebook yet, so the feature group is requested without one.
grailed_items_fg = fs.get_or_create_feature_group(
    name='grailed_items',
    version=2,
    description='Sold Grailed items',
    primary_key=['id'],
    event_time='sold_at',
)

In [12]:
# Upload the processed frame into the feature group. The call's return value
# is left as the cell output, alongside the upload/job log lines.
grailed_items_fg.insert(df)

Uploading Dataframe: 100.00% |███████████████████████████████████████| Rows 982/982 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: grailed_items_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1159324/jobs/named/grailed_items_2_offline_fg_materialization/executions


(Job('grailed_items_2_offline_fg_materialization', 'SPARK'), None)

In [13]:
# TODO: Update feature description