In [1]:
import os
import hopsworks
import polars as pl

In [2]:
# Load the Hopsworks API key from a local file and publish it through the
# HOPSWORKS_API_KEY environment variable before logging in.
with open('data/hopsworks-api-key.txt') as key_file:
    # rstrip() removes trailing whitespace such as the final newline.
    os.environ["HOPSWORKS_API_KEY"] = key_file.read().rstrip()

# No credentials are passed explicitly; presumably the client picks the key up
# from the environment variable set above.
project = hopsworks.login()

2024-12-29 19:07:55,883 INFO: Initializing external client
2024-12-29 19:07:55,884 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-29 19:07:57,562 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159324


## Get data

In [3]:
from data_collection_preprocessing.load_data import pipeline

In [4]:
# Collect the raw listings with the project's `pipeline` helper, asking for 300 hits.
# The output below shows the resulting frame has shape (300, 9).
raw_df = pipeline(no_of_hits=300)

shape: (300, 9)
┌──────────┬────────────┬────────────┬────────────┬───┬──────────┬───────┬────────────┬────────────┐
│ id       ┆ sold_at    ┆ category_p ┆ condition  ┆ … ┆ size     ┆ color ┆ sold_price ┆ designers_ │
│ ---      ┆ ---        ┆ ath        ┆ ---        ┆   ┆ ---      ┆ ---   ┆ ---        ┆ title      │
│ i64      ┆ datetime[μ ┆ ---        ┆ str        ┆   ┆ str      ┆ str   ┆ i64        ┆ ---        │
│          ┆ s]         ┆ str        ┆            ┆   ┆          ┆       ┆            ┆ str        │
╞══════════╪════════════╪════════════╪════════════╪═══╪══════════╪═══════╪════════════╪════════════╡
│ 58400938 ┆ 2024-12-29 ┆ outerwear. ┆ is_used    ┆ … ┆ s        ┆ black ┆ 175        ┆ Falcon     │
│          ┆ 16:11:08.0 ┆ leather_ja ┆            ┆   ┆          ┆       ┆            ┆ Garments:  │
│          ┆ 74         ┆ ckets      ┆            ┆   ┆          ┆       ┆            ┆ Falcon     │
│          ┆            ┆            ┆            ┆   ┆          ┆       ┆ 

### Quick data exploration
Taken from https://graildient-descent.streamlit.app/eda, using 10,000 samples.
We could do the same using the larger dataset.

#### Numerical/quantitative features
Target variable (sold_price):
- Most sold items are priced between $35 and $135 (consider plotting bins). The distribution is far from normal, so consider how to handle outliers; maybe try a log transformation?

Number of photos:
- Another numerical feature, could be added
- Price increases until 13 photos, then inconsistent

#### Categorical/qualitative features
- high-cardinality: designer, color, subcategory, size
- low-cardinality: category, condition

Target encoding is a better fit for the high-cardinality features, whereas the low-cardinality features could be one-hot encoded.

- designer is a strong predictor
- we skip department, to focus on men's clothing only (it has better representation)
- there is good variation of sold prices in different subcategories => probably a good indicator
- could be interesting to use embeddings for color instead
- condition - perfect for ordinal encoding. The better the condition, the higher the average sold price.

##### Text
- title/description - should we do any pre-processing?
- we could look at sentiment and similar text analysis approaches

#### Images
- could be interesting to add embedding representation of title image

In [5]:
# Drop any rows with null values
df = raw_df.drop_nulls()
raw_df.shape, df.shape

((300, 9), (295, 9))

## Feature processing

In [6]:
from data_collection_preprocessing.embeddings import embed_text
from data_collection_preprocessing.load_data import item_condition_to_ordinal

In [7]:
# Represent designers_title as an embedding
df = df.with_columns(
    pl.col('designers_title')
    .map_elements(embed_text, return_dtype=pl.List(pl.Float32))
    .alias('designers_title_embedding')
)

# Represent condition as an ordinal number?

In [8]:
# Convert condition to ordinal number
df = df.with_columns(
    pl.col('condition')
    .map_elements(item_condition_to_ordinal, return_dtype=pl.UInt32)
    .alias('condition_ordinal')
)

# Drop the old condition column
df = df.drop(['condition'])

In [9]:
# Display the processed frame to check the new embedding and ordinal columns.
df

id,sold_at,category_path,description,size,color,sold_price,designers_title,designers_title_embedding,condition_ordinal
i64,datetime[μs],str,str,str,str,i64,str,list[f32],u32
58400938,2024-12-29 16:11:08.074,"""outerwear.leather_jackets""","""Worn lightly, in great conditi…","""s""","""black""",175,"""Falcon Garments: Falcon Garmen…","[0.006814, 0.012294, … -0.029637]",1
62708822,2024-12-29 16:10:56.007,"""bottoms.sweatpants_joggers""","""Fear Of God Essentials Drawstr…","""32""","""black""",133,"""FOG Fear of God: Fear of God E…","[0.013592, -0.005749, … -0.024911]",3
69613557,2024-12-29 16:10:13.033,"""bottoms.denim""","""Brand is Minus two. worn twice…","""32""","""black""",8,"""Corteiz Japanese Brand Other: …","[0.0089, 0.08929, … -0.00968]",2
71289134,2024-12-29 16:10:04.614,"""footwear.lowtop_sneakers""","""Nike Performance Zoom Fly 6309…","""14""","""blue""",60,"""Nike: Nike Mens Zoom Fly 63091…","[-0.015823, 0.01715, … -0.028978]",3
72120393,2024-12-29 16:09:29.441,"""footwear.boots""","""Elevate your footwear collecti…","""11""","""brown""",40,"""Clarks: Clarks Buckle Biker Br…","[-0.016412, 0.003058, … -0.054447]",2
…,…,…,…,…,…,…,…,…,…
50071057,2024-12-29 12:25:01.263,"""tops.short_sleeve_shirts""","""Vintage Archie comic character…","""xl""","""white""",40,"""Japanese Brand Streetwear Vint…","[-0.066469, 0.04695, … -0.066343]",1
70648522,2024-12-29 12:24:19.092,"""tops.short_sleeve_shirts""","""Like NEW. Worn once!""","""m""","""blue""",50,"""Kith: KITH Giants Tee""","[-0.052704, 0.006904, … 0.014113]",2
72114439,2024-12-29 12:20:04.548,"""tops.sweatshirts_hoodies""","""barely used, authentic bape sh…","""l""","""gray""",299,"""Bape: Bape Color Camo Full Zip…","[-0.042123, 0.016466, … -0.06163]",2
71443508,2024-12-29 12:18:16.782,"""accessories.hats""","""- The cap in perfect condition…","""one size""","""red""",59,"""Marlboro Streetwear Vintage: M…","[-0.053722, -0.010562, … -0.053374]",2


## Save data

In [10]:
# Get a handle to the feature store of the logged-in project.
fs = project.get_feature_store() 

In [11]:
# Look up version 2 of the `grailed_items` feature group, defining it if needed.
# Rows are keyed by `id`, and `sold_at` serves as the event-time column.
grailed_items_fg = fs.get_or_create_feature_group(
    name='grailed_items',
    version=2,
    description='Sold Grailed items',
    primary_key=['id'],
    event_time="sold_at",
    # TODO: pass an `expectation_suite=` here once a suite is defined for this
    # feature group (the previously referenced `aq_expectation_suite` is not
    # defined anywhere in the visible notebook).
)

In [12]:
# Upload the processed rows to the feature group. The log below shows 295 rows
# being uploaded and an offline materialization job being launched.
grailed_items_fg.insert(df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1159324/fs/1150027/fg/1394342


Uploading Dataframe: 100.00% |██████| Rows 295/295 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: grailed_items_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1159324/jobs/named/grailed_items_2_offline_fg_materialization/executions


(Job('grailed_items_2_offline_fg_materialization', 'SPARK'), None)

In [None]:
# TODO: Add a description for each feature of the `grailed_items` feature group