In [1]:
import os
import hopsworks
import polars as pl

In [2]:
# Read the API key from a local file (so it is not hardcoded in the notebook)
# and export it as HOPSWORKS_API_KEY before logging in without explicit
# credentials. Trailing whitespace/newline in the key file is stripped.
with open('data/hopsworks-api-key.txt', 'r') as key_file:
    os.environ.update(HOPSWORKS_API_KEY=key_file.read().rstrip())

project = hopsworks.login()

2024-12-29 13:11:22,404 INFO: Initializing external client
2024-12-29 13:11:22,405 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-29 13:11:24,181 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159324


## Get data

In [3]:
from data_collection_preprocessing.load_data import pipeline

In [4]:
# Run the project's data-loading pipeline, requesting 100 hits.
# The frame printed below has 100 rows and 9 columns.
df = pipeline(no_of_hits=100)

shape: (100, 9)
┌──────────┬────────────┬────────────┬────────────┬───┬──────────┬───────┬────────────┬────────────┐
│ id       ┆ sold_at    ┆ category_p ┆ condition  ┆ … ┆ size     ┆ color ┆ sold_price ┆ designers_ │
│ ---      ┆ ---        ┆ ath        ┆ ---        ┆   ┆ ---      ┆ ---   ┆ ---        ┆ title      │
│ i64      ┆ datetime[μ ┆ ---        ┆ str        ┆   ┆ str      ┆ str   ┆ i64        ┆ ---        │
│          ┆ s]         ┆ str        ┆            ┆   ┆          ┆       ┆            ┆ str        │
╞══════════╪════════════╪════════════╪════════════╪═══╪══════════╪═══════╪════════════╪════════════╡
│ 68525863 ┆ 2024-12-29 ┆ bottoms.de ┆ is_gently_ ┆ … ┆ 28       ┆ blue  ┆ 230        ┆ Amiri:     │
│          ┆ 10:27:50.5 ┆ nim        ┆ used       ┆   ┆          ┆       ┆            ┆ Amiri      │
│          ┆ 96         ┆            ┆            ┆   ┆          ┆       ┆            ┆ jeans size │
│          ┆            ┆            ┆            ┆   ┆          ┆       ┆ 

## Feature processing

In [5]:
from data_collection_preprocessing.embeddings import embed_text
from data_collection_preprocessing.load_data import item_condition_to_ordinal

In [6]:
# TODO: Represent the categorical variables (this will most likely need a
# feature store, since the mappings have to be stored somewhere).
# For now only `designers_title` is taken into account.
# Future work: account for `category_path_size` (as embeddings), potentially
# use separate embeddings for designers and title, and encode `condition`
# with one-hot encoding or as an ordinal number (the ordinal variant is what
# the next cell does).

# Embed the designers_title text row by row; each embedding is stored as a
# list of float32 values in a new `designers_title_embedding` column.
df = df.with_columns(
    designers_title_embedding=pl.col('designers_title').map_elements(
        embed_text, return_dtype=pl.List(pl.Float32)
    )
)

In [7]:
# Map the textual `condition` to an unsigned-integer ordinal stored in
# `condition_ordinal`, then discard the original `condition` column.
df = (
    df
    .with_columns(
        condition_ordinal=pl.col('condition').map_elements(
            item_condition_to_ordinal, return_dtype=pl.UInt32
        )
    )
    .drop(['condition'])
)

In [8]:
# Display the processed frame: it now carries `designers_title_embedding` and
# `condition_ordinal`, and the raw `condition` column is gone.
df

id,sold_at,category_path,description,size,color,sold_price,designers_title,designers_title_embedding,condition_ordinal
i64,datetime[μs],str,str,str,str,i64,str,list[f32],u32
68525863,2024-12-29 10:27:50.596,"""bottoms.denim""","""28 Lightly worn Fast shipping …","""28""","""blue""",230,"""Amiri: Amiri jeans size 28 MX1…","[-0.024531, 0.025716, … -0.057847]",2
64932590,2024-12-29 10:27:39.467,"""bottoms.denim""","""- Brand: Dsquared2 - Size: 50…","""33""","""black""",285,"""Dsquared2: Coated Cotton Cool …","[-0.039874, 0.024087, … -0.08562]",3
72695107,2024-12-29 10:26:06.951,"""tops.sweaters_knitwear""","""Very good condition""","""m""","""white""",35,"""Coloured Cable Knit Sweater La…","[-0.039036, 0.041019, … -0.050647]",2
65749322,2024-12-29 10:25:53.945,"""bottoms.denim""","""Bought from a consignment shop…","""34""","""blue""",14,"""Enyce: Jorts""","[-0.050827, -0.022776, … 0.017482]",2
69780192,2024-12-29 10:25:19.403,"""accessories.hats""","""Good vintage condition. Slight…","""one size""","""white""",22,"""Porsche Design Racing Streetwe…","[-0.055865, -0.018139, … 0.006345]",2
…,…,…,…,…,…,…,…,…,…
62291576,2024-12-29 09:01:14.075,"""tops.short_sleeve_shirts""","""Hello! The condition of the …","""xl""","""grey""",27,"""Nike Streetwear Vintage: Vinta…","[-0.040814, 0.023022, … -0.039927]",1
72546621,2024-12-29 09:00:54.741,"""accessories.wallets""","""Older item Interior usage Corn…","""one size""","""black""",100,"""Vivienne Westwood: Orb Card Ho…","[0.002611, -0.017359, … -0.023223]",1
72520578,2024-12-29 08:59:07.674,"""accessories.jewelry_watches""","""Item: in good working order, s…","""one size""","""white""",50,"""Marlboro Streetwear Vintage: Y…","[-0.062457, -0.015853, … -0.054628]",2
67157613,2024-12-29 08:58:25.621,"""bottoms.denim""","""32x32 Distressed denim is almo…","""32""","""blue""",58,"""Oro Los Angeles: ORO LOS ANGEL…","[0.026655, -0.000038, … -0.010498]",2


## Save data

In [9]:
# Handle to the logged-in project's feature store; used below to create/fetch the feature group.
fs = project.get_feature_store() 

In [10]:
# Fetch version 1 of the `grailed_items` feature group, or define it if it does
# not exist yet. Rows are keyed by `id` and `sold_at` is the event time.
# NOTE: no expectation suite is attached yet (`expectation_suite=` is left out).
grailed_items_fg = fs.get_or_create_feature_group(
    name="grailed_items",
    version=1,
    description="Sold Grailed items",
    primary_key=["id"],
    event_time="sold_at",
)

In [11]:
# Upload the processed rows into the feature group. Per the log output below,
# this also launches an offline materialization job; the call's return value
# (a job handle and None) is displayed as the cell result.
grailed_items_fg.insert(df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1159324/fs/1150027/fg/1394323


Uploading Dataframe: 100.00% |█| Rows 100/100 | Elapsed Time: 00:02 | Remaining Time


Launching job: grailed_items_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1159324/jobs/named/grailed_items_1_offline_fg_materialization/executions


(Job('grailed_items_1_offline_fg_materialization', 'SPARK'), None)

In [None]:
# TODO: Update feature description