In [1]:
import hopsworks


In [2]:
from utils.feature_engineering import get_latest_sold_products
# Display the first item returned by the helper to inspect the raw fields of a single listing
# (the recorded output is a dict with keys such as 'category', 'color' and 'condition').
get_latest_sold_products()[0]

{'badges': [],
 'bumped_at': '2024-11-14T08:08:59.642Z',
 'buynow': True,
 'category_path_size': 'accessories.glasses.one_size',
 'category_path': 'accessories.glasses',
 'category_size': 'accessories.one_size',
 'category': 'accessories',
 'color': 'black',
 'condition': 'is_gently_used',
 'cover_photo': {'id': 499271392,
  'listing_id': 69721474,
  'image': None,
  'created_at': '2024-10-31T21:22:14.129Z',
  'updated_at': '2024-10-31T21:22:14.139Z',
  'photoable_id': 69721474,
  'photoable_type': 'Listing',
  'width': None,
  'height': None,
  'url': 'https://media-assets.grailed.com/prd/listing/temp/261c9931296240198f86adecd55db705',
  'position': 0,
  'rotate': 0,
  'image_url': 'https://media-assets.grailed.com/prd/listing/temp/261c9931296240198f86adecd55db705'},
 'created_at_i': 1730201777,
 'created_at': '2024-10-29T11:36:17.537Z',
 'deleted': False,
 'department': 'menswear',
 'description': 'Product name:  Christian Dior glasses for prescription \n\n100% Authentic - Money Back

In [3]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before logging in
# (presumably this is where the Hopsworks credentials come from — TODO confirm).
load_dotenv()

# Log in to Hopsworks; the returned project handle is used later to obtain the feature store.
project = hopsworks.login()

2025-01-05 18:58:50,652 INFO: Initializing external client
2025-01-05 18:58:50,654 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 18:58:52,187 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159324


## Get data

### Quick data exploration
Taken from https://graildient-descent.streamlit.app/eda, using 10,000 samples.
We could do the same using the larger dataset.

#### Numerical/quantitative features
Target variable (sold_price):
- Most sold items fall between $35 and $135 (consider plotting bins). The distribution is far from normal, so we should look at outliers and maybe try a log transformation.

Number of photos:
- Another numerical feature, could be added
- Price increases until 13 photos, then inconsistent

#### Categorical/qualitative features
- high-cardinality: designer, color, subcategory, size
- low-cardinality: category, condition

Target encoding is a better fit for high-cardinality features, whereas low-cardinality features could be one-hot encoded.

- designer is a strong predictor
- we skip department, to focus on men's clothing only (it has better representation)
- there is good variation of sold prices in different subcategories => probably a good indicator
- could be interesting to use embeddings for color instead
- condition - perfect for ordinal encoding. The better the condition, the higher the average sold price.

##### Text
- title/description - should we do any pre-processing?
- we could look at sentiment and similar text analysis approaches

#### Images
- could be interesting to add an embedding representation of the title image

## Feature processing

In [3]:
from utils.feature_engineering import pipeline
# Run the feature pipeline for 500 hits. Per the recorded output it embeds designer names,
# descriptions, titles, hashtags and size, and this run yielded 499 rows x 12 columns.
df = pipeline(no_of_hits=500)
# Show (rows, columns) as a quick sanity check on the result.
df.shape

embedding designer names
embedding descriptions
embedding titles
embedding hashtags
embedding size




shape: (499, 12)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ id       ┆ sold_at   ┆ designer_ ┆ descripti ┆ … ┆ size      ┆ color     ┆ followern ┆ sold_pric │
│ ---      ┆ ---       ┆ names     ┆ on        ┆   ┆ ---       ┆ ---       ┆ o         ┆ e         │
│ i64      ┆ datetime[ ┆ ---       ┆ ---       ┆   ┆ list[f32] ┆ i64       ┆ ---       ┆ ---       │
│          ┆ μs]       ┆ list[f32] ┆ list[f32] ┆   ┆           ┆           ┆ i64       ┆ i64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 59318624 ┆ 2025-01-0 ┆ [-0.07145 ┆ [-0.04419 ┆ … ┆ [0.0,     ┆ 264690080 ┆ 50        ┆ 60        │
│          ┆ 5 14:00:5 ┆ 7,        ┆ 6,        ┆   ┆ 0.0, …    ┆ 9         ┆           ┆           │
│          ┆ 6.954     ┆ 0.007094, ┆ 0.009599, ┆   ┆ 0.0]      ┆           ┆           ┆           │
│          ┆           ┆ … 0.0685… ┆ … -0.018… ┆   ┆           ┆          

(499, 12)

## Save data

In [5]:
# Get a handle to the project's feature store; it is used below to get or create the feature group.
fs = project.get_feature_store()

In [6]:
# Fetch version 4 of the draft feature group for sold Grailed items, creating it if needed.
# Rows are keyed by "id", and "sold_at" serves as the event-time column.
grailed_items_fg = fs.get_or_create_feature_group(
    name="draft_grailed_items",
    version=4,
    description="Sold Grailed items",
    primary_key=["id"],
    event_time="sold_at",
    # TODO: no expectation suite is attached yet; enable once one is available.
    # expectation_suite=aq_expectation_suite,
)

In [7]:
# Write the engineered features to the feature group. Per the recorded output, this uploads the
# rows and launches the offline materialization job for the feature group.
grailed_items_fg.insert(df)

Uploading Dataframe: 100.00% |██████████| Rows 499/499 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: draft_grailed_items_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1159324/jobs/named/draft_grailed_items_4_offline_fg_materialization/executions


(Job('draft_grailed_items_4_offline_fg_materialization', 'SPARK'), None)

In [8]:
# TODO: Update feature description