In [1]:
import os
import hopsworks
import pandas as pd
import numpy as np

In [2]:
with open('data/hopsworks-api-key.txt', 'r') as file:
    os.environ["HOPSWORKS_API_KEY"] = file.read().rstrip()
    
project = hopsworks.login()
fs = project.get_feature_store()

2024-12-29 13:30:25,138 INFO: Initializing external client
2024-12-29 13:30:25,139 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-29 13:30:27,053 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1159324


In [3]:
grailed_items_fg = fs.get_feature_group(
    name='grailed_items',
    version=1,
)

In [4]:
grailed_items_fg.features

[Feature('id', 'bigint', None, True, False, None, None, 1394323),
 Feature('sold_at', 'timestamp', None, False, False, None, None, 1394323),
 Feature('category_path', 'string', None, False, False, None, None, 1394323),
 Feature('description', 'string', None, False, False, None, None, 1394323),
 Feature('size', 'string', None, False, False, None, None, 1394323),
 Feature('color', 'string', None, False, False, None, None, 1394323),
 Feature('sold_price', 'bigint', None, False, False, None, None, 1394323),
 Feature('designers_title', 'string', None, False, False, None, None, 1394323),
 Feature('designers_title_embedding', 'array<float>', None, False, False, None, None, 1394323),
 Feature('condition_ordinal', 'bigint', None, False, False, None, None, 1394323)]

## Create feature view

In [5]:
selected_features = grailed_items_fg.select(['sold_price', 'designers_title_embedding', 'category_path', 'size', 'color', 'condition_ordinal'])
selected_features.show(10)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.04s) 


Unnamed: 0,sold_price,designers_title_embedding,category_path,size,color,condition_ordinal
0,70,"[-0.0035999876, -0.014603166, -0.0431731, -0.0...",tops.short_sleeve_shirts,l,white,1
1,50,"[-0.011395679, 0.020809747, 0.007503037, 0.022...",bottoms.casual_pants,36,black,1
2,150,"[0.019931091, 0.018887337, 0.003918661, 0.0048...",bottoms.denim,32,brown,1
3,360,"[-0.0048367777, 0.018785886, -0.015241521, -0....",accessories.belts,34,black,2
4,150,"[-0.029613871, -0.04543135, 0.007882361, -0.04...",footwear.hitop_sneakers,8,grey,1
5,85,"[-0.0017983527, -0.053600196, 0.031824138, 0.0...",accessories.hats,28,black,1
6,144,"[-0.030162571, -0.0075903414, 0.047487054, -0....",accessories.jewelry_watches,one size,silver,3
7,210,"[-0.012790641, 0.010913067, -0.0066601876, 0.0...",accessories.wallets,one size,black,3
8,75,"[-0.01947031, 0.032118134, 0.04220958, -0.0403...",bottoms.sweatpants_joggers,30,black,2
9,125,"[-0.045513783, 0.015168834, -0.005308694, 0.01...",tops.short_sleeve_shirts,m,black,2


In [6]:
feature_view = fs.get_or_create_feature_view(
    name='grailed_items_fv',
    version=2,
    labels=['sold_price'],
    query=selected_features,
)

In [7]:
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.34s) 




In [8]:
X_train

Unnamed: 0,designers_title_embedding,category_path,size,color,condition_ordinal
0,"[-0.0035999876, -0.014603166, -0.0431731, -0.0...",tops.short_sleeve_shirts,l,white,1
1,"[-0.011395679, 0.020809747, 0.007503037, 0.022...",bottoms.casual_pants,36,black,1
3,"[-0.0048367777, 0.018785886, -0.015241521, -0....",accessories.belts,34,black,2
4,"[-0.029613871, -0.04543135, 0.007882361, -0.04...",footwear.hitop_sneakers,8,grey,1
5,"[-0.0017983527, -0.053600196, 0.031824138, 0.0...",accessories.hats,28,black,1
...,...,...,...,...,...
94,"[-0.0008724189, 0.05182697, -0.020572301, -0.0...",tops.sweaters_knitwear,m,black navy,2
95,"[-0.0068363273, -0.020503161, -0.023378907, -0...",accessories.belts,34,black,2
96,"[-0.058070865, 0.033359855, 0.006958423, 0.032...",accessories.belts,one size,brown,3
97,"[-0.0624573, -0.015852991, -0.00038503917, -0....",accessories.jewelry_watches,one size,white,2


## Create a predictive model

Candidates include: XGBoost, CatBoost (handles categorical features natively), KNN or a neural net.

In [9]:
# Convert embeddings while preserving index
embedding_cols = pd.DataFrame(
    np.vstack(X_train['designers_title_embedding'].to_numpy()),
    columns=[f'embedding_{i}' for i in range(len(X_train['designers_title_embedding'].iloc[0]))],
    index=X_train.index  # Preserve the index
)

# Create processed dataframes with index alignment
X_train_processed = pd.concat([
    X_train.drop('designers_title_embedding', axis=1),
    embedding_cols
], axis=1)

# Do the same for test data
embedding_cols_test = pd.DataFrame(
    np.vstack(X_test['designers_title_embedding'].to_numpy()),
    columns=[f'embedding_{i}' for i in range(len(X_test['designers_title_embedding'].iloc[0]))],
    index=X_test.index  # Preserve the index
)
X_test_processed = pd.concat([
    X_test.drop('designers_title_embedding', axis=1),
    embedding_cols_test
], axis=1)

# Make sure indices match
X_train_processed = X_train_processed.loc[y_train.index]
X_test_processed = X_test_processed.loc[y_test.index]

raw_categorical_features = ['category_path', 'size', 'color']

In [10]:
from catboost import CatBoostRegressor, Pool

# Initialize the model
model = CatBoostRegressor(
    cat_features=raw_categorical_features,
    iterations=1000,
    # learning_rate=0.1,
    # depth=6,
    loss_function='RMSE',
    verbose=100  # Print training progress every 100 iterations
)

# Create CatBoost Pool objects
train_pool = Pool(
    data=X_train_processed,
    label=y_train,
    cat_features=raw_categorical_features
)

test_pool = Pool(
    data=X_test_processed,
    label=y_test,
    cat_features=raw_categorical_features
)

# Train the model
model.fit(
    train_pool,
    eval_set=test_pool,
    early_stopping_rounds=50
)

Learning rate set to 0.034252
0:	learn: 119.3052644	test: 72.1690833	best: 72.1690833 (0)	total: 92.8ms	remaining: 1m 32s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 69.05351645
bestIteration = 37

Shrink model to first 38 iterations.


<catboost.core.CatBoostRegressor at 0x7fceea2a59d0>

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

# Predicting target values on the test set
y_pred = model.predict(X_test_processed)

# Calculating Mean Squared Error (MSE) using sklearn
mse = mean_squared_error(y_test.iloc[:,0], y_pred)
print("MSE:", mse)

# Calculating R squared using sklearn
r2 = r2_score(y_test.iloc[:,0], y_pred)
print("R squared:", r2)

MSE: 4768.388133434634
R squared: -0.27711198351637445


In [12]:
# TODO: Is the order correct?
predicted_df = y_test
predicted_df['predicted_price'] = y_pred

In [13]:
predicted_df

Unnamed: 0,sold_price,predicted_price
2,150,143.808986
9,125,119.229408
25,16,114.320981
26,58,128.80229
32,70,106.582098
40,43,132.053062
52,140,113.228035
63,24,131.931554
64,90,128.370765
65,100,124.584424


### "Outdated" models

In [None]:
import numpy as np
from xgboost import XGBRegressor

# Convert the embeddings column into a numpy array of separate columns
X_train_array = np.vstack(X_train['designers_title_embedding'].to_numpy())
X_test_array = np.vstack(X_test['designers_title_embedding'].to_numpy())

# Now train the model with the reshaped data
xgb_regressor = XGBRegressor()
xgb_regressor.fit(X_train_array, y_train)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train_array, y_train)

In [None]:
# TODO: Plot performance, consider saving the plots as images

In [None]:
# TODO: Plot feature importance

## Model registry

In [None]:
# TODO: Save the model locally

In [None]:
# TODO: Save the model to model registry