In [1]:
import hopsworks
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#ML imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import ensemble
from sklearn.metrics import mean_squared_error

#model upload imports
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import joblib
import os

In [2]:
import datetime
year = datetime.date.today().year
year

2023

In [3]:
# You have to set the environment variable 'HOPSWORKS_API_KEY' for login to succeed
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/224406
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# The feature view is the input set of features for your model. The features can come from different feature groups.    
# You can select features from different feature groups and join them together to create a feature view

valuation_fg = fs.get_feature_group(name="valuationdataset", version=year-2022)
query = valuation_fg.select_all()
feature_view = fs.get_or_create_feature_view(name="valuationdataset_fv",
                                  version=year-2022,
                                  description="Read from valuation dataset",
                                  labels=["market_value_in_eur"],
                                  query=query)

In [5]:
# You can read training data, randomly split into train/test sets of features (X) and labels (y)  
data = False  
while not data: 
    try:   
        X_train, X_test, y_train, y_test = feature_view.train_test_split(0.0000001)
        #Drop date column, keeping it because might make sense later
        X_train = X_train.drop(columns=["player_id", "date"])
        X_test = X_test.drop(columns=["player_id", "date"])
        data = True
        
    except:
        pass

Finished: Reading data from Hopsworks, using ArrowFlight (1.55s) 




In [6]:
from sklearn.preprocessing import StandardScaler

# Assuming X is your features DataFrame
scaler = StandardScaler()
X_train[['age', 'height_in_cm', 'minutes_played']] = scaler.fit_transform(X_train[['age', 'height_in_cm', 'minutes_played']])
X_test[['age', 'height_in_cm', 'minutes_played']] = scaler.fit_transform(X_test[['age', 'height_in_cm', 'minutes_played']])

In [7]:
# Initialize a GradientBoostingRegressor
model = GradientBoostingRegressor(learning_rate=0.15, max_depth=4, min_samples_leaf=2, min_samples_split=8, n_estimators=100)
# Fit the new GridSearchCV
model.fit(X_train, y_train)

mse = mean_squared_error(y_test, model.predict(X_test))
rmse = np.sqrt(mse)  # Calculate the RMSE
print("RMSE: ", rmse)
mean = y_train.mean()["market_value_in_eur"]
print("mean: ", mean)



RMSE:  3326355.021959586
mean:  10335136.380390711


# upload to hopsworks

In [8]:
# We will now upload our model to the Hopsworks Model Registry. First get an object for the model registry.
mr = project.get_model_registry()

# The contents of the 'iris_model' directory will be saved to the model registry. Create the dir, first.
model_dir = f"valuation_model_v{year-1}_{year}"
if os.path.isdir(model_dir) == False:
    os.mkdir(model_dir)
    

# Save your model
joblib.dump(model, model_dir + "/valuation_model.pkl")

# Specify the schema of the model's input/output using the features (X_train) and labels (y_train)
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Create an entry in the model registry that includes the model's name, desc, metrics
valuation_model = mr.python.create_model(
    name=f"Valuation model season {year-1} to {year}", 
    metrics={"RMSE" : rmse, "Mean": mean},
    model_schema=model_schema,
    description=f"Valuation model based on data from the season {year-1}/{year}"
)

# Upload the model to the model registry, including all files in 'model_dir'
valuation_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1635 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/224406/models/valuation_model_v2022_2023/2


Model(name: 'valuation_model_v2022_2023', version: 2)