In [22]:
import hopsworks
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns
from matplotlib import pyplot
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import joblib
import os
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Log in to Hopsworks and obtain a handle to the project's feature store.
# You have to set the environment variable 'HOPSWORKS_API_KEY' for login to succeed
project = hopsworks.login()
# `fs` is the feature store handle used below to look up feature groups and feature views.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/194714
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# Build (or fetch, if it already exists) the feature view that feeds the models.
# A feature view is the input set of features for a model; its features may be
# selected from, and joined across, several feature groups. Here every column of
# version 1 of the "flight_data" feature group is selected, with "dep_delay_new"
# declared as the label.
flight_fg = fs.get_feature_group(name="flight_data", version=1)
query = flight_fg.select_all()
feature_view = fs.get_or_create_feature_view(
    name="flight_data",
    version=1,
    query=query,
    labels=["dep_delay_new"],
    description="Read from Flight Delay dataset",
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/194714/fs/194633/fv/flight_data/version/1


In [5]:
# Read the feature view's data and randomly split it into train/test sets of
# features (X) and labels (y); test_size=0.2 requests a 20% test split.
# NOTE(review): no seed is passed, so the split presumably differs between runs and
# the metrics below will not be exactly reproducible -- confirm whether a seed can be set.
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using ArrowFlight (20.13s) 




#### Random Forest Regressor Model

In [None]:
# Train a random forest *regressor* (the target, dep_delay_new, is continuous)
# and evaluate it on the held-out test set.
model_RF = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=1,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,       # fit the trees on all cores; only affects speed, not the fitted model
)

# scikit-learn expects a 1-D target here; passing a single-column DataFrame
# triggers a DataConversionWarning. np.ravel accepts both a DataFrame and an
# ndarray, so this works whether or not y_train has already been flattened.
model_RF.fit(X_train, np.ravel(y_train))

# Predict delays for the test set
y_pred = model_RF.predict(X_test)

# Regression metrics on the test set (RMSE is derived from MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Mean Squared Error (MSE): {mse}")

#### Linear Regression

In [26]:
# Ordinary least-squares baseline: fit on the training split, then score the
# predictions on the test split. `fit` returns the estimator itself, so
# construction and training are chained.
model_lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = model_lin_reg.predict(X_test)

# Test-set error metrics; MSE comes first because RMSE is derived from it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One line per metric, in the order RMSE, MAE, R-squared, MSE.
print(
    f"RMSE: {rmse}",
    f"MAE: {mae}",
    f"R-squared: {r2}",
    f"Mean Squared Error (MSE): {mse}",
    sep="\n",
)

RMSE: 50.6310994154544
MAE: 21.002474888964223
R-squared: 0.017153131048727288
Mean Squared Error (MSE): 2563.5082280176266


In [11]:
# Inspect the label column. This looks y_train up by column name, so it relies on
# y_train still being the pandas object returned by train_test_split; a later cell
# rebinds y_train to a NumPy array, after which this lookup no longer applies.
print(y_train['dep_delay_new'])

1            0.0
2           27.0
3            0.0
4           13.0
6            0.0
           ...  
1663626      0.0
1663627     17.0
1663628      7.0
1663629      0.0
1663630    119.0
Name: dep_delay_new, Length: 1330904, dtype: float64


In [12]:
# Inspect the training features (printed as a truncated text view of the frame).
print(X_train)

         year  quarter  month  day_of_month  day_of_week  origin_airport_id  \
1        2022        2      5            27            5              13930   
2        2021        4     11            10            3              13930   
3        2022        3      9            16            5              14100   
4        2021        1      1            20            3              11433   
6        2022        4     10            27            4              12892   
...       ...      ...    ...           ...          ...                ...   
1663626  2022        1      2            10            4              12892   
1663627  2021        2      5             4            2              11292   
1663628  2022        3      9            24            6              12478   
1663629  2022        4     12            10            6              12478   
1663630  2022        1      2            28            1              13204   

         origin_wac  dest_airport_id  dest_wac  crs

In [14]:
# Flatten the single-label target to a 1-D NumPy array.
# np.ravel is used instead of indexing by column name so the cell is idempotent:
# it yields the same array for the original single-column frame and is a no-op
# if y_train has already been converted (re-running no longer raises).
y_train = np.ravel(y_train)

#### Save to Hopsworks Model Registry 

[  0.  27.   0. ...   7.   0. 119.]


In [None]:
# Upload a trained model to the Hopsworks Model Registry.
# First get an object for the model registry.
mr = project.get_model_registry()

# Everything inside `model_dir` is uploaded with the model, so create it first.
model_dir = "flight delay model"
os.makedirs(model_dir, exist_ok=True)

# Model to register. model_lin_reg is the last model trained above; assign
# model_RF here instead to register the random forest.
model_to_register = model_lin_reg

# Serialize the model into `model_dir`; without this the uploaded directory is empty.
joblib.dump(model_to_register, os.path.join(model_dir, "flight_delay_model.pkl"))

# Compute the metrics from the model being registered, so they cannot get out of
# sync with whichever training cell happened to run last. These are regression
# metrics; "accuracy" is not defined for a regressor. Values are cast to plain
# floats so they serialize cleanly.
y_pred_registered = model_to_register.predict(X_test)
mse_registered = mean_squared_error(y_test, y_pred_registered)
model_metrics = {
    "rmse": float(np.sqrt(mse_registered)),
    "mae": float(mean_absolute_error(y_test, y_pred_registered)),
    "r2": float(r2_score(y_test, y_pred_registered)),
    "mse": float(mse_registered),
}

# Specify the schema of the model's input/output using the features (X_train) and labels (y_train)
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Create an entry in the model registry that includes the model's name, desc, metrics.
# NOTE(review): "fligt" looks like a typo for "flight"; the name is left unchanged
# because the registry entry may be looked up by this exact string elsewhere.
fligt_delay_model = mr.python.create_model(
    name="fligt_delay_model",
    metrics=model_metrics,
    model_schema=model_schema,
    description="Flight Delay Predictor"
)

# Upload the model to the model registry, including all files in 'model_dir'
fligt_delay_model.save(model_dir)