# Training Data & Feature Views

In [30]:
import pandas as pd
import numpy as np
import datetime
import hopsworks
import random
import gdown
import pickle

### Load the data using pickle 

In [32]:
# Load the DataFrame stored in the local pickle file 'ratebeer.pkl' into `df`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so this
# is only safe if 'ratebeer.pkl' comes from a trusted source — confirm its provenance.
with open('ratebeer.pkl', 'rb') as file:
    df = pickle.load(file)

### Connecting to Hopsworks

In [33]:
# Log in to Hopsworks and keep a handle to the project.
project = hopsworks.login()
# `fs` is the project's feature store handle; the feature view helpers below take it as an argument.
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550037
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;"> ⚙️ Feature View Creation </span>

In [36]:
def create_user_beer_rating_feature_view(fs, version=1):
    """Create the ``user_beer_rating_feature_view`` feature view.

    Joins the ``beer_features``, ``review_features``, ``agg_reviews`` and
    ``reviewer_metrics`` feature groups, attaches a ``min_max_scaler``
    transformation to the four review sub-scores, and registers the result
    with ``review_overall`` as the label.

    Args:
        fs: Feature store handle providing ``get_feature_group``,
            ``get_transformation_function`` and ``create_feature_view``.
        version: Version used both to look up every feature group and to
            version the feature view that is created.

    Returns:
        Whatever ``fs.create_feature_view`` returns.

    NOTE(review): ``beer_features`` is the left-most feature group of the
    query. The ValueError recorded later in this notebook reports that the
    left feature group has no event time column, which blocks time-based
    splits — confirm whether the join should start from a group that has one.
    """
    # Look up the four source feature groups, all at the same version.
    source_names = ('beer_features', 'review_features', 'agg_reviews', 'reviewer_metrics')
    beers, reviews, review_aggregates, reviewer_stats = [
        fs.get_feature_group(group_name, version=version)
        for group_name in source_names
    ]

    # Build the join step by step, starting from the beer features.
    joined_query = beers.select_all()
    joined_query = joined_query.join(
        reviews.select_all(), on=['beer_beerid', 'review_profilename']
    )
    joined_query = joined_query.join(review_aggregates.select_all(), on='beer_beerid')
    joined_query = joined_query.join(reviewer_stats.select_all(), on='review_profilename')

    # Scale the review sub-scores only; the label (review_overall) is left untouched.
    # One transformation function is requested per column.
    scaled_columns = ('review_aroma', 'review_taste', 'review_appearance', 'review_palate')
    scalers = {
        column: fs.get_transformation_function(name='min_max_scaler')
        for column in scaled_columns
    }

    view_settings = dict(
        name='user_beer_rating_feature_view',
        version=version,
        query=joined_query,
        labels=['review_overall'],
        transformation_functions=scalers,
        description="Feature view aggregating user ratings with beer characteristics and review metrics",
    )
    return fs.create_feature_view(**view_settings)

# Build the feature view and report the outcome instead of letting the cell fail.
# NOTE(review): on failure only a message is printed, so `feature_view` is not
# (re)assigned by this cell — later cells that use it depend on a successful run.
try:
    feature_view = create_user_beer_rating_feature_view(fs)
except Exception as creation_error:
    print("Failed to create feature view:", creation_error)
else:
    print("Feature view created successfully.")

[<hsfs.feature_group.FeatureGroup object at 0x16f495d90>]
[<hsfs.feature_group.FeatureGroup object at 0x16ded16a0>]
[<hsfs.feature_group.FeatureGroup object at 0x16ec8e7b0>]
[<hsfs.feature_group.FeatureGroup object at 0x127bbfb30>]
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fv/user_beer_rating_feature_view/version/1
Feature view created successfully.


In [45]:
def create_simple_feature_view(fs, version=1):
    """Create the ``simple_feature_view`` feature view.

    Joins ``beer_features`` with ``review_features`` on ``beer_beerid`` and
    ``review_profilename`` and attaches a ``min_max_scaler`` transformation to
    the four review sub-scores. No label is declared on this view.

    Args:
        fs: Feature store handle providing ``get_feature_group``,
            ``get_transformation_function`` and ``create_feature_view``.
        version: Version used both to look up the feature groups and to
            version the feature view that is created.

    Returns:
        Whatever ``fs.create_feature_view`` returns.
    """
    beers = fs.get_feature_group('beer_features', version=version)
    reviews = fs.get_feature_group('review_features', version=version)

    # beer_features is the left side of the join, review_features the right.
    beer_side = beers.select_all()
    review_side = reviews.select_all()
    joined_query = beer_side.join(review_side, on=['beer_beerid', 'review_profilename'])

    # One 'min_max_scaler' transformation function is requested per scaled column.
    scalers = {}
    for column in ('review_aroma', 'review_taste', 'review_appearance', 'review_palate'):
        scalers[column] = fs.get_transformation_function(name='min_max_scaler')

    return fs.create_feature_view(
        name='simple_feature_view',
        version=version,
        query=joined_query,
        transformation_functions=scalers,
        description="Simple feature view with beer and review features",
    )

# Build the simple feature view and report the outcome instead of letting the cell fail.
# NOTE(review): on failure only a message is printed, so `simple_feature_view` is not
# (re)assigned by this cell — later cells that use it depend on a successful run.
try:
    simple_feature_view = create_simple_feature_view(fs)
except Exception as creation_error:
    print("Failed to create simple feature view:", creation_error)
else:
    print("Simple feature view created successfully.")

[<hsfs.feature_group.FeatureGroup object at 0x16ded35f0>]
[<hsfs.feature_group.FeatureGroup object at 0x16ece1bb0>]
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fv/simple_feature_view/version/1
Simple feature view created successfully.


In [46]:
# Attempt a time-based train/test split on `simple_feature_view`, one dataset per date window.
# NOTE(review): the output recorded for this cell is
# "ValueError: Cannot generate dataset(s) from the given start/end time because event time
# column is not available in the left feature groups", so this cell did not complete.
# The date literals match the values printed by the calculate_split_dates cell in this notebook.

# Training dataset: rows bounded by start_time / end_time.
td_train_version, td_job_train = simple_feature_view.create_training_data(
    start_time="2000-04-12 00:00:00",
    end_time="2009-12-30 00:00:00",
    description='Training dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    # presumably returns without blocking on the materialization job — confirm in the hsfs docs
    write_options={'wait_for_job': False},
)

# Test dataset: starts at the timestamp where the training window ends.
td_test_version, td_job_test = simple_feature_view.create_training_data(
    start_time="2009-12-30 00:00:00",
    end_time="2012-01-13 00:00:00",
    description='Test dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    write_options={'wait_for_job': False},
)

ValueError: Cannot generate dataset(s) from the given start/end time because event time column is not available in the left feature groups. A start/end time should not be provided as parameters.

In [39]:
def calculate_split_dates(df, train_fraction=0.7):
    """Derive chronological train/test boundary dates from ``df['review_time']``.

    The ``review_time`` values are sorted in ascending order and the value at
    position ``int(train_fraction * len(df))`` is used as the boundary: it is
    both the end of the training window and the start of the test window.

    Args:
        df: DataFrame with a ``review_time`` column. It is not modified.
        train_fraction: Fraction of the rows (in time order) assigned to the
            training window. Must be within ``[0, 1]``; defaults to ``0.7``.

    Returns:
        A tuple ``(start_date_training, end_date_training, start_date_test,
        end_date_test)`` where ``end_date_training == start_date_test``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        IndexError: If ``df`` has no rows.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )

    # Only the timestamp column is needed, so sort just that column instead of
    # reordering every column of the DataFrame. Reading from the column also
    # avoids the dtype coercion that extracting a whole row can introduce.
    review_times = df['review_time'].sort_values()
    n_reviews = len(review_times)
    if n_reviews == 0:
        raise IndexError("cannot calculate split dates from an empty DataFrame")

    # Clamp so that train_fraction=1.0 selects the last row rather than
    # indexing one past the end.
    boundary_index = min(int(train_fraction * n_reviews), n_reviews - 1)
    boundary_date = review_times.iloc[boundary_index]

    start_date_training = review_times.iloc[0]
    end_date_test = review_times.iloc[-1]

    # The boundary timestamp closes the training window and opens the test window.
    return start_date_training, boundary_date, boundary_date, end_date_test

In [41]:
# Compute the chronological split boundaries from `df` and show them.
start_train, end_train, start_test, end_test = calculate_split_dates(df)
print(f"Training Set Start Date: {start_train}")
print(f"Training Set End Date: {end_train}")
print(f"Test Set Start Date: {start_test}")
print(f"Test Set End Date: {end_test}")


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)


Training Set Start Date: 2000-04-12 00:00:00
Training Set End Date: 2009-12-30 00:00:00
Test Set Start Date: 2009-12-30 00:00:00
Test Set End Date: 2012-01-13 00:00:00


In [44]:
# Attempt a time-based train/test split on `feature_view`, one dataset per date window.
# NOTE(review): the output recorded for this cell is
# "ValueError: Cannot generate dataset(s) from the given start/end time because event time
# column is not available in the left feature groups", so this cell did not complete.
# The date literals match the values printed by the calculate_split_dates cell in this notebook.

# Training dataset: rows bounded by start_time / end_time.
td_train_version, td_job_train = feature_view.create_training_data(
    start_time="2000-04-12 00:00:00",
    end_time="2009-12-30 00:00:00",
    description='Training dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    # presumably returns without blocking on the materialization job — confirm in the hsfs docs
    write_options={'wait_for_job': False},
)

# Test dataset: starts at the timestamp where the training window ends.
td_test_version, td_job_test = feature_view.create_training_data(
    start_time="2009-12-30 00:00:00",
    end_time="2012-01-13 00:00:00",
    description='Test dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    write_options={'wait_for_job': False},
)

ValueError: Cannot generate dataset(s) from the given start/end time because event time column is not available in the left feature groups. A start/end time should not be provided as parameters.

In [43]:
# Create a dataset from `feature_view` without any start/end time filter.
# NOTE(review): no dates are passed here, so this is not the time-based split
# computed by calculate_split_dates.
td_train_version, td_job_train = feature_view.create_training_data(
    description='Training dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    # presumably returns without blocking on the materialization job — confirm in the hsfs docs
    write_options={'wait_for_job': False},
)

# Create a second dataset with the same arguments; only `description` differs.
# NOTE(review): presumably this "test" dataset holds the same rows as the training
# dataset above rather than a held-out split — confirm before using it for evaluation.
td_test_version, td_job_test = feature_view.create_training_data(
    description='Test dataset for user beer ratings',
    data_format="csv",
    coalesce=True,
    write_options={'wait_for_job': False},
)


Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/user_beer_rating_feature_view_1_create_fv_td_06052024165254/executions




Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/user_beer_rating_feature_view_1_create_fv_td_06052024165301/executions


