# Training Data & Feature Views

In [1]:
import pandas as pd
import numpy as np
import datetime
import hopsworks
import random
import gdown
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Load the data using pickle 

In [2]:
# Load the DataFrame stored in the local 'ratebeer.pkl' pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only run this against a pickle file you created yourself or otherwise trust.
with open('ratebeer.pkl', 'rb') as file:
    df = pickle.load(file)

### Connecting to Hopsworks

In [3]:
# Log in to Hopsworks and keep a handle to the project.
# NOTE(review): no credentials are hard-coded here; presumably they come from
# the environment or an interactive prompt — confirm.
project = hopsworks.login()
# Feature store handle of the project; the cells below use it to look up
# feature groups and to create the feature view.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550037




Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;"> ⚙️ Feature View Creation </span>

In [4]:
def create_user_beer_rating_feature_view(fs, version=1):
    """Register the 'user_beer_rating_feature_view' feature view.

    The view joins four feature groups ('beer_features', 'review_features',
    'agg_reviews', 'reviewer_metrics'), applies a 'min_max_scaler'
    transformation to four review score columns and uses 'review_overall'
    as the label.

    Args:
        fs: Feature store handle. It must provide ``get_feature_group``,
            ``get_transformation_function`` and ``create_feature_view``.
        version: Version used both to look up every feature group and to
            register the feature view itself.

    Returns:
        Whatever ``fs.create_feature_view`` returns.

    NOTE(review): 'beer_features' is the left-most feature group of the
    query. The training-data cell of this notebook recorded the error
    "event time column is not available in the left feature groups";
    presumably that refers to this feature group — confirm.
    """
    # Look up the source feature groups, all at the same version.
    fg_beer = fs.get_feature_group('beer_features', version=version)
    fg_review = fs.get_feature_group('review_features', version=version)
    fg_agg = fs.get_feature_group('agg_reviews', version=version)
    fg_reviewer = fs.get_feature_group('reviewer_metrics', version=version)

    # Build the query one join at a time, starting from the beer features.
    joined = fg_beer.select_all()
    joined = joined.join(fg_review.select_all(), on=['beer_beerid', 'review_profilename'])
    joined = joined.join(fg_agg.select_all(), on='beer_beerid')
    joined = joined.join(fg_reviewer.select_all(), on='review_profilename')

    # One scaler lookup per review score column; the label column is left out.
    scaled_columns = ('review_aroma', 'review_taste', 'review_appearance', 'review_palate')
    scalers = {
        column: fs.get_transformation_function(name='min_max_scaler')
        for column in scaled_columns
    }

    view_settings = dict(
        name='user_beer_rating_feature_view',
        version=version,
        query=joined,
        labels=['review_overall'],  # 'review_overall' is treated as the target variable
        transformation_functions=scalers,
        description="Feature view aggregating user ratings with beer characteristics and review metrics",
    )
    return fs.create_feature_view(**view_settings)

# Create the feature view (labels are not transformed). If creation fails —
# presumably because the view already exists from an earlier run; confirm —
# report the error and reuse the registered view instead.
try:
    feature_view = create_user_beer_rating_feature_view(fs)
    print("Feature view created successfully.")
except Exception as e:
    print("Failed to create feature view:", e)
    # Without this fallback `feature_view` would stay unbound and the next
    # cell would fail with a NameError that hides the real cause. If the
    # lookup fails as well, its exception propagates and stops the notebook
    # at the actual problem. Name and version match the defaults used by
    # create_user_beer_rating_feature_view.
    feature_view = fs.get_feature_view('user_beer_rating_feature_view', version=1)
    print("Reusing the existing feature view.")

[<hsfs.feature_group.FeatureGroup object at 0x11290aa50>]
[<hsfs.feature_group.FeatureGroup object at 0x112dd1a90>]
[<hsfs.feature_group.FeatureGroup object at 0x16dd416d0>]
[<hsfs.feature_group.FeatureGroup object at 0x16dd429f0>]
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fv/user_beer_rating_feature_view/version/1
Feature view created successfully.


In [5]:
def split_dfs(df, train_fraction=0.8):
    """Compute the date boundaries of a chronological train/test split.

    The 'review_time' values are sorted ascending; the earliest
    ``train_fraction`` of the rows form the training part and the remaining
    rows form the test part.

    Args:
        df: DataFrame with a 'review_time' column whose values support
            ``.date()`` (e.g. pandas Timestamps). It is not modified.
        train_fraction: Share of rows assigned to the training part.
            Defaults to 0.8.

    Returns:
        dict with the keys 'train_start', 'train_end', 'test_start' and
        'test_end', each holding the ``.date()`` of the earliest/latest
        'review_time' in the corresponding part.

    Raises:
        KeyError: If ``df`` has no 'review_time' column.
        ValueError: If either part of the split would be empty.

    Note:
        The boundaries are truncated to calendar dates, so 'train_end' and
        'test_start' can be the same day when the split point falls inside
        a day.
    """
    # Only the timestamp column is needed, so sort that instead of the whole frame.
    review_times = df['review_time'].sort_values()

    # Compute the split position once and reuse it for both parts.
    split_at = int(len(review_times) * train_fraction)
    train_times = review_times.iloc[:split_at]
    test_times = review_times.iloc[split_at:]

    if train_times.empty or test_times.empty:
        raise ValueError(
            "Cannot split {} row(s) with train_fraction={}: "
            "both the train and the test part need at least one row.".format(
                len(review_times), train_fraction
            )
        )

    # Vectorised min/max instead of iterating the Series in Python.
    return {
        'train_start': train_times.min().date(),
        'train_end': train_times.max().date(),
        'test_start': test_times.min().date(),
        'test_end': test_times.max().date(),
    }

# Date boundaries of the chronological split of `df`
# (keys: 'train_start', 'train_end', 'test_start', 'test_end').
split_dict = split_dfs(df)

In [7]:
# Create the training dataset, restricted to the training date range via an event-time filter.
# NOTE(review): as recorded in this cell's output, the call fails with
# "ValueError: Cannot generate dataset(s) from the given start/end time because
# event time column is not available in the left feature groups."
# Presumably the left-most feature group of the feature view query was created
# without an event time column — confirm. The likely fix is outside this cell:
# define an event time on that feature group, or start the query from a
# feature group that has one.
td_train_version, td_job = feature_view.create_training_data(
        start_time = split_dict["train_start"],
        end_time = split_dict["train_end"],    
        description = 'Training dataset for user beer rating model',
        data_format = "csv",
        coalesce = True,
        write_options = {'wait_for_job': False},
    )

ValueError: Cannot generate dataset(s) from the given start/end time because event time column is not available in the left feature groups. A start/end time should not be provided as parameters.