# Training Data & Feature Views

In [1]:
import pandas as pd
import numpy as np
import datetime
import hopsworks
import random
import gdown
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Load the data using pickle 

In [2]:
# Deserialize the reviews DataFrame from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load a file you trust.
with open('ratebeer.pkl', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

### Connecting to Hopsworks

In [3]:
# Log in to Hopsworks; the returned project handle gives access to the feature store.
project = hopsworks.login()
# `fs` is the feature store handle used below to look up feature groups and
# to create the feature view.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550037




Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;"> ⚙️ Feature View Creation </span>

In [30]:
def create_user_beer_feature_view(fs, version=1):
    """Create the ``user_beer_feature_view`` feature view in the feature store.

    Joins the beer, review, aggregated-review and reviewer-metric feature
    groups into a single query, attaches a ``min_max_scaler`` transformation
    to the four review score columns, and registers the result with
    ``review_overall`` as the label.

    Args:
        fs: Feature store handle providing ``get_feature_group``,
            ``get_transformation_function`` and ``create_feature_view``.
        version: Version used both to look up every source feature group and
            to register the feature view. Defaults to 1.

    Returns:
        Whatever ``fs.create_feature_view`` returns for the new feature view.
    """
    # Look up the four source feature groups, all at the same version.
    source_names = ('beer_features', 'review_features', 'agg_reviews', 'reviewer_metrics')
    beers, reviews, review_aggs, reviewer_stats = [
        fs.get_feature_group(fg_name, version=version) for fg_name in source_names
    ]

    # Grow the query one join at a time. `review_time` is left out of the
    # review selection and `review_count` out of the aggregated-review selection.
    joined = beers.select_all()
    joined = joined.join(
        reviews.select_except(["review_time"]),
        on=['beer_beerid', 'review_profilename'],
    )
    joined = joined.join(
        review_aggs.select_except(["review_count"]),
        on='beer_beerid',
    )
    joined = joined.join(reviewer_stats.select_all(), on='review_profilename')

    # Each scaled column gets its own transformation-function object.
    scaled_columns = ('review_aroma', 'review_taste', 'review_appearance', 'review_palate')
    scalers = {
        column: fs.get_transformation_function(name='min_max_scaler')
        for column in scaled_columns
    }

    feature_view = fs.create_feature_view(
        name='user_beer_feature_view',
        version=version,
        query=joined,
        labels=['review_overall'],  # treated as the target variable
        transformation_functions=scalers,
        description="Feature view aggregating user and beer features with review metrics",
    )
    return feature_view

# Create the feature view, or reuse the registered one when creation fails.
# Previously a failure was only printed, leaving `user_beer_feature_view`
# undefined (or stale from an earlier run) for the cells below.
try:
    user_beer_feature_view = create_user_beer_feature_view(fs)
    print("User and beer feature view created successfully.")
except Exception as e:
    print("Failed to create user and beer feature view:", e)
    # Presumably the usual cause on a re-run is that version 1 already exists,
    # so fetch it instead. If it does not exist either, this lookup raises and
    # the notebook stops here rather than further down with a NameError.
    user_beer_feature_view = fs.get_feature_view('user_beer_feature_view', version=1)
    print("Reusing existing user and beer feature view.")

[<hsfs.feature_group.FeatureGroup object at 0x16a8157f0>]
[<hsfs.feature_group.FeatureGroup object at 0x16a9d7410>]
[<hsfs.feature_group.FeatureGroup object at 0x16a9d4dd0>]
[<hsfs.feature_group.FeatureGroup object at 0x16a9d76b0>]
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fv/user_beer_feature_view/version/1
User and beer feature view created successfully.


In [31]:
import datetime

# Sort the loaded reviews (`df`) chronologically by 'review_time'; `data` is the
# sorted copy that the positional train/test split below operates on.
data = df.sort_values(by='review_time')

def split_dfs(df, train_frac=0.8):
    """Compute the date boundaries of a positional train/test split.

    The first ``train_frac`` share of the rows (by position) forms the
    training portion and the remaining rows form the test portion. The split
    is positional, so ``df`` should already be sorted by ``review_time`` for
    the result to be a split in time.

    Args:
        df: DataFrame with a datetime-like ``review_time`` column.
        train_frac: Fraction of rows assigned to the training portion; must
            lie strictly between 0 and 1. Defaults to 0.8.

    Returns:
        dict with keys ``train_start``, ``train_end``, ``test_start`` and
        ``test_end``; each value is ``.date()`` of the earliest or latest
        ``review_time`` in the corresponding portion.

    Raises:
        ValueError: If ``train_frac`` is outside (0, 1), or if either portion
            would contain no rows.
    """
    if not 0 < train_frac < 1:
        raise ValueError(f"train_frac must be between 0 and 1 (exclusive), got {train_frac!r}")

    # Compute the cut position once so both slices are guaranteed to agree.
    split_at = int(len(df) * train_frac)
    trainvals = df.iloc[:split_at]
    testvals = df.iloc[split_at:]
    if trainvals.empty or testvals.empty:
        raise ValueError(
            f"cannot split {len(df)} row(s) with train_frac={train_frac!r}: "
            "both the train and the test portion need at least one row"
        )

    # Series.min()/max() are vectorised and skip missing timestamps, unlike the
    # builtin min()/max(), which iterate element by element.
    train_times = trainvals['review_time']
    test_times = testvals['review_time']
    return {'train_start': train_times.min().date(),
            'train_end': train_times.max().date(),
            'test_start': test_times.min().date(),
            'test_end': test_times.max().date()}

# Start/end dates of the train and test windows, derived from the sorted reviews.
split_dict = split_dfs(data)

In [32]:
# Materialise the train and test portions as two separate datasets, each
# restricted to its own event-time window taken from `split_dict`.
# NOTE(review): the window bounds are calendar dates taken from adjacent rows of
# the sorted frame, so `train_end` and `test_start` may be the same day. Confirm
# how the time filter treats that boundary day.

def _td_options():
    """Return a fresh copy of the settings shared by both dataset jobs."""
    return {
        'data_format': "csv",
        'coalesce': True,
        'write_options': {'wait_for_job': False},
    }


# Training window
td_train_version, td_job_train = user_beer_feature_view.create_training_data(
    start_time=split_dict["train_start"],
    end_time=split_dict["train_end"],
    description='Training dataset for user beer ratings',
    **_td_options(),
)

# Test window
td_test_version, td_job_test = user_beer_feature_view.create_training_data(
    start_time=split_dict["test_start"],
    end_time=split_dict["test_end"],
    description='Test dataset for user beer ratings',
    **_td_options(),
)

Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/user_beer_feature_view_1_create_fv_td_06052024185240/executions




Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/user_beer_feature_view_1_create_fv_td_06052024185246/executions


