# Training Data & Feature Views

In [None]:
import pandas as pd
import numpy as np
import datetime
import hopsworks
import random
import gdown

In [None]:
# Log in to Hopsworks; `project` is the handle returned by the login call.
project = hopsworks.login()
# Feature store handle for the project, used below to look up feature groups
# and feature views.
fs = project.get_feature_store()

## <span style="color:#ff5f27;"> ⚙️ Feature View Creation </span>

In [None]:
def create_beer_rating_feature_view(fs, version=1):
    """Create the ``beer_rating_feature_view`` feature view.

    The view joins the ``beer_features``, ``review_features`` and
    ``reviewer_metrics`` feature groups and uses ``review_overall`` as the
    label column.

    Args:
        fs: Feature store handle providing ``get_feature_group``,
            ``get_transformation_function`` and ``create_feature_view``.
        version: Version used to look up each of the three feature groups.
            The same number is passed to ``create_feature_view`` so that the
            view is registered at the version a caller would later request
            with ``fs.get_feature_view(name, version=version)``.

    Returns:
        The object returned by ``fs.create_feature_view``.
    """
    # Retrieve the feature groups that feed the view.
    beer_fg = fs.get_feature_group('beer_features', version=version)
    review_fg = fs.get_feature_group('review_features', version=version)
    reviewer_fg = fs.get_feature_group('reviewer_metrics', version=version)

    # Beer features joined to reviews on the beer id, then to reviewer
    # metrics on the reviewer profile name.
    ds_query = (
        beer_fg.select_all()
        .join(review_fg.select_all(), on='beer_beerid')
        .join(reviewer_fg.select_all(), on='review_profilename')
    )

    # One scaler is fetched per feature, so the two features do not share a
    # single transformation-function object.
    transformation_functions = {
        feature_name: fs.get_transformation_function(name='min_max_scaler')
        for feature_name in ('review_aroma', 'review_taste')
    }

    # Pass the version explicitly: without it the backend picks the version
    # itself, which may not be the one the caller looks up afterwards.
    return fs.create_feature_view(
        name='beer_rating_feature_view',
        version=version,
        query=ds_query,
        labels=['review_overall'],  # target variable
        transformation_functions=transformation_functions,
    )

# Retrieve the feature view if it can be fetched; otherwise create it.
version=1
try:
    feature_view = fs.get_feature_view("beer_rating_feature_view", version=version)
except Exception as e:
    # NOTE(review): every exception lands here, not only a "view not found"
    # one. The error is printed and a new view is built from the feature
    # groups at the same version number.
    print(e)
    feature_view = create_beer_rating_feature_view(fs, version)

In [None]:
import datetime

# Retrieve version 1 of the three feature groups used in this notebook.
beer_fg = fs.get_feature_group('beer_features', version=1)
review_fg = fs.get_feature_group('review_features', version=1)
reviewer_fg = fs.get_feature_group('reviewer_metrics', version=1)

# Read the review feature group; `pdf` is what split_dfs receives below
# (presumably a pandas DataFrame with a `review_time` column; confirm).
pdf = review_fg.read()

def split_dfs(df, train_fraction=0.8):
    """Derive chronological train/test date boundaries from review times.

    Rows are ordered by review time; the earliest ``train_fraction`` of them
    form the training partition and the remainder the test partition. Only
    the first and last calendar date of each partition are returned.

    Args:
        df: DataFrame with a ``review_time`` column holding epoch timestamps
            in milliseconds. A ``datetime`` column is added to this frame in
            place (side effect kept so that code using the frame afterwards
            keeps working).
        train_fraction: Share of the chronologically ordered rows assigned to
            the training partition; must be strictly between 0 and 1.

    Returns:
        dict with ``train_start``, ``train_end``, ``test_start`` and
        ``test_end`` as ``datetime.date`` values. Boundaries are truncated to
        whole days, so ``train_end`` and ``test_start`` can be the same date.

    Raises:
        ValueError: If ``train_fraction`` is outside (0, 1), or if either
            partition would contain no rows.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction!r}"
        )

    # Epoch milliseconds -> datetime. fromtimestamp() interprets the value in
    # the machine's local timezone.
    df['datetime'] = df['review_time'].map(
        lambda millis: datetime.datetime.fromtimestamp(millis // 1000)
    )
    ordered_times = df.sort_values(by='datetime')['datetime']

    # Positional cut between the training and test partitions.
    split_at = int(len(ordered_times) * train_fraction)
    train_times = ordered_times.iloc[:split_at]
    test_times = ordered_times.iloc[split_at:]
    if train_times.empty or test_times.empty:
        raise ValueError(
            f"Cannot split {len(ordered_times)} row(s) into non-empty train and "
            f"test partitions with train_fraction={train_fraction!r}"
        )

    # The column is sorted ascending, so each partition's earliest and latest
    # values sit at its ends.
    return {
        'train_start': train_times.iloc[0].date(),
        'train_end': train_times.iloc[-1].date(),
        'test_start': test_times.iloc[0].date(),
        'test_end': test_times.iloc[-1].date(),
    }

# Compute the train/test date boundaries from the review data. As a side
# effect, split_dfs also adds a 'datetime' column to `pdf`.
split_dict = split_dfs(pdf)