## <span style="color:#ff5f27;"> 💽 Loading the Data </span>

In [1]:
#!pip install -U hopsworks --quiet

In [1]:
import numpy as np
import pandas as pd
import gdown

In [2]:
# Link to the dataset
url = 'https://drive.google.com/uc?id=1bxaIuvmGoCD8mOnlVJdATP0zvYlc_8e5'

# Output filename
output = 'ratebeer.csv'

# Download the file from the Google Drive link
gdown.download(url, output, quiet=False)

# Load the dataset into a pandas DataFrame
data = pd.read_csv(output)

Downloading...
From (original): https://drive.google.com/uc?id=1bxaIuvmGoCD8mOnlVJdATP0zvYlc_8e5
From (redirected): https://drive.google.com/uc?id=1bxaIuvmGoCD8mOnlVJdATP0zvYlc_8e5&confirm=t&uuid=859fb87f-c6ca-4867-b8da-a5cee7323527
To: /Users/lassehylleberg/Desktop/Data-Engineering-MLOps-Exam-Assignment/Data-Engineering-MLOps-Exam-Assignment/ratebeer.csv
100%|██████████| 1.13G/1.13G [00:55<00:00, 20.5MB/s]
  data = pd.read_csv(output)


In [3]:
# Check the first few rows of the dataframe
data.head()

Unnamed: 0,beer_name,beer_beerid,beer_brewerid,beer_abv,beer_style,review_appearance,review_aroma,review_palate,review_taste,review_overall,review_time,review_profilename,review_text
0,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale (IPA),8.0,6.0,6.0,6.0,6.5,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
1,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale (IPA),8.0,6.0,8.0,7.0,6.5,1157241600,TomDecapolis,On tap at the John Harvards in Springfield PA....
2,John Harvards Cristal Pilsner,71716,8481,5.0,Bohemian Pilsener,8.0,5.0,6.0,6.0,7.0,958694400,PhillyBeer2112,"UPDATED: FEB 19, 2003 Springfield, PA. I've ne..."
3,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Kölsch,4.0,4.0,4.0,4.0,4.0,1157587200,TomDecapolis,On tap the Springfield PA location billed as t...
4,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Kölsch,4.0,4.0,4.0,4.0,4.0,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."


## <span style="color:#ff5f27;"> 🛠️ Feature Engineering </span>

In [4]:
# Categorical Encoding for Beer Style
data = pd.get_dummies(data, columns=['beer_style'], prefix='style')

In [5]:
# Converting the timestamp to datetime
data['review_datetime'] = pd.to_datetime(data['review_time'], unit='s')

In [6]:
# Extracting the year, month, and day of the week from the datetime
data['year'] = data['review_datetime'].dt.year
data['month'] = data['review_datetime'].dt.month
data['day_of_week'] = data['review_datetime'].dt.dayofweek  # Monday=0, Sunday=6

In [7]:
# aggregating the reviews
agg_reviews = data.groupby('beer_beerid').agg({
    'review_overall': ['mean', 'count'],
    'review_aroma': 'mean',
    'review_palate': 'mean',
    'review_taste': 'mean',
    'review_appearance': 'mean'
}).reset_index()
agg_reviews.columns = ['beer_beerid', 'avg_overall', 'review_count', 'avg_aroma', 'avg_palate', 'avg_taste', 'avg_appearance']
agg_reviews

Unnamed: 0,beer_beerid,avg_overall,review_count,avg_aroma,avg_palate,avg_taste,avg_appearance
0,13,6.886114,821,6.371498,6.730816,6.651644,7.049939
1,14,6.704887,266,6.259398,6.443609,6.383459,6.563910
2,15,7.215976,169,6.739645,6.970414,6.905325,7.337278
3,16,7.953747,1081,7.716004,7.574468,7.666050,7.922294
4,17,7.033994,353,6.691218,6.691218,6.781870,7.359773
...,...,...,...,...,...,...,...
87505,broyarde-l`eclipse-80766,6.333333,3,6.666667,6.000000,6.666667,7.333333
87506,broyarde-l`harfang-80769,5.250000,6,5.666667,5.000000,5.833333,5.666667
87507,la-saint-pierre-blonde-de-l`oncle-hansi-91690,5.666667,15,5.933333,5.733333,6.000000,5.866667
87508,s`tunnel-45990,6.818182,11,6.454545,6.909091,6.454545,7.636364


In [8]:
# Group data by reviewer names, count their reviews, and reset index to form a DataFrame with 'review_profilename' and 'review_count' columns.
reviewer_metrics = data.groupby('review_profilename').size().reset_index(name='review_count')
reviewer_metrics

Unnamed: 0,review_profilename,review_count
0,-BB-99,2
1,000pete1983,1
2,007BeerDrinker,1
3,007Lager,1
4,007lund,85
...,...,...
28681,zygomatic99,12
28682,zyster99,16
28683,zywiecporter,9
28684,zziemelis,5


## <span style="color:#ff5f27;"> 🪄 Creating Feature Groups in Hopsworks </span>

In [11]:
# intizialize Hopworks
import hopsworks as hs
project = hs.login()
fs = project.get_feature_store()

  from .autonotebook import tqdm as notebook_tqdm


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550037




Connected. Call `.close()` to terminate connection gracefully.


In [30]:
import re

# Create a mapping dictionary to replace "/" with "_" for hopsworks compatibility
column_mapping = {col: col.replace("/", "_") for col in data.columns}

# Rename the columns using the mapping dictionary
data.rename(columns=column_mapping, inplace=True)

# Convert all feature names to lowercase
data.columns = data.columns.str.lower()

# Replace spaces with underscores
data.columns = data.columns.str.replace(' ', '_')

# Rename the columns
data.rename(columns=lambda x: re.sub(r'[^a-zA-Z0-9]', '_', x), inplace=True)

# Assuming beer_features is your DataFrame containing the features

# Replace invalid characters with underscores
data.columns = data.columns.str.replace('[^a-zA-Z0-9_]', '_')

# Ensure feature names length does not exceed 63 characters
data.columns = [col[:63] for col in data.columns]



In [31]:
# Collect columns for beer features, including one-hot encoded styles
beer_columns = ['beer_beerid', 'beer_name', 'beer_brewerid', 'beer_abv'] + [col for col in data.columns if col.startswith('style_')]
beer_features = data[beer_columns].drop_duplicates(subset=['beer_beerid']).dropna(subset=['beer_beerid'])

# Ensure all relevant columns are in the correct data type
beer_features['beer_beerid'] = pd.to_numeric(beer_features['beer_beerid'], errors='coerce').astype('Int64')
beer_features['beer_brewerid'] = pd.to_numeric(beer_features['beer_brewerid'], errors='coerce').astype('Int64')
beer_features['beer_abv'] = pd.to_numeric(beer_features['beer_abv'], errors='coerce').astype(float)

# Feature group for Beers
beer_fg = fs.get_or_create_feature_group(
    name="beer_features",
    version=1,
    description="Basic information about beers, including one-hot encoded styles",
    primary_key=['beer_beerid'],
    online_enabled=True
)

# Inserting the data into the feature group
beer_fg.insert(beer_features, write_options={"wait_for_job" : False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/782705


Uploading Dataframe: 100.00% |██████████| Rows 87510/87510 | Elapsed Time: 00:23 | Remaining Time: 00:00


Launching job: beer_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/beer_features_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x164521640>, None)

In [33]:
# update beer feature descriptions
beer_feature_descriptions = {
    "beer_beerid": "Unique identifier for each beer.",
    "beer_name": "Name of the beer.",
    "beer_brewerid": "Identifier for the brewer of the beer.",
    "beer_abv": "Alcohol by volume percentage of the beer.",
}

# Update descriptions for Beer Features
for feature, description in beer_feature_descriptions.items():
    beer_fg.update_feature_description(feature, description)

In [35]:
# Feature Group for Reviews
review_columns = ['review_appearance', 'review_aroma', 'review_palate', 'review_taste', 'review_overall', 'beer_beerid', 'review_profilename', 'review_time']
review_features = data[review_columns]

# Convert data types
review_features['beer_beerid'] = pd.to_numeric(review_features['beer_beerid'], errors='coerce').astype('Int64')
review_features['review_time'] = pd.to_datetime(review_features['review_time'], unit='s')  # Assuming Unix timestamp

# Handle non-numeric data for numeric columns
review_features['review_appearance'] = pd.to_numeric(review_features['review_appearance'], errors='coerce').astype(float)


review_fg = fs.get_or_create_feature_group(
    name="review_features",
    version=1,
    description="Metrics about beer reviews, including user profiles",
    primary_key=['beer_beerid', 'review_profilename'],  # Composite key if review_profilename is used as user identifier
    online_enabled=True
)

review_fg.insert(review_features, write_options={"wait_for_job" : False})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/783717


Uploading Dataframe: 100.00% |██████████| Rows 2780796/2780796 | Elapsed Time: 04:04 | Remaining Time: 00:00


Launching job: review_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/review_features_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x105ef18b0>, None)

In [39]:
# update review feature descriptions
review_feature_descriptions = {
    "review_appearance": "Rating of the beer's appearance (1-10).",
    "review_aroma": "Rating of the beer's aroma (1-10).",
    "review_palate": "Rating of the beer's palate (1-10).",
    "review_taste": "Rating of the beer's taste (1-10).",
    "review_overall": "Overall rating of the beer (1-10).",
    "beer_beerid": "Associated unique identifier for each beer.",
    "review_profilename": "Username of the reviewer.",
    "review_time": "Timestamp of the review."
}

# Update descriptions for Review Features
for feature, description in review_feature_descriptions.items():
    review_fg.update_feature_description(feature, description)

In [41]:
# Create the feature group for aggregated beer reviews
agg_reviews_fg = fs.get_or_create_feature_group(
    name="agg_reviews",
    version=1,
    description="Aggregated review metrics for each beer",
    primary_key=['beer_beerid'],
    online_enabled=True
)

# Ensuring 'beer_beerid' and other relevant fields are integers and not strings
agg_reviews['beer_beerid'] = pd.to_numeric(agg_reviews['beer_beerid'], errors='coerce').astype('Int64')

# Ensure any other relevant fields are also correctly typed
agg_reviews['avg_overall'] = pd.to_numeric(agg_reviews['avg_overall'], errors='coerce').astype(float)
agg_reviews['avg_aroma'] = pd.to_numeric(agg_reviews['avg_aroma'], errors='coerce').astype(float)
agg_reviews['avg_palate'] = pd.to_numeric(agg_reviews['avg_palate'], errors='coerce').astype(float)
agg_reviews['avg_taste'] = pd.to_numeric(agg_reviews['avg_taste'], errors='coerce').astype(float)
agg_reviews['avg_appearance'] = pd.to_numeric(agg_reviews['avg_appearance'], errors='coerce').astype(float)

# Insert the data into the feature group
agg_reviews_fg.insert(agg_reviews, write_options={"wait_for_job" : False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/783718


Uploading Dataframe: 100.00% |██████████| Rows 87510/87510 | Elapsed Time: 00:13 | Remaining Time: 00:00


Launching job: agg_reviews_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/agg_reviews_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x161e610d0>, None)

In [42]:
# Update feature descriptions
agg_reviews_feature_descriptions = {
    "beer_beerid": "Unique identifier for each beer.",
    "avg_overall": "Average overall rating of the beer.",
    "review_count": "Total number of reviews for the beer.",
    "avg_aroma": "Average aroma rating for the beer.",
    "avg_palate": "Average palate rating for the beer.",
    "avg_taste": "Average taste rating for the beer.",
    "avg_appearance": "Average appearance rating for the beer."
}

for feature, description in agg_reviews_feature_descriptions.items():
    agg_reviews_fg.update_feature_description(feature, description)

In [43]:
# Create the feature group for reviewer metrics
reviewer_metrics_fg = fs.get_or_create_feature_group(
    name="reviewer_metrics",
    version=1,
    description="Count of reviews submitted by each reviewer",
    primary_key=['review_profilename'],
    online_enabled=True
)

# Insert the data into the feature group
reviewer_metrics_fg.insert(reviewer_metrics)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/784731


Uploading Dataframe: 100.00% |██████████| Rows 28686/28686 | Elapsed Time: 00:07 | Remaining Time: 00:00


Launching job: reviewer_metrics_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/reviewer_metrics_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x161e5d100>, None)

In [44]:
# Update feature descriptions
reviewer_metrics_feature_descriptions = {
    "review_profilename": "Username of the reviewer.",
    "review_count": "Total number of reviews submitted by the reviewer."
}

for feature, description in reviewer_metrics_feature_descriptions.items():
    reviewer_metrics_fg.update_feature_description(feature, description)