## <span style="color:#ff5f27;"> 💽 Loading the Data </span>

In [1]:
#!pip install -U hopsworks --quiet

In [2]:
#!pip install kaggle --quiet

In [1]:
import numpy as np
import pandas as pd
import gdown

In [2]:
# URL from Google Drive shareable link
url = 'https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk'

# Filename
output = 'ratebeer.txt'

# Download the file from the URL
gdown.download(url, output, quiet=False)

# Define the file path to the downloaded file
file_path = 'ratebeer.txt'

Downloading...
From (original): https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk
From (redirected): https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk&confirm=t&uuid=2b09b2df-41cf-4e89-8301-b9b1d7e71c8f
To: /Users/lassehylleberg/Desktop/Data-Engineering-MLOps-Exam-Assignment/Data-Engineering-MLOps-Exam-Assignment/ratebeer.txt
100%|██████████| 1.74G/1.74G [01:13<00:00, 23.6MB/s]


In [3]:
# Define columns
columns = ['beer/name', 'beer/beerId', 'beer/brewerId', 'beer/ABV', 'beer/style',
           'review/appearance', 'review/aroma', 'review/palate', 'review/taste',
           'review/overall', 'review/time', 'review/profileName', 'review/text']
data = {col: [] for col in columns}

# Read the file
with open(file_path, "r", encoding="ISO-8859-1") as file:
    current_review = {}
    for line in file:
        line = line.strip()
        if line:
            try:
                key, value = line.split(": ", 1)
                current_review[key] = value
            except ValueError:
                continue
        else:
            if current_review:
                for col in columns:
                    data[col].append(current_review.get(col, np.nan))
                current_review = {}

# Check for any remaining entries to add
if current_review:
    for col in columns:
        data[col].append(current_review.get(col, np.nan))

# Create a DataFrame from the data
df = pd.DataFrame(data)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   beer/name           object
 1   beer/beerId         object
 2   beer/brewerId       object
 3   beer/ABV            object
 4   beer/style          object
 5   review/appearance   object
 6   review/aroma        object
 7   review/palate       object
 8   review/taste        object
 9   review/overall      object
 10  review/time         object
 11  review/profileName  object
 12  review/text         object
dtypes: object(13)
memory usage: 290.0+ MB


In [5]:
# Rename columns for Hopsworks compatibility
df.rename(columns={'beer/beerId': 'beer/beerid', 'beer/brewerId': 'beer/brewerid', 'beer/ABV': 'beer/abv', 'review/profileName': 'review/profilename'}, inplace=True)

In [6]:
# Replace / with _ in column names for Hopsworks compatibility
new_columns = {col: col.replace('/', '_').replace('beer/', 'beer_') for col in df.columns}
df.rename(columns=new_columns, inplace=True)

## <span style="color:#ff5f27;"> 🛠️ Feature Engineering </span>

In [7]:
# Convert numeric columns to float
numeric_cols = ['beer_abv', 'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 'review_overall']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Convert ID columns to integers, use pd.to_numeric for coercion
id_cols = ['beer_beerid', 'beer_brewerid']
for col in id_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

# Convert review time to datetime
df['review_time'] = pd.to_datetime(df['review_time'], errors='coerce')

# Check the changes to verify that the types are as expected
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   beer_name           object        
 1   beer_beerid         float64       
 2   beer_brewerid       int16         
 3   beer_abv            float64       
 4   beer_style          object        
 5   review_appearance   float64       
 6   review_aroma        float64       
 7   review_palate       float64       
 8   review_taste        float64       
 9   review_overall      float64       
 10  review_time         datetime64[ns]
 11  review_profilename  object        
 12  review_text         object        
dtypes: datetime64[ns](1), float64(7), int16(1), object(4)
memory usage: 273.3+ MB


## <span style="color:#ff5f27;"> 🪄 Creating Feature Groups in Hopsworks </span>

In [8]:
import hopsworks as hs
project = hs.login()
fs = project.get_feature_store()

  from .autonotebook import tqdm as notebook_tqdm


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550037




Connected. Call `.close()` to terminate connection gracefully.


In [14]:
# Ensure all ID columns are integers
df['beer_beerid'] = pd.to_numeric(df['beer_beerid'], errors='coerce').astype('Int64')

# Drop any rows where the primary key is NaN
beer_features = df[['beer_beerid', 'beer_name', 'beer_brewerid', 'beer_abv', 'beer_style']].drop_duplicates(subset=['beer_beerid'])
beer_features = beer_features.dropna(subset=['beer_beerid'])

# Feature group for Beers
beer_fg = fs.get_or_create_feature_group(
    name="beer_features",
    version=1,
    description="Basic information about beers",
    primary_key=['beer_beerid'],
    online_enabled=True
)

# Inserting the data
beer_fg.insert(beer_features)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/772468


Uploading Dataframe: 100.00% |██████████| Rows 110364/110364 | Elapsed Time: 00:15 | Remaining Time: 00:00


Launching job: beer_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/beer_features_1_offline_fg_materialization/executions


In [17]:
# update beer feature descriptions
beer_feature_descriptions = {
    "beer_beerid": "Unique identifier for each beer.",
    "beer_name": "Name of the beer.",
    "beer_brewerid": "Identifier for the brewer of the beer.",
    "beer_abv": "Alcohol by volume percentage of the beer.",
    "beer_style": "Style or category of the beer."
}

# Update descriptions for Beer Features
for feature, description in beer_feature_descriptions.items():
    beer_fg.update_feature_description(feature, description)


In [15]:
# Feature Group for Reviews
review_features = df[['review_appearance', 'review_aroma', 'review_palate', 'review_taste', 'review_overall', 'beer_beerid']]
review_fg = fs.get_or_create_feature_group(
    name="review_features",
    version=1,
    description="Metrics about beer reviews",
    primary_key=['beer_beerid'],
    online_enabled=True
)
review_fg.insert(review_features)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/774499


Uploading Dataframe: 100.00% |██████████| Rows 2924163/2924163 | Elapsed Time: 03:32 | Remaining Time: 00:00


Launching job: review_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/review_features_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x1d5843b90>, None)

In [18]:
# update review feature descriptions
review_feature_descriptions = {
    "review_appearance": "Rating of the beer's appearance (1-5).",
    "review_aroma": "Rating of the beer's aroma (1-5).",
    "review_palate": "Rating of the beer's palate (1-5).",
    "review_taste": "Rating of the beer's taste (1-5).",
    "review_overall": "Overall rating of the beer (1-5).",
    "beer_beerid": "Associated unique identifier for each beer."
}

# Update descriptions for Review Features
for feature, description in review_feature_descriptions.items():
    review_fg.update_feature_description(feature, description)

In [16]:
# Feature Group for reviewer profiles
reviewer_profile_features = df[['review_profilename']].drop_duplicates()
reviewer_fg = fs.get_or_create_feature_group(
    name="reviewer_profile_features",
    version=1,
    description="Profile information of reviewers",
    primary_key=['review_profilename'],
    online_enabled=True
)
reviewer_fg.insert(reviewer_profile_features)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/550037/fs/545860/fg/775517


Uploading Dataframe: 100.00% |██████████| Rows 29265/29265 | Elapsed Time: 00:07 | Remaining Time: 00:00


Launching job: reviewer_profile_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/550037/jobs/named/reviewer_profile_features_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x1d5841520>, None)

In [19]:
# update reviewer profile feature descriptions
reviewer_profile_feature_descriptions = {
    "review_profilename": "Unique username of the reviewer."
}

# Update descriptions for Reviewer Profile Features
for feature, description in reviewer_profile_feature_descriptions.items():
    reviewer_fg.update_feature_description(feature, description)