## <span style="color:#ff5f27;"> 💽 Loading the Data </span>

In [1]:
!pip install -U hopsworks --quiet

In [2]:
!pip install kaggle --quiet

In [8]:
import numpy as np
import pandas as pd
import gdown

In [9]:
# URL from Google Drive shareable link
url = 'https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk'

# Filename
output = 'ratebeer.txt'

# Download the file from the URL
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk
From (redirected): https://drive.google.com/uc?id=1LeoFhTNUZiZ9sbn31jZA6DGN3p36Dsvk&confirm=t&uuid=3b1d598e-9dcf-4d46-af46-0e5b292ef319
To: /Users/lassehylleberg/Desktop/Data-Engineering-MLOps-Exam-Assignment/ratebeer.txt
100%|██████████| 1.74G/1.74G [01:18<00:00, 22.1MB/s]


'ratebeer.txt'

In [10]:
# Define the file path to the downloaded file
file_path = 'ratebeer.txt'

# Define columns as you have mentioned previously
columns = ['beer/name', 'beer/beerId', 'beer/brewerId', 'beer/ABV', 'beer/style',
           'review/appearance', 'review/aroma', 'review/palate', 'review/taste',
           'review/overall', 'review/time', 'review/profileName', 'review/text']
data = {col: [] for col in columns}

# Read the file
with open(file_path, "r", encoding="ISO-8859-1") as file:
    current_review = {}
    for line in file:
        line = line.strip()
        if line:
            try:
                key, value = line.split(": ", 1)
                current_review[key] = value
            except ValueError:
                continue
        else:
            if current_review:
                for col in columns:
                    data[col].append(current_review.get(col, np.nan))
                current_review = {}

# Check for any remaining entries to add
if current_review:
    for col in columns:
        data[col].append(current_review.get(col, np.nan))

# Create a DataFrame from the data
df = pd.DataFrame(data)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   beer/name           object
 1   beer/beerId         object
 2   beer/brewerId       object
 3   beer/ABV            object
 4   beer/style          object
 5   review/appearance   object
 6   review/aroma        object
 7   review/palate       object
 8   review/taste        object
 9   review/overall      object
 10  review/time         object
 11  review/profileName  object
 12  review/text         object
dtypes: object(13)
memory usage: 290.0+ MB


In [12]:
# Convert numeric columns to float
numeric_cols = ['beer/ABV', 'review/appearance', 'review/aroma', 'review/palate', 'review/taste', 'review/overall']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Convert ID columns to integers, coerce errors which will convert non-convertible values to NaN
id_cols = ['beer/beerId', 'beer/brewerId']
df[id_cols] = df[id_cols].apply(pd.to_numeric, errors='coerce', downcast='integer')

# Convert review time to datetime
df['review/time'] = pd.to_datetime(df['review/time'], errors='coerce')

# Check the changes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2924163 entries, 0 to 2924162
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   beer/name           object        
 1   beer/beerId         float64       
 2   beer/brewerId       int16         
 3   beer/ABV            float64       
 4   beer/style          object        
 5   review/appearance   float64       
 6   review/aroma        float64       
 7   review/palate       float64       
 8   review/taste        float64       
 9   review/overall      float64       
 10  review/time         datetime64[ns]
 11  review/profileName  object        
 12  review/text         object        
dtypes: datetime64[ns](1), float64(7), int16(1), object(4)
memory usage: 273.3+ MB


## <span style="color:#ff5f27;"> 🛠️ Feature Engineering </span>

In [22]:
import hopsworks as hs

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
project = hs.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/550038
Connected. Call `.close()` to terminate connection gracefully.


In [39]:
print(df.columns)

Index(['beer/name', 'beer/beerId', 'beer/brewerId', 'beer/ABV', 'beer/style',
       'review/appearance', 'review/aroma', 'review/palate', 'review/taste',
       'review/overall', 'review/time', 'review/profileName', 'review/text'],
      dtype='object')


In [54]:
import warnings
from sml import beer_features
warnings.filterwarnings('ignore')

# Assume df is your DataFrame already loaded with the appropriate data
# Use the 'prepare_features' function to apply multiple feature preparations
df = prepare_features(df)

# Calculate average ratings across specified columns
rating_columns = ['review/appearance', 'review/aroma', 'review/palate', 'review/taste']
df = average_ratings(df, rating_columns)

# Count the number of reviews for each beer
review_counts = review_count(df, 'beer/beerId')

# Merge the review counts back to the main DataFrame if needed
df = df.merge(review_counts, on='beer/beerId', how='left')

# Display the modified DataFrame
print(df.head())

NameError: name 'prepare_features' is not defined