# Batch prediction pipeline for the model trained in the previous notebook

In [None]:
import pickle

# Read the pickled DataFrame from ratebeer.pkl into `df`.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('ratebeer.pkl', 'rb') as pkl_in:
    df = pickle.load(pkl_in)

In [None]:
# Draw a reproducible random subset of the original dataset.
# The sample size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the row count
# (sampling without replacement is the default).
subset_size = min(1000, len(df))  # Adjust the upper bound as needed
subset_dataset = df.sample(n=subset_size, random_state=42)

In [None]:
# Serialize the sampled subset to subset_data.pkl (path is relative to the
# current working directory).
with open("subset_data.pkl", "wb") as pkl_out:
    pickle.dump(subset_dataset, pkl_out)


In [None]:
import hopsworks

# Authenticate against Hopsworks and keep the object returned by login()
project = hopsworks.login()

# Model registry handle obtained from that project
mr = project.get_model_registry()

# Look up version 1 of the registered model by name
model = mr.get_model(name="BeerRatingPredictionModel1", version=1)

# Download the model artifact (the call's return value is not stored)
model.download()

In [None]:
import hsml

# Open a connection to Hopsworks.
# NOTE: for connecting from an external environment, see
# https://docs.hopsworks.ai/3.7/user_guides/projects/api_key/create_api_key
conn = hsml.connection()

# Handle to Hopsworks Model Serving
ms = conn.get_model_serving()

# Fetch the deployment object by its name
deployment = ms.get_deployment(name="beerratingmodeldeployment1")


In [None]:
# Model registry of the "skris20" project
mr = conn.get_model_registry(project="skris20")

# Resolve the model (name + version) referenced by the deployment
model = mr.get_model(name=deployment.model_name, version=deployment.model_version)

# Send the model's stored input example as a single instance and print the reply
request_body = {"instances": [model.input_example]}
predictions = deployment.predict(request_body)
print(predictions)


In [None]:
# Print the DataFrame structure to verify the columns.
# The sampled subset is bound to `subset_dataset`; `subset_data` is only the
# stem of the pickle file name and was never defined as a variable, so
# referencing it raised NameError.
print(subset_dataset.dtypes)

# Check if there are exactly 12 columns
n_columns = len(subset_dataset.columns)
if n_columns != 12:
    print(f"Error: The DataFrame should contain 12 columns, but it has {n_columns} columns.")
else:
    print("The DataFrame has the correct number of columns.")


In [None]:
# Expected column names and dtypes. The entries below are placeholders:
# replace them with the features (and dtypes) used when training the model.
expected_columns = {
    'feature1': 'float64',
    'feature2': 'float64',
    'feature3': 'int64',
    # Add all 12 expected features and their types here
}

# Compare against the sampled subset, which is bound to `subset_dataset`
# (`subset_data` was never defined and raised NameError).
actual_columns = set(subset_dataset.columns)

# Check for missing or extra columns
missing_columns = set(expected_columns) - actual_columns
extra_columns = actual_columns - set(expected_columns)

if missing_columns:
    print(f"Missing columns: {missing_columns}")
if extra_columns:
    print(f"Extra columns: {extra_columns}")

# Verify data types, skipping expected columns that are absent from the frame
for col, dtype in expected_columns.items():
    if col not in actual_columns:
        continue
    found_dtype = subset_dataset[col].dtype
    if found_dtype != dtype:
        print(f"Data type mismatch in column {col}: Expected {dtype}, found {found_dtype}")


In [None]:
import pandas as pd
import numpy as np
import pickle
import hsml

# Initialize connection to Hopsworks
conn = hsml.connection()

# Get Hopsworks Model Serving
ms = conn.get_model_serving()

# Get deployment object
deployment = ms.get_deployment("beerratingmodeldeployment1")

# Load the data that has been preprocessed and pickled.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('ratebeer.pkl', 'rb') as file:
    data = pickle.load(file)

# Add the per-beer aggregate features when the loaded data lacks them
if 'avg_overall' not in data.columns:
    # Named aggregation binds each output column to its source column and
    # function, so no positional renaming of MultiIndex columns is needed.
    agg_data = data.groupby('beer_beerid').agg(
        avg_overall=('review_overall', 'mean'),
        review_count=('review_overall', 'count'),
        avg_aroma=('review_aroma', 'mean'),
        avg_palate=('review_palate', 'mean'),
        avg_taste=('review_taste', 'mean'),
        avg_appearance=('review_appearance', 'mean'),
    ).reset_index()
    data = data.merge(agg_data, on='beer_beerid')

# The 12 feature columns sent to the deployment, in selection order
expected_columns = ['beer_abv', 'beer_brewerid', 'review_appearance', 'review_aroma',
                    'review_palate', 'review_taste', 'avg_overall', 'avg_aroma',
                    'avg_palate', 'avg_taste', 'avg_appearance', 'review_count']

# Fail with a message that names the absent features instead of relying on
# the KeyError raised by the column selection itself.
missing_columns = [col for col in expected_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Missing feature columns in ratebeer.pkl data: {missing_columns}")
data = data[expected_columns]

# No datetime-to-string conversion is needed here: `review_time` is not among
# the selected feature columns, so it is never part of the request payload.

# Convert DataFrame to a list of dictionaries (one per row) for prediction
input_data = data.to_dict(orient='records')

# Check each instance has exactly one value per expected feature (12)
n_features = len(expected_columns)
if all(len(item) == n_features for item in input_data):
    # Only the remote call is guarded; the success message is printed in `else`.
    try:
        predictions = deployment.predict({"instances": input_data})
    except Exception as e:
        print("Error during prediction:", e)
    else:
        print("Predictions received:", predictions)
else:
    print("Data formatting error: Each instance does not have exactly 12 features.")
