In [None]:
## Installation
%pip install fastapi uvicorn    # API
%pip install numpy              # Efficient data handling
%pip install python-dotenv      # .env => extracting hidden info
%pip install requests           # Performing API calls

## Imports
from fastapi import FastAPI, HTTPException
import numpy as np
from dotenv import load_dotenv
import os
import requests
from enum import Enum

In [None]:
## Configuration
# Load variables from a .env file into the process environment before any setting is read
load_dotenv()

## Global Variables
# Base URL of the backend API that every request in this notebook is sent to (None when unset)
BACKEND_URL = os.environ.get("BACKEND_URL")
class ConsumerField(Enum):
    """Kinds of consumer data that can be requested through get_consumer_info.

    get_consumer_info maps VISITS, ORDERS and FAVORITED onto backend routes;
    any other member falls through to its default case and is rejected there.
    """
    VISITS = 1     # mapped to the 'visit' route
    ORDERS = 2     # mapped to the 'order' route
    FAVORITED = 3  # mapped to the 'favorited' route
    FRIENDS = 4    # not handled by get_consumer_info in this notebook
    ALL = 5        # not handled by get_consumer_info in this notebook

# Restaurant category names used as one-hot features.
# NOTE: Order matters -- generate_restaurant_embedding appends one entry per category
#       in this exact order, so reordering changes the meaning of every embedding index.
categories = [ # Based off of prisma schema categories enum
    "asian",
    "bakery",
    "barfood",
    "bbq",
    "breakfast",
    "burgers",
    "cafe",
    "chinese",
    "desserts",
    "fastfood",
    "french",
    "greek",
    "healthy",
    "indian",
    "italian",
    "japanese",
    "korean",
    "latinamerican",
    "mediterranean",
    "mexican",
    "middleeastern",
    "pizza",
    "salads",
    "sandwiches",
    "seafood",
    "sushi",
    "thai",
    "vegan",
    "vegetarian",
    "vietnamese",
]

In [None]:
# FastAPI application instance; the route handler(s) below register on it via decorators
app = FastAPI()

In [None]:
## Pull consumer information from the DB
def get_consumer_info(consumer_id, is_full_address_provided, street_address, city, postal_code, state, country, field):


    # Extract desired information
    path_route = ''
    match field:
        case ConsumerField.VISITS:
            path = 'visit'
        case ConsumerField.ORDERS:
            path = 'order'
        case ConsumerField.FAVORITED:
            path = 'favorited'
        case _: # default case
            raise HTTPException(status_code=500, detail="Passed in invalid consumer info field")
        
    # TODO: Fix (add consumer_info & req.query)
    response = {}
    if is_full_address_provided:
        params = {
             "street_address": street_address,
            "city": city,
            "postal_code": postal_code,
            "state": state,
            "country": country,
        }
        response = requests.get(f"{BACKEND_URL}/{path_route}/{consumer_id}", params={params})
    else:
        response = requests.get(f"{BACKEND_URL}/{path_route}/{consumer_id}")
    if not response.ok:
        raise HTTPException(status_code=424, detail="Failed to pull consumer information from DB")
    selected_consumer_restaurant_data = response.json()
    return selected_consumer_restaurant_data

## Pull restaurant information from the DB
def get_restaurant_info():
    """Fetch the restaurant list from the backend and return its decoded JSON body.

    Raises:
        HTTPException: 424 when the backend answers with a non-OK status.
    """
    backend_reply = requests.get(f"{BACKEND_URL}/restaurant")
    if backend_reply.ok:
        return backend_reply.json()
    raise HTTPException(status_code=424, detail="Failed to pull restaurant information from DB")

In [None]:
## Features & Default Weights

# Feature Weights
# The three scalar features get fixed weights; what is left of a total weight of 1
# is split evenly across the category features.
rating_weight = 0.3
avg_cost_weight = 0.2
distance_weight = 0.125
category_budget = 1 - rating_weight - avg_cost_weight - distance_weight
num_categories = len(categories)
category_weight = category_budget / num_categories

feature_weights = {
    'rating': rating_weight,
    'avg_cost': avg_cost_weight,
    'distance': distance_weight,
    # One entry per category, all sharing the same weight
    **{category_name: category_weight for category_name in categories},
}

# Existing Restaurant Interaction Weights
# NOTE: Defines how much each kind of restaurant interaction counts when building the consumer vector
# TODO: Edit later?
restaurant_type_weights = dict(
    survey=0.6,
    favorited=0.8,
    order=0.4,
    visit=0.2,
)

# Consumer Vector Feedback/Interaction Weights
# NOTE: Defines how strongly interactions with recommendations feed back into the consumer vector
# TODO: Edit later?
recommendation_feedback_weights = dict(
    favorited=0.2,
    order=0.1,
    visit=0.04,
)

In [None]:
## Utility Functions

# Winsorization is used to solve min-max normalization issues caused by outliers by taking, say, the 1st & 99th percentiles rather than the min & max
# Resource: https://medium.com/@whyamit404/implementing-pandas-winsorize-ad1e51ec548b
def winsorization(arr):
    """Return [1st percentile, 99th percentile] of arr as a two-element list."""
    values = np.array(arr)
    # Lower bound first, upper bound second
    return [np.percentile(values, percent) for percent in (1, 99)]

# Normalization takes a vector and makes it a unit vector (magnitude one)
# Resource: https://www.khanacademy.org/computing/computer-programming/programming-natural-simulations/programming-vectors/a/vector-magnitude-normalization
def normalization(column_vector):
    """Scale a vector to unit Euclidean length and return the result as a new float array.

    Args:
        column_vector: Array-like of numbers. For 2-D input the norm is taken
            along axis 0, i.e. every column is scaled independently.

    Returns:
        A new float ndarray of the same shape. The input is left untouched, and
        integer input is converted to float instead of being truncated.
        A vector whose norm is 0 is returned as all zeros rather than NaN.
    """
    values = np.asarray(column_vector, dtype=float)
    norm = np.linalg.norm(values, axis=0)

    # Avoid 0/0: dividing an all-zero vector by 1 simply keeps it all-zero
    safe_norm = np.where(norm == 0, 1.0, norm)

    return values / safe_norm
    

# Get parameters required to generate restaurant embeddings
def get_restaurant_embedding_params(restaurants_data):
    """Compute the dataset-wide parameters needed to embed individual restaurants.

    Args:
        restaurants_data: Non-empty sequence of restaurant dicts providing the
            keys 'avg_rating' (-1 meaning "no rating yet"), 'avg_cost' and
            'distance_value' (None when distances have not been populated).

    Returns:
        A 7-element list, in this order:
        [is_distance_field_valid, min_winsorized_cost, max_winsorized_cost,
         min_winsorized_distance, max_winsorized_distance, avg_rating,
         avg_distance_meters].
        The two distance bounds are None when the distance field is not valid;
        avg_distance_meters is then the placeholder 0.5.

    Raises:
        ValueError: If restaurants_data is empty.
    """
    if not restaurants_data:
        raise ValueError("restaurants_data must contain at least one restaurant")

    # Find average values (for default & normalization)
    # Only restaurants with an existing rating (anything other than -1) contribute to the average
    known_ratings = [restaurant['avg_rating'] for restaurant in restaurants_data if restaurant['avg_rating'] != -1]
    if known_ratings:
        avg_rating = sum(known_ratings) / len(known_ratings)
    else:
        # No restaurant is rated yet: fall back to the midpoint of the scale
        # (ratings are divided by 5.0 when embedded) instead of dividing by zero
        avg_rating = 2.5

    cost_arr = [restaurant['avg_cost'] for restaurant in restaurants_data]
    min_winsorized_cost, max_winsorized_cost = winsorization(cost_arr)

    # NOTE: Distance may not yet be populated if user hasn't entered the address they're ordering from (this approach is taken to reduce Google Maps API exhaustion)
    distance_meters_arr = [restaurant['distance_value'] for restaurant in restaurants_data]
    # Every entry must be populated, otherwise the sum / percentiles below cannot be computed
    is_distance_field_valid = all(distance is not None for distance in distance_meters_arr)
    avg_distance_meters = 0.5
    min_winsorized_distance = max_winsorized_distance = None
    if is_distance_field_valid:
        avg_distance_meters = sum(distance_meters_arr) / len(distance_meters_arr)
        min_winsorized_distance, max_winsorized_distance = winsorization(distance_meters_arr)

    return [is_distance_field_valid, min_winsorized_cost, max_winsorized_cost, min_winsorized_distance, max_winsorized_distance, avg_rating, avg_distance_meters]

# Generate Vector/Embedding (Shared between Consumer & Restaurant Vector Operations)
def generate_restaurant_embedding(restaurant, is_distance_field_valid, min_winsorized_cost, max_winsorized_cost, min_winsorized_distance, max_winsorized_distance, avg_rating, avg_distance_meters):
    """Build the feature vector (embedding) of a single restaurant.

    Layout of the returned vector: [rating, inverted cost, inverted distance,
    then one 0/1 flag per entry of the module-level `categories` list, in order].

    Args:
        restaurant: Dict providing 'avg_rating' (-1 meaning "no rating"),
            'avg_cost', 'categories' and, when distances are valid, 'distance_value'.
        is_distance_field_valid: Whether 'distance_value' can be used.
        min_winsorized_cost, max_winsorized_cost: Bounds for min-max scaling of the cost.
        min_winsorized_distance, max_winsorized_distance: Bounds for min-max
            scaling of the distance (only read when is_distance_field_valid).
        avg_rating: Replacement rating for restaurants without one.
        avg_distance_meters: Used as-is for the distance feature when the
            distance field is not valid.

    Returns:
        1-D float ndarray of length 3 + len(categories).
    """
    # Normalizing Values (NOTE: Cost & Distance are Min-Max Normalized w/ Inversion (lower cost/distance is more favorable))
    has_rating = restaurant['avg_rating'] != -1
    rating_normalized = (restaurant['avg_rating'] if has_rating else avg_rating) / 5.0
    cost_normalized_inverted = 1 - _winsorized_min_max(restaurant['avg_cost'], min_winsorized_cost, max_winsorized_cost)
    distance_normalized_inverted = avg_distance_meters
    if is_distance_field_valid:
        distance_normalized_inverted = 1 - _winsorized_min_max(restaurant['distance_value'], min_winsorized_distance, max_winsorized_distance)

    # One 0/1 membership flag per known category, in the order of `categories`
    category_flags = [1 if category in restaurant['categories'] else 0 for category in categories]

    return np.array([rating_normalized, cost_normalized_inverted, distance_normalized_inverted, *category_flags], dtype=float)


# Min-max scale value into [0, 1] using winsorized bounds (outliers are clipped; a degenerate range maps to 0.5)
def _winsorized_min_max(value, lower, upper):
    value_range = upper - lower
    if value_range == 0:
        # All values share the same bound: nothing to rank, so stay neutral instead of dividing by zero
        return 0.5
    scaled = (value - lower) / value_range
    # The bounds are percentiles, so outliers fall outside them; clip those to the bounds
    return min(1.0, max(0.0, scaled))

# Uses embedding parameters to generate a dictionary of embeddings (whose keys are restaurant IDs)
def generate_embeddings_dict(restaurants_data, is_distance_field_valid, min_winsorized_cost, max_winsorized_cost, min_winsorized_distance, max_winsorized_distance, avg_rating, avg_distance_meters):
    """Map each restaurant's 'restaurant_id' to the embedding built from the given parameters."""
    shared_params = (
        is_distance_field_valid,
        min_winsorized_cost,
        max_winsorized_cost,
        min_winsorized_distance,
        max_winsorized_distance,
        avg_rating,
        avg_distance_meters,
    )
    return {
        entry['restaurant_id']: generate_restaurant_embedding(entry, *shared_params)
        for entry in restaurants_data
    }

# Get the mean vector of a set of vectors
# NOTE: For column vectors, resultant mean is a column vector where each entry is the mean of the entries of that index across the vectors
def get_mean_vector(vectors):
    """Average a matrix of column vectors along axis 1, giving one mean per row."""
    return np.asanyarray(vectors).mean(axis=1)

In [None]:
## Restaurant Vectors Generation
def generate_restaurant_vectors():
    """Fetch all restaurants and return a dict of restaurant_id -> normalized embedding."""
    all_restaurants = get_restaurant_info()

    # The parameter list is ordered exactly like generate_embeddings_dict's trailing arguments
    (is_distance_field_valid, min_winsorized_cost, max_winsorized_cost,
     min_winsorized_distance, max_winsorized_distance,
     avg_rating, avg_distance_meters) = get_restaurant_embedding_params(all_restaurants)
    raw_vectors = generate_embeddings_dict(
        all_restaurants,
        is_distance_field_valid,
        min_winsorized_cost,
        max_winsorized_cost,
        min_winsorized_distance,
        max_winsorized_distance,
        avg_rating,
        avg_distance_meters,
    )

    # Normalize each restaurant vector
    return {restaurant_id: normalization(raw_vector) for restaurant_id, raw_vector in raw_vectors.items()}

In [None]:
## Consumer Vector Operations

# Generate consumer vector if it doesn't previously exist
def generate_init_consumer_vector(consumer_id, is_full_address_provided, street_address, city, postal_code, state, country):
    """Build the initial (unit-length) preference vector of a consumer.

    The vector is the weighted sum, using restaurant_type_weights, of the mean
    embeddings of the restaurants the consumer visited, ordered from and
    favorited. An interaction kind with no data falls back to the mean of all
    (normalized) restaurant embeddings.

    Args:
        consumer_id: Consumer whose interactions are fetched.
        is_full_address_provided, street_address, city, postal_code, state,
            country: Forwarded unchanged to get_consumer_info.

    Returns:
        The result of normalization() applied to the weighted sum.
    """
    # Embedding parameters determined at the level of all restaurants (to have consistent averages, min/max, etc)
    restaurants_data = get_restaurant_info()
    embedding_params = get_restaurant_embedding_params(restaurants_data)

    # Find mean embedding among all restaurants to fill in for missing site_visit/orders/favorited embeddings
    # NOTE: Built from the restaurant data fetched above (same steps as generate_restaurant_vectors) so the
    #       backend is queried only once and the parameters and vectors come from the same snapshot
    # NOTE: Approach for collecting embeddings dict values into numpy matrix of column vectors borrowed largely from: https://stackoverflow.com/questions/60493932/how-to-combine-column-vectors-into-a-matrix
    restaurant_vectors_dict = generate_embeddings_dict(restaurants_data, *embedding_params)
    normalized_restaurant_vectors = [normalization(vector) for vector in restaurant_vectors_dict.values()]
    mean_restaurant_vector = get_mean_vector(np.column_stack(normalized_restaurant_vectors))

    # Each interaction kind: its key in restaurant_type_weights -> the field to request
    interaction_fields = {
        'visit': ConsumerField.VISITS,
        'order': ConsumerField.ORDERS,
        'favorited': ConsumerField.FAVORITED,
    }

    # Get consumer vector (weighted average of mean vectors)
    # NOTE(review): the fallback mean comes from normalized vectors, while the per-interaction means
    #               come from un-normalized embeddings -- confirm this asymmetry is intended
    init_consumer_vector = 0
    for interaction, field in interaction_fields.items():
        interaction_restaurants = get_consumer_info(consumer_id, is_full_address_provided, street_address, city, postal_code, state, country, field)
        interaction_mean_vector = _mean_embedding_or_default(interaction_restaurants, embedding_params, mean_restaurant_vector)
        init_consumer_vector = init_consumer_vector + restaurant_type_weights[interaction] * interaction_mean_vector

    # Normalize consumer vector
    return normalization(init_consumer_vector)


# Mean embedding of the given restaurants, or default_vector when there are none
def _mean_embedding_or_default(interaction_restaurants, embedding_params, default_vector):
    if len(interaction_restaurants) == 0:
        return default_vector
    vectors_dict = generate_embeddings_dict(interaction_restaurants, *embedding_params)
    return get_mean_vector(np.column_stack(list(vectors_dict.values())))

# Update consumer vector based on user interactions with recommendations
def update_consumer_vector():
    """Update the consumer vector from interactions with recommendations.

    Not implemented yet. The explicit raise keeps this cell syntactically valid
    (a body consisting only of a comment is a syntax error) and makes any
    premature call fail loudly.

    Raises:
        NotImplementedError: Always, until the update logic is written.
    """
    # TODO: Implement the update
    raise NotImplementedError("update_consumer_vector is not implemented yet")

In [None]:
## Cosine similarity
# TODO: 

In [None]:
## Return top-N recommendations
# TODO: 

In [None]:
@app.get("/recommend/{consumer_id}")
def recommend(consumer_id: int, street_address: str | None = None, city: str | None = None, postal_code: str | None = None, state: str | None = None, country: str | None = None):
    """Recommendation endpoint (validation only for now; the recommendation logic is still TODO).

    Args:
        consumer_id: Consumer to recommend for (path parameter).
        street_address, city, postal_code, state, country: Optional address
            components; either all of them or none of them must be given.

    Raises:
        HTTPException: 400 when only some of the address components are given.
    """
    address_fields = [street_address, city, postal_code, state, country]
    is_address_provided = any(address_arg is not None for address_arg in address_fields)
    is_full_address_provided = all(address_arg is not None for address_arg in address_fields)
    # A partially filled address is ambiguous, so reject it instead of guessing
    if is_address_provided and not is_full_address_provided:
        raise HTTPException(status_code=400, detail="Missing some address fields")

    # TODO: Recommendation code to be called here
    # TODO: Add consumer_id & address arguments & is_full_address_provided as argument that propagates down to generate_init_consumer_vector