In [5]:
# !pip3 install pandas scikit-learn firebase-admin

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import firebase_admin
from firebase_admin import credentials, firestore
from sklearn.preprocessing import MinMaxScaler

# Data Collection

In [33]:
key = "cm_firebase.json"

In [34]:
# Initialize Firestore
cred = credentials.Certificate(key)
firebase_admin.initialize_app(cred)

# Initialize Firestore client
db = firestore.client()

In [35]:
# Fetch orders collection
def fetch_db(col):
    col_ref = db.collection(col)
    docs = col_ref.stream()

    col_data = []
    for doc in docs:
        col_data.append(doc.to_dict())

    return pd.DataFrame(col_data)  # Convert to DataFrame

In [40]:
# Load data into DataFrames
users_df = fetch_db('users')
orders_df = fetch_db('mock_orders')

In [41]:
# Drop unnecessary columns
users_df.drop(columns=['password'], inplace=True)

# Data Preprocessing & Feature Engineering

users_df

In [44]:
# Extract user's health conditions
# Create a MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Fit and transform the 'healthCondition' column
health_encoded = mlb.fit_transform(users_df['healthCondition'])

# Create a new DataFrame with the one-hot encoded columns
health_df = pd.DataFrame(health_encoded, columns=mlb.classes_, index=users_df.index)
health_df

# Concatenate the one-hot encoded DataFrame with the original DataFrame
users_df = pd.concat([users_df, health_df], axis=1)

# Drop the original 'healthCondition' column (optional)
users_df = users_df.drop(columns=['healthCondition'])

users_df.head()

Unnamed: 0,email,preferences,ageGroup,Diabetes,Gas Reflux,Heart Disease,High Pressure,Lactose Intolerance,Liver Disease,Sleep Disorder
0,user1@example.com,"{'cream': '0', 'sweet': '1', 'caffeine': '2'}",35,0,1,1,0,1,0,0
1,user12@example.com,"{'cream': '1', 'sweet': '3', 'caffeine': '1'}",53,0,0,0,0,1,1,0
2,user15@example.com,"{'cream': '1', 'sweet': '0', 'caffeine': '1'}",29,0,0,0,1,0,0,0
3,user9@example.com,"{'cream': '0', 'sweet': '3', 'caffeine': '3'}",25,0,0,0,1,0,1,0
4,user6@example.com,"{'cream': '1', 'sweet': '2', 'caffeine': '1'}",54,1,0,1,1,0,0,0


In [45]:
users_df.columns

Index(['email', 'preferences', 'ageGroup', 'Diabetes', 'Gas Reflux',
       'Heart Disease', 'High Pressure', 'Lactose Intolerance',
       'Liver Disease', 'Sleep Disorder'],
      dtype='object')

In [46]:
# split prefered ingredient level into its own column
df_expanded = pd.json_normalize(users_df['preferences'])
df_expanded = df_expanded.rename(columns={
    'caffeine': 'preferCaffeine',
    'sweet': 'preferSweet',
    'cream': 'preferCream'
})
users_df = pd.concat([users_df, df_expanded], axis=1)
users_df = users_df.drop(columns=['preferences'])
users_df.head()

Unnamed: 0,email,ageGroup,Diabetes,Gas Reflux,Heart Disease,High Pressure,Lactose Intolerance,Liver Disease,Sleep Disorder,preferCream,preferSweet,preferCaffeine
0,user1@example.com,35,0,1,1,0,1,0,0,0,1,2
1,user12@example.com,53,0,0,0,0,1,1,0,1,3,1
2,user15@example.com,29,0,0,0,1,0,0,0,1,0,1
3,user9@example.com,25,0,0,0,1,0,1,0,0,3,3
4,user6@example.com,54,1,0,1,1,0,0,0,1,2,1


In [47]:
prefer_ingredients = users_df[['email','preferCaffeine', 'preferSweet', 'preferCream']].copy()
minmax = MinMaxScaler()
users_df[['preferCaffeine', 'preferSweet', 'preferCream']] = minmax.fit_transform(users_df[['preferCaffeine', 'preferSweet', 'preferCream']])
users_df.head()

Unnamed: 0,email,ageGroup,Diabetes,Gas Reflux,Heart Disease,High Pressure,Lactose Intolerance,Liver Disease,Sleep Disorder,preferCream,preferSweet,preferCaffeine
0,user1@example.com,35,0,1,1,0,1,0,0,0.0,0.333333,0.5
1,user12@example.com,53,0,0,0,0,1,1,0,1.0,1.0,0.0
2,user15@example.com,29,0,0,0,1,0,0,0,1.0,0.0,0.0
3,user9@example.com,25,0,0,0,1,0,1,0,0.0,1.0,1.0
4,user6@example.com,54,1,0,1,1,0,0,0,1.0,0.666667,0.0


In [48]:
#categorize age into age group
users_df['ageGroup'] = pd.to_numeric(users_df['ageGroup'], errors='coerce') # Convert to numeric, handle errors

users_df['ageGroup'] = pd.cut(users_df['ageGroup'], bins=[17, 30, 40, 50, 60, 100]
                               , labels=['18-30', '31-40', '41-50', '51-60', '61+'])
users_df = pd.get_dummies(users_df, columns=['ageGroup'])
users_df.head()

Unnamed: 0,email,Diabetes,Gas Reflux,Heart Disease,High Pressure,Lactose Intolerance,Liver Disease,Sleep Disorder,preferCream,preferSweet,preferCaffeine,ageGroup_18-30,ageGroup_31-40,ageGroup_41-50,ageGroup_51-60,ageGroup_61+
0,user1@example.com,0,1,1,0,1,0,0,0.0,0.333333,0.5,False,True,False,False,False
1,user12@example.com,0,0,0,0,1,1,0,1.0,1.0,0.0,False,False,False,True,False
2,user15@example.com,0,0,0,1,0,0,0,1.0,0.0,0.0,True,False,False,False,False
3,user9@example.com,0,0,0,1,0,1,0,0.0,1.0,1.0,True,False,False,False,False
4,user6@example.com,1,0,1,1,0,0,0,1.0,0.666667,0.0,False,False,False,True,False


In [50]:
print(list(users_df.columns))

['email', 'Diabetes', 'Gas Reflux', 'Heart Disease', 'High Pressure', 'Lactose Intolerance', 'Liver Disease', 'Sleep Disorder', 'preferCream', 'preferSweet', 'preferCaffeine', 'ageGroup_18-30', 'ageGroup_31-40', 'ageGroup_41-50', 'ageGroup_51-60', 'ageGroup_61+']


*****

orders_df

In [51]:
mock_orders_df = fetch_db('mock_orders')

In [52]:
orders_df = fetch_db('orders')

In [53]:
orders_df = pd.concat([orders_df, mock_orders_df], ignore_index=True)

In [54]:
orders_df = orders_df.drop(columns='water')

In [55]:
#fill in nan with 0
orders_df.fillna(0, inplace=True)
orders_df.head()

Unnamed: 0,menu,milk,shots,email,timestamp,sweetness
0,feifei,1,3,fei@gmail.com,2025-02-08 15:20:10.425000+00:00,100
1,menuee,2,3,ba@na.naa,2025-02-04 05:50:04.869000+00:00,100
2,Cappuccino,1,2,ba@na.naa,2025-02-04 05:45:39.077000+00:00,100
3,Black,0,3,user10@example.com,2025-02-17 09:19:43.156000+00:00,150
4,Americano,0,1,user10@example.com,2025-02-27 10:24:17.303000+00:00,50


In [57]:
#separate time in hour from timestamp and categorize in time group
orders_df['time'] = pd.to_datetime(orders_df['timestamp'], utc=True).dt.hour
orders_df['time_group'] = pd.cut(orders_df['time'], bins=[-1, 6, 12, 18, 24]
                               , labels=['early', 'morning', 'afternoon', 'evening'])
orders_df.drop(columns=['timestamp','time'], inplace=True)
orders_df.head()


Unnamed: 0,menu,milk,shots,email,sweetness,time_group
0,feifei,1,3,fei@gmail.com,100,afternoon
1,menuee,2,3,ba@na.naa,100,early
2,Cappuccino,1,2,ba@na.naa,100,early
3,Black,0,3,user10@example.com,150,morning
4,Americano,0,1,user10@example.com,50,morning


In [58]:
# Create dummy variables for menu items
time_dummies = pd.get_dummies(orders_df["time_group"], prefix="", prefix_sep="")

# Concatenate the dummy variables with email
time_count_per_user = orders_df[["email"]].join(time_dummies)

# Group by email and sum to get order frequency per user
time_count_summary = time_count_per_user.groupby("email").sum()

# Calculate the total count of all menu items per user
total_time_counts = time_count_summary.sum(axis=1)

# Calculate the proportion of each menu count over the total count per user
time_proportion_per_user = time_count_summary.div(total_time_counts, axis=0)

In [59]:
# Scale ingredient levels
scale_map = {0:0, 50: 1, 100: 2, 150: 3, 200:3}
orders_df['sweetness'] = orders_df['sweetness'].astype(int).map(scale_map)
orders_df['shots'] = orders_df['shots'].astype(int)
orders_df['milk'] = orders_df['milk'].astype(int).apply(lambda x: 0 if x == 0 else 1).fillna(0)

orders_df.head()

Unnamed: 0,menu,milk,shots,email,sweetness,time_group
0,feifei,1,3,fei@gmail.com,2,afternoon
1,menuee,1,3,ba@na.naa,2,early
2,Cappuccino,1,2,ba@na.naa,2,early
3,Black,0,3,user10@example.com,3,morning
4,Americano,0,1,user10@example.com,1,morning


In [60]:
# sweetness, shots, milk: average
average_features = orders_df[['email', 'sweetness', 'shots', 'milk']].copy()
average_summary = average_features.groupby('email').mean()
avg_ingredients = average_summary.copy()

minmax = MinMaxScaler()
average_summary[['sweetness', 'shots', 'milk']] = minmax.fit_transform(average_summary[['sweetness', 'shots', 'milk']])
average_summary.head()

Unnamed: 0_level_0,sweetness,shots,milk
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ba@na.naa,0.4,0.625,1.0
eieieie@iid.com,0.8,0.0,1.0
fei@gmail.com,0.15,0.166667,0.433333
pim@mat.com,0.228571,0.714286,0.0
try@gmail.com,0.8,0.0,0.0


In [61]:
# Create dummy variables for menu items
menu_dummies = pd.get_dummies(orders_df["menu"], prefix="menu_", prefix_sep="")

# Define the list of valid menu items
menus = ["menu_Black", "menu_Cappuccino", "menu_Latte", "menu_Americano", "menu_Espresso"]

# Concatenate the dummy variables with email
menu_count_per_user = orders_df[["email"]].join(menu_dummies[menus])

# Group by email and sum to get order frequency per user
menu_count_summary = menu_count_per_user.groupby("email").sum()

# Calculate the total count of all menu items per user
total_menu_counts = menu_count_summary.sum(axis=1)

# Calculate the proportion of each menu count over the total count per user
menu_proportion_per_user = menu_count_summary.div(total_menu_counts, axis=0)

In [62]:
new_orders_df = pd.concat([time_proportion_per_user,menu_proportion_per_user, average_summary], axis=1)
new_orders_df

Unnamed: 0_level_0,early,morning,afternoon,evening,menu_Black,menu_Cappuccino,menu_Latte,menu_Americano,menu_Espresso,sweetness,shots,milk
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ba@na.naa,1.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0.0,0.4,0.625,1.0
eieieie@iid.com,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,1.0
fei@gmail.com,0.433333,0.4,0.166667,0.0,0.0,0.166667,0.125,0.666667,0.041667,0.15,0.166667,0.433333
pim@mat.com,1.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.333333,0.5,0.228571,0.714286,0.0
try@gmail.com,1.0,0.0,0.0,0.0,,,,,,0.8,0.0,0.0
user10@example.com,0.307692,0.538462,0.076923,0.076923,0.230769,0.307692,0.0,0.384615,0.076923,0.530769,0.432692,0.307692
user11@example.com,0.2,0.0,0.4,0.4,0.2,0.2,0.6,0.0,0.0,0.5,0.75,0.8
user12@example.com,0.0,0.2,0.8,0.0,0.6,0.0,0.0,0.0,0.4,0.6,0.625,0.0
user13@example.com,0.0,0.8,0.2,0.0,0.6,0.4,0.0,0.0,0.0,0.2,0.5,0.2
user14@example.com,0.0,0.6,0.4,0.0,0.2,0.4,0.0,0.2,0.2,0.5,0.5,0.2


In [63]:
merged_df = pd.merge(new_orders_df, users_df, on='email')
merged_df

Unnamed: 0,email,early,morning,afternoon,evening,menu_Black,menu_Cappuccino,menu_Latte,menu_Americano,menu_Espresso,...,Liver Disease,Sleep Disorder,preferCream,preferSweet,preferCaffeine,ageGroup_18-30,ageGroup_31-40,ageGroup_41-50,ageGroup_51-60,ageGroup_61+
0,ba@na.naa,1.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0.0,...,0,1,1.0,0.0,1.0,False,False,True,False,False
1,fei@gmail.com,0.433333,0.4,0.166667,0.0,0.0,0.166667,0.125,0.666667,0.041667,...,0,1,1.0,0.333333,0.5,True,False,False,False,False
2,pim@mat.com,1.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.333333,0.5,...,0,1,0.0,0.666667,1.0,False,False,False,False,True
3,user10@example.com,0.307692,0.538462,0.076923,0.076923,0.230769,0.307692,0.0,0.384615,0.076923,...,0,0,1.0,0.333333,0.0,False,True,False,False,False
4,user11@example.com,0.2,0.0,0.4,0.4,0.2,0.2,0.6,0.0,0.0,...,0,0,0.0,0.0,1.0,False,True,False,False,False
5,user12@example.com,0.0,0.2,0.8,0.0,0.6,0.0,0.0,0.0,0.4,...,1,0,1.0,1.0,0.0,False,False,False,True,False
6,user13@example.com,0.0,0.8,0.2,0.0,0.6,0.4,0.0,0.0,0.0,...,0,0,1.0,1.0,1.0,False,True,False,False,False
7,user14@example.com,0.0,0.6,0.4,0.0,0.2,0.4,0.0,0.2,0.2,...,0,0,0.0,0.333333,0.5,False,True,False,False,False
8,user15@example.com,0.2,0.2,0.4,0.2,0.0,0.2,0.0,0.6,0.2,...,0,0,1.0,0.0,0.0,True,False,False,False,False
9,user16@example.com,0.2,0.0,0.2,0.6,0.0,0.0,0.6,0.2,0.2,...,1,0,0.0,0.0,1.0,True,False,False,False,False


In [64]:
print(list(merged_df.columns))

['email', 'early', 'morning', 'afternoon', 'evening', 'menu_Black', 'menu_Cappuccino', 'menu_Latte', 'menu_Americano', 'menu_Espresso', 'sweetness', 'shots', 'milk', 'Diabetes', 'Gas Reflux', 'Heart Disease', 'High Pressure', 'Lactose Intolerance', 'Liver Disease', 'Sleep Disorder', 'preferCream', 'preferSweet', 'preferCaffeine', 'ageGroup_18-30', 'ageGroup_31-40', 'ageGroup_41-50', 'ageGroup_51-60', 'ageGroup_61+']


***

# Model Training

In [65]:
# Train-Test Split and Evaluation
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

Collaborative Filtering

In [66]:
# Fit KNN model
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(merged_df.drop(columns=['email']).values)
merged_df.set_index('email', inplace=True)

In [67]:
import random

def random_menu():
    menus = ["Black", "Cappuccino", "Latte", "Americano", "Espresso"]
    return random.choice(menus)

In [68]:
def recommend_menu(email):
    if email not in users_df.index:
        return [random_menu()]  # No data available

    # Find similar users
    distances, indices = knn.kneighbors(merged_df.loc[email].values.reshape(1, -1))
    similar_users = merged_df.iloc[indices[0][1:]].index  #skip the first one (user themself)

    # Ensure similar users exist in new_orders_df
    similar_users = [user for user in similar_users if user in merged_df.index]

    if not similar_users:
        return [random_menu()]  # No similar users found in new_orders_df

    # Use preprocessed order data (new_orders_df) for recommendations
    recommended_menus = new_orders_df.loc[similar_users].sum().filter(like='menu_').sort_values(ascending=False).index.str.replace('menu_', '').tolist()

    return recommended_menus[:3]

In [70]:
# Define function to generate ingredient recommendations
def recommend_ingredients(email):
    if email in users_df.index:
        # Retrieve preprocessed ingredient values directly from merged_df
        avg_sweetness = avg_ingredients.iloc[email, 'sweetness']
        avg_shots = avg_ingredients.iloc[email, 'shots']
    else:
        # Fallback to user preferences if no historical data
        user_prefs = prefer_ingredients.loc[prefer_ingredients['email'] == email, ['preferCaffeine', 'preferSweet']].iloc[0]
        avg_sweetness, avg_shots = user_prefs['preferSweet'], user_prefs['preferCaffeine']

    return round(int(avg_sweetness)), round(int(avg_shots))

In [2]:
# Define health rule function
def apply_health_rules(email, menu, sweetness, shots):
    # Check if email exists in users_df
    if email not in users_df['email'].values:
        raise ValueError(f"Email {email} not found in users_df.")

    # Get user's health conditions as one-hot encoded values
    user_health_conditions = users_df[users_df['email'] == email].iloc[0]

    # Define health adjustment groups
    less_sugar_conditions = {'High Pressure', 'Diabetes', 'Liver Disease', 'Heart Disease', 'Sleep Disorder'}
    less_shots_conditions = {'High Pressure', 'Sleep Disorder', 'Gas Reflux'}

    # Adjust sweetness
    if any(user_health_conditions.get(cond, 0) == 1 for cond in less_sugar_conditions):
        if sweetness != 0:
            sweetness = 1

    # Adjust shots
    if any(user_health_conditions.get(cond, 0) == 1 for cond in less_shots_conditions):
        shots = max(1, min(shots - 1, 3))

    return sweetness, shots

In [74]:
def format_recommendation(email, menu, sweetness, shots, milk):
    return {
        "email": email,
        "menu": menu,
        "shots": shots,
        "sweetness": sweetness,
        "milk": milk
    }

In [75]:
# Define function to get final recommendation
def get_final_recommendation(email):
    # Get menu recommendations using collaborative filtering
    recommendations = recommend_menu(email)
    if not recommendations:
        return None  # No recommendations available

    # Generate ingredient recommendations based on collaborative filtering
    adjusted_sweetness, adjusted_shots = recommend_ingredients(email)

    # Apply health rules
    selected_menu = recommendations[0]  # Use the top recommendation
    adjusted_sweetness, adjusted_shots = apply_health_rules(
        email,
        selected_menu,
        adjusted_sweetness,
        adjusted_shots,
    )

    if selected_menu == 'Cappuccino': milk = 1
    elif selected_menu == 'Latte': milk = 2
    else: milk = 0;

    return format_recommendation(
        email=email,
        menu=selected_menu,
        sweetness=adjusted_sweetness*50,
        shots=adjusted_shots,
        milk=milk
    )

# Test Results

In [82]:
# check result
for email in test_df['email']:
    recommendation = get_final_recommendation(email)
    if recommendation:
        print(recommendation)

{'email': 'user2@example.com', 'menu': 'Espresso', 'shots': 3, 'sweetness': 50, 'milk': 0}
{'email': 'user16@example.com', 'menu': 'Black', 'shots': 2, 'sweetness': 0, 'milk': 0}
{'email': 'ba@na.naa', 'menu': 'Latte', 'shots': 2, 'sweetness': 0, 'milk': 2}
{'email': 'user15@example.com', 'menu': 'Americano', 'shots': 1, 'sweetness': 0, 'milk': 0}
{'email': 'user4@example.com', 'menu': 'Espresso', 'shots': 1, 'sweetness': 50, 'milk': 0}


# Recommend to Firestore

In [83]:
def send_to_firestore(recommendation):
    # Convert NumPy types to native Python types
    for key, value in recommendation.items():
        if isinstance(value, np.int64):
            recommendation[key] = int(value)

    doc_ref = db.collection('recommendations').document(str(recommendation['email']))
    doc_ref.set(recommendation)
    print(f"Recommendation sent to Firestore for {recommendation['email']}")

In [84]:
# Test the Complete Pipeline
for email in users_df['email']:
    recommendation = get_final_recommendation(email)
    if recommendation:
      send_to_firestore(recommendation)
      print(f"Recommendation sent to Firestore for {recommendation['email']}")
    else:
      print(f"No recommendation found for {email}")

Recommendation sent to Firestore for user1@example.com
Recommendation sent to Firestore for user1@example.com
Recommendation sent to Firestore for user12@example.com
Recommendation sent to Firestore for user12@example.com
Recommendation sent to Firestore for user15@example.com
Recommendation sent to Firestore for user15@example.com
Recommendation sent to Firestore for user9@example.com
Recommendation sent to Firestore for user9@example.com
Recommendation sent to Firestore for user6@example.com
Recommendation sent to Firestore for user6@example.com
Recommendation sent to Firestore for ba@na.naa
Recommendation sent to Firestore for ba@na.naa
Recommendation sent to Firestore for user14@example.com
Recommendation sent to Firestore for user14@example.com
Recommendation sent to Firestore for user4@example.com
Recommendation sent to Firestore for user4@example.com
Recommendation sent to Firestore for user13@example.com
Recommendation sent to Firestore for user13@example.com
Recommendation sen