In [1]:
!pip install transformers pyvis
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cu121.html
!pip install torch_geometric

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython>=5.3.0->pyvis)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi, pyvis
Successfully installed jedi-0.19.1 pyvis-0.3.2
Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/pyg_lib-0.4.0%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_scatter-2.1.2%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (10.9 

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
from transformers import AutoModel
import torch
from pyvis.network import Network
from numpy.linalg import norm
import json
import pandas as pd
from tqdm import tqdm
import networkx as nx
from IPython.core.display import display, HTML
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from sklearn.cluster import DBSCAN
from collections import defaultdict

from torch import Tensor
from torch_geometric.data import HeteroData

In [4]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = DEVICE
device

'cuda'

## Part 1: Making the dataset

In [5]:
emb_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True) # trust_remote_code is needed to use the encode method
emb_model = emb_model.to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
def get_embedding_from_model(string: str) -> list[float]:
  with torch.inference_mode():
    embeddings = emb_model.encode([string])
  return embeddings[0]

In [7]:
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))

In [8]:
def load_data_from_json(path_to_json: str) -> list[dict]:

    with open(path_to_json, "r") as file:
        data = json.load(file)

    return data

In [9]:
data = load_data_from_json(path_to_json=r'/content/drive/MyDrive/VNP_PROJECT/1k.json')
data = data[:1000]

In [10]:
def get_ingrs_for_recipe(recipe: dict) -> tuple[str, list[str]]:
  return recipe['title'], [ingr['name'] for ingr in recipe['ingredients']]

In [11]:
data_dict = {
    'ingredient': [],
    'recipe': []
}

In [12]:
for recipe in data:
  recipe_name, ingrs = get_ingrs_for_recipe(recipe)
  for ingr in ingrs:
    data_dict['ingredient'].append(ingr)
    data_dict['recipe'].append(recipe_name)

In [13]:
df = pd.DataFrame(data_dict)
df

Unnamed: 0,ingredient,recipe
0,penne,Worlds Best Mac and Cheese
1,cheese sauce,Worlds Best Mac and Cheese
2,cheddar cheese,Worlds Best Mac and Cheese
3,gruyere cheese,Worlds Best Mac and Cheese
4,dried chipotle powder,Worlds Best Mac and Cheese
...,...,...
8967,green beans,Turkey and Stuffing Casserole
8968,cream of chicken soup,Turkey and Stuffing Casserole
8969,milk,Turkey and Stuffing Casserole
8970,seasoned stuffing mix,Turkey and Stuffing Casserole


In [14]:
food_recipe_graph = nx.DiGraph()

In [15]:
for index, row in df.iterrows():
  food_recipe_graph.add_edge(row['recipe'], row['ingredient'], label='has_ingr')

In [16]:
net = Network(notebook=True, cdn_resources='in_line', directed=True)
net.from_nx(nx_graph=food_recipe_graph)

for node in df['recipe'].unique():
    net.get_node(node)['color'] = 'rgba(255,182,193,0.5)'


for node in net.nodes:
    node['font'] = {'size': 12}

for edge in net.edges:
    edge['font'] = {'size': 10}


net.save_graph('example.html')

In [17]:
# with open('example.html', 'r') as file:
#     html_content = file.read()

# display(HTML(html_content))

In [18]:
def get_corr_matrix_from_model(list_of_strings: list[str]) -> list[list[float]]:
  with torch.inference_mode():
    embeddings = emb_model.encode(list_of_strings)
  return embeddings

In [19]:
unique_ingrs = df['ingredient'].unique()
corr_matrix = get_corr_matrix_from_model(unique_ingrs)
cos_matrix = cosine_similarity(corr_matrix)
distance_matrix = 1 - cos_matrix
distance_matrix[distance_matrix < 0] = 0

In [20]:
dbscan = DBSCAN(eps=0.04, min_samples=2, metric='precomputed') # eps = 1 - 0.96, where 0.96 is the treshold
labels = dbscan.fit_predict(distance_matrix)

clusters = defaultdict(list)
for idx, label in enumerate(labels):
    clusters[label].append(unique_ingrs[idx])

for cluster_id, items in clusters.items():
    if cluster_id != -1:
        print(f'Cluster {cluster_id}: {set(items)}')

Cluster 0: {'all - purpose flour', 'all - purpose white flour'}
Cluster 1: {'green bell peppers', 'red peppers', 'red bell peppers', 'green pepper', 'bell peppers', 'red bell pepper', 'green peppers', 'orange bell pepper', 'red chili pepper', 'bell pepper', 'red pepper', 'green bell pepper'}
Cluster 2: {'dried dill weed', 'dry dill weed'}
Cluster 3: {'red onion', 'red onions'}
Cluster 4: {'yellow pepper', 'yellow bell pepper', 'yellow bell peppers'}
Cluster 5: {'cucumber', 'cucumbers'}
Cluster 6: {'fresh basil leaf', 'fresh basil', 'fresh basil leaves'}
Cluster 7: {'graham cracker crust', 'graham cracker crumb crust'}
Cluster 8: {'salt and pepper', 'salt and black pepper', 'salt & pepper'}
Cluster 9: {'soy sauce', 'soya sauce'}
Cluster 10: {'cashews', 'cashew nuts'}
Cluster 11: {'low - fat vanilla yogurt', 'non - fat vanilla yogurt'}
Cluster 12: {'green onions', 'green onion'}
Cluster 13: {'fennel bulb', 'fennel bulbs'}
Cluster 14: {'reduced - sodium fat - free chicken broth', 'reduced

In [21]:
def switch_edge(edges: list[tuple[str, str, dict]], node: str, index: int = 0) -> list[tuple[str, str, str]]:
  changed_edges = []
  for edge in edges:
    x, y, data_dict = edge
    label = data_dict['label']
    changed_edges.append((node, y, label) if index==0 else (x, node, label))
  return changed_edges

In [22]:
to_switch_matrix = [list(set(items)) for cluster_id, items in clusters.items()]
for class_of_similar_nodes in to_switch_matrix:

  similar_nodes = class_of_similar_nodes[1:]
  main_node = class_of_similar_nodes[0]

  for node in similar_nodes:
    outgoing_edges = list(food_recipe_graph.out_edges(node, data=True))
    incoming_edges = list(food_recipe_graph.in_edges(node, data=True))

    food_recipe_graph.remove_node(node)

    changed_incoming = switch_edge(incoming_edges, main_node, 1)
    changed_outgoing = switch_edge(outgoing_edges, main_node, 0)

    for u, v, label in changed_incoming + changed_outgoing:
      food_recipe_graph.add_edge(u, v, label=label)

In [23]:
net = Network(notebook=True, cdn_resources='in_line', directed=True)
net.from_nx(nx_graph=food_recipe_graph)

for node in df['recipe'].unique():
    net.get_node(node)['color'] = 'rgba(255,182,193,0.5)'


for node in net.nodes:
    node['font'] = {'size': 12}

for edge in net.edges:
    edge['font'] = {'size': 10}


net.save_graph('example2.html')

In [24]:
# with open('example2.html', 'r') as file:
#     html_content = file.read()

# display(HTML(html_content))

## Part 2: Combining both Graphs

Add the recipe subs graph and connect it to the existing graph by using cosine similarity if sim(node_i(food_recipe_graph), node_j(subs_graph)) > 0.95, we are talking about the same node so connect them.

Also need to make a dataframe and a heterodata graph so as to make a GNN so it can learn label prediction (is something a valid sub).

In [25]:
subs_df = pd.read_csv('/content/drive/MyDrive/VNP_PROJECT/whole_wrangled.csv')
subs_df['Also known as'] = subs_df['Also known as'].astype(str)
subs_df['Substitutions'] = subs_df['Substitutions'].astype(str)

In [26]:
subs_df

Unnamed: 0,Name,Description,Also known as,Scientific name,Substitutions,Food Type
0,A1 Sauce,This is a steak sauce similar to Pickapeppa an...,steak sauce,,"barbecue sauce,ketchup",flavorings
1,absinthe,This potent anise-flavored liqueur contains th...,absynthe,,"Herbsaint,Pernod,anisette",flavorings
2,acesulfame K,An artifical sweetner. Popular brands are Sune...,"Sunnett, Sweet One",,"advantame,aspartame,saccarin,sucralose,Neotame",flavorings
3,acidulated water,This is water that's been mixed with a small a...,,,water,flavorings
4,adobo sauce,,,,Tabasco sauce,flavorings
...,...,...,...,...,...,...
4418,weisswurst,These are mildly seasoned German veal sausages...,"weißwürste, white sausage",Bos taurus,"bockwurst,bratwurst,boudin blanc",meats
4419,Westphalian ham,This choice German ham is smoked over beechwoo...,Westfalischer Schinken,Sus scrofa domesticus,"Black Forest ham,prosciutto,country ham,Ardenn...",meats
4420,wunderwurst,This is liverwurst dotted with pistachios.,,,liverwurst,meats
4421,York ham,"This is a lightly smoked, dry-cured British ha...",,Sus scrofa domesticus,"Black Forest ham,prosciutto,country ham,Ardenn...",meats


In [27]:
food_subs_graph = nx.DiGraph()

In [28]:
for index, row in list(subs_df.iterrows()):
  if row['Substitutions'] != 'nan':
    subs = row['Substitutions'].split(',')
    for sub in subs:
      food_subs_graph.add_edge(row['Name'], sub, label='has_sub')
  if row['Also known as'] != 'nan':
    akas = row['Also known as'].split(', ')
    for aka in akas:
      food_subs_graph.add_edge(row['Name'], aka, label='also_known_as')

In [29]:
net = Network(notebook=True, cdn_resources='in_line', directed=True)
net.from_nx(nx_graph=food_subs_graph)
net.save_graph('subs.html')

In [30]:
# with open('subs.html', 'r') as file:
#     html_content = file.read()

# display(HTML(html_content))

To do: connect the two graphs using cosine similarity

In [31]:
connected_df = df.copy()

In [32]:
connected_df

Unnamed: 0,ingredient,recipe
0,penne,Worlds Best Mac and Cheese
1,cheese sauce,Worlds Best Mac and Cheese
2,cheddar cheese,Worlds Best Mac and Cheese
3,gruyere cheese,Worlds Best Mac and Cheese
4,dried chipotle powder,Worlds Best Mac and Cheese
...,...,...
8967,green beans,Turkey and Stuffing Casserole
8968,cream of chicken soup,Turkey and Stuffing Casserole
8969,milk,Turkey and Stuffing Casserole
8970,seasoned stuffing mix,Turkey and Stuffing Casserole


In [33]:
subs_df

Unnamed: 0,Name,Description,Also known as,Scientific name,Substitutions,Food Type
0,A1 Sauce,This is a steak sauce similar to Pickapeppa an...,steak sauce,,"barbecue sauce,ketchup",flavorings
1,absinthe,This potent anise-flavored liqueur contains th...,absynthe,,"Herbsaint,Pernod,anisette",flavorings
2,acesulfame K,An artifical sweetner. Popular brands are Sune...,"Sunnett, Sweet One",,"advantame,aspartame,saccarin,sucralose,Neotame",flavorings
3,acidulated water,This is water that's been mixed with a small a...,,,water,flavorings
4,adobo sauce,,,,Tabasco sauce,flavorings
...,...,...,...,...,...,...
4418,weisswurst,These are mildly seasoned German veal sausages...,"weißwürste, white sausage",Bos taurus,"bockwurst,bratwurst,boudin blanc",meats
4419,Westphalian ham,This choice German ham is smoked over beechwoo...,Westfalischer Schinken,Sus scrofa domesticus,"Black Forest ham,prosciutto,country ham,Ardenn...",meats
4420,wunderwurst,This is liverwurst dotted with pistachios.,,,liverwurst,meats
4421,York ham,"This is a lightly smoked, dry-cured British ha...",,Sus scrofa domesticus,"Black Forest ham,prosciutto,country ham,Ardenn...",meats


In [34]:
sub_names = subs_df['Name'].values

In [35]:
subs_embeddings = []
for index, row in tqdm(list(subs_df.iterrows())):
  all_subs = [row['Name']] + row['Substitutions'].split(',')
  try:
    all_subs.remove('nan')
  except:
    ...
  subs_embeddings.append(list(map(get_embedding_from_model, all_subs)))

100%|██████████| 4423/4423 [02:47<00:00, 26.41it/s]


In [36]:
# Step 1: Collect all ingredient embeddings and substitute embeddings
ingr_embeddings = np.array([get_embedding_from_model(row['ingredient']) for _, row in connected_df.iterrows()])
subs_embeddings_flat = np.vstack(subs_embeddings)  # Flatten list of lists to a single matrix

# Step 2: Normalize the embeddings
ingr_embeddings_norm = ingr_embeddings / np.linalg.norm(ingr_embeddings, axis=1, keepdims=True)
subs_embeddings_norm = subs_embeddings_flat / np.linalg.norm(subs_embeddings_flat, axis=1, keepdims=True)

# Step 3: Matrix multiplication to compute cosine similarities
cos_sim_matrix = np.dot(ingr_embeddings_norm, subs_embeddings_norm.T)

# Step 4: Precompute the index ranges for each substitute type
sub_indices = np.cumsum([len(emb_list) for emb_list in subs_embeddings])

# Step 5: Filter results based on threshold
threshold = 0.96
new_list_of_subs = []

for i, row in tqdm(enumerate(connected_df.itertuples()), total=len(connected_df)):
    similar_indices = np.where(cos_sim_matrix[i] > threshold)[0]
    for idx in similar_indices:
        # Find the corresponding substitute type using precomputed indices
        sub_type_idx = np.searchsorted(sub_indices, idx, side='right')
        new_list_of_subs.append((row.ingredient, sub_names[sub_type_idx]))

100%|██████████| 8972/8972 [00:00<00:00, 13884.37it/s]


In [37]:
# new_list_of_subs = []
# for _, row in tqdm(list(connected_df.iterrows())):
#   ingr_emb = get_embedding_from_model(row['ingredient'])
#   for index, subs_embeddings_of_similar_type in enumerate(subs_embeddings): # index of subs matrix
#     for sub_emb in subs_embeddings_of_similar_type:
#       cos_score = cos_sim(ingr_emb, sub_emb)
#       if cos_score  > 0.96:
#         #print(f"{row['ingredient']} is similar to {sub_names[index]}")
#         new_list_of_subs.append((row['ingredient'], sub_names[index]))
#         break

All class items vs. all subs

In [38]:
new_edges_dict = {
    'Ingredient': [],
    'Substitution': []
}
for ingr, sub in new_list_of_subs:
  new_edges_dict['Ingredient'].append(ingr)
  new_edges_dict['Substitution'].append(sub)

In [39]:
new_edges_df = pd.DataFrame(new_edges_dict)

In [40]:
new_edges_df.to_csv('edges.csv')

In [41]:
new_edges_df

Unnamed: 0,Ingredient,Substitution
0,penne,cannolicchi
1,penne,conchiglie
2,penne,elicoidali
3,penne,fusilli
4,penne,garganelli
...,...,...
81055,green beans,yard-long bean
81056,pepper,garam masala
81057,pepper,Pepper
81058,pepper,Salt


In [42]:
new_edges_df.groupby('Ingredient').agg(lambda x: "|".join(x))

Unnamed: 0_level_0,Substitution
Ingredient,Unnamed: 1_level_1
2% low - fat milk,"milk 2%, Low-fat|milk 2%, Low-fat|milk 2%, Low..."
American cheese,American cheese|Cheddar|Colby|processed cheese...
Angostura bitters,Angostura® bitters|Peychaud's bitters|Angostur...
Anjou pear,Anjou pear|Bartlett pear|Comice pear|pome Fruit
Belgian endive,arugula|Belgian endive|cress|endigia|endive|pu...
...,...
yellow squash,cucuzza|opo squash|pattypan squash|yellow squa...
yogurt,mayonnaise|silken tofu|silken tofu|buttermilk|...
yukon gold potato,Bintje potato|Caribe potato|purple potato|red-...
yukon gold potatoes,Bintje potato|Caribe potato|purple potato|red-...


In [43]:
composed_graph = nx.compose(food_recipe_graph, food_subs_graph)

In [44]:
for index, row in new_edges_df.iterrows():
  ingr = row['Ingredient']
  sub = row['Substitution']

  for cluster_id, items in clusters.items():
    if cluster_id != -1:
        items_set = set(items)
        if ingr in items_set:
          ingr = items[0]
          break

  composed_graph.add_edge(ingr, sub, label='has_sub')

In [45]:
net = Network(notebook=True, cdn_resources='in_line', directed=True)
net.from_nx(nx_graph=composed_graph)
net.save_graph('composed.html')

In [46]:
# with open('composed.html', 'r') as file:
#     html_content = file.read()

# display(HTML(html_content))

In [47]:
nx.write_graphml(composed_graph, 'graph.graphml')

In [48]:
nodes_df = pd.DataFrame.from_dict(dict(composed_graph.nodes(data=True)), orient='index').reset_index()
nodes_df.columns = ['Node'] + list(nodes_df.columns[1:])
edges_df = nx.to_pandas_edgelist(composed_graph)

In [49]:
nodes_df

Unnamed: 0,Node,size
0,,10
1,,10
2,Mexican,10
3,Spanish,10
4,artifical,10
...,...,...
9933,zuckerhut,10
9934,zungenwurst,10
9935,zwieback,10
9936,écrevisse,10


In [50]:
edges_df.query('source=="penne"')

Unnamed: 0,source,target,width,label
14731,penne,mostaccioli,1,has_sub
14732,penne,ziti,1,has_sub
14733,penne,rigatoni,1,has_sub
14734,penne,macaroni,1,has_sub
14735,penne,elicoidali,1,has_sub
14736,penne,ditali,1,has_sub
14737,penne,fusilli,1,has_sub
14738,penne,Penne lisce (has smooth walls),1,also_known_as
14739,penne,Penne rigate (has ridges,1,also_known_as
14740,penne,the better to hold sauces),1,also_known_as


In [51]:
with open('/content/drive/MyDrive/VNP_PROJECT/graph.graphml', 'r') as file:
  gml_string = file.read()

In [52]:
graph = nx.parse_graphml(gml_string)

In [53]:
nodes_df = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index').reset_index()
nodes_df.columns = ['Node'] + list(nodes_df.columns[1:])
edges_df = nx.to_pandas_edgelist(graph)

del nodes_df['size']
del edges_df['width']

In [54]:
nodes_df

Unnamed: 0,Node
0,
1,
2,Mexican
3,Spanish
4,artifical
...,...
9933,zuckerhut
9934,zungenwurst
9935,zwieback
9936,écrevisse


In [55]:
edges_df

Unnamed: 0,source,target,label
0,Worlds Best Mac and Cheese,all - purpose flour,has_ingr
1,Worlds Best Mac and Cheese,dark chocolate chips,has_ingr
2,all - purpose flour,Instant flour,has_sub
3,all - purpose flour,all-purpose flour,has_sub
4,all - purpose flour,amaranth flour,has_sub
...,...,...,...
18937,beef bouillon cube,onion soup mix,has_sub
18938,beef bouillon cube,red miso,has_sub
18939,beef bouillon cube,beef base,has_sub
18940,beef bouillon cube,beef bouillon cubes,has_sub


In [56]:
list_of_recipes_from_df = set(edges_df.query('label=="has_ingr"')['source'].values)

In [57]:
nodes_df['type'] = nodes_df['Node'].apply(lambda elem: 'recipe' if elem in list_of_recipes_from_df else 'ingr')

In [58]:
nodes_df = nodes_df.sort_values(by='type')

In [59]:
ingr_counts = nodes_df['type'].value_counts()['ingr']
recipe_counts = nodes_df['type'].value_counts()['recipe']

In [60]:
nodes_df['index'] = list(range(ingr_counts)) + list(range(recipe_counts))

In [61]:
nodes_df

Unnamed: 0,Node,type,index
0,,ingr,0
6620,manila bean,ingr,1
6621,manila clams,ingr,2
6622,manioc,ingr,3
6623,manioc flour,ingr,4
...,...,...,...
1404,Kombu Tea Grilled Chicken Thigh,recipe,990
1421,Lady Bird Johnson's Spoon Bread,recipe,991
1427,Lamb Chops With Garlic and Herb Crust,recipe,992
1429,Lamb Curry,recipe,993


In [62]:
edges_df['label'] = edges_df['label'].astype(str)

In [63]:
edges_df.query("label=='nan'")

Unnamed: 0,source,target,label


In [64]:
ingrs = nodes_df.query('type=="ingr"')['Node'].values.tolist()

In [65]:
ingrs[:10]

['',
 'manila bean',
 'manila clams',
 'manioc',
 'manioc flour',
 'manoomin',
 'manti',
 'manzana chile',
 'manzana chili',
 'mape']

In [66]:
data = HeteroData()

In [67]:
ingrs_emb = list(map(get_embedding_from_model, ingrs))

In [68]:
len(ingrs_emb[1])

768

In [69]:
data['ingr'].x = Tensor(np.array(ingrs_emb)).to(dtype=torch.float32)

In [70]:
recipes_df = nodes_df.query('type=="recipe"')['Node'].values.tolist()

In [71]:
data['recipe'].x = Tensor(np.array(list(map(get_embedding_from_model, recipes_df)))).to(dtype=torch.float32)

In [72]:
node_to_id_dict = {
    k: v for k,v in nodes_df[['Node', 'index']].values
}

In [73]:
edges_df['index'] = edges_df['source'].apply(lambda node: node_to_id_dict[node])
edges_df['index_target'] = edges_df['target'].apply(lambda node: node_to_id_dict[node])

In [74]:
merged_df = edges_df

In [75]:
print(merged_df['index'].max())
print(merged_df['index'].min())

8942
0


In [76]:
def get_edge_index_from_label_type(label: str) -> np.array:
  return merged_df[['index', 'label', 'index_target']] \
    .query('label==' + '"' + label + '"')[['index', 'index_target']] \
    .values.reshape(2, -1)

In [77]:
get_edge_index_from_label_type('has_sub')

array([[5851, 6536, 5851, ..., 7406, 2146, 7406],
       [3014, 7406, 8005, ..., 8413, 4405, 4385]])

In [78]:
data['recipe', 'has_ingr', 'ingr'].edge_index = Tensor(get_edge_index_from_label_type('has_ingr')).to(dtype=torch.int64)
data['ingr', 'also_known_as', 'ingr'].edge_index =  Tensor(get_edge_index_from_label_type('also_known_as')).to(dtype=torch.int64)
data['ingr', 'has_sub', 'ingr'].edge_index =  Tensor(get_edge_index_from_label_type('has_sub')).to(dtype=torch.int64)

In [79]:
data['recipe']['num_nodes'] = data['recipe'].x.shape[0]
data['ingr']['num_nodes'] = data['ingr'].x.shape[0]

In [80]:
data

HeteroData(
  ingr={
    x=[8943, 768],
    num_nodes=8943,
  },
  recipe={
    x=[995, 768],
    num_nodes=995,
  },
  (recipe, has_ingr, ingr)={ edge_index=[2, 2393] },
  (ingr, also_known_as, ingr)={ edge_index=[2, 5618] },
  (ingr, has_sub, ingr)={ edge_index=[2, 10931] }
)

## Train and Eval GNN (RecSys)

In [81]:
import torch
from torch.nn.functional import mse_loss, binary_cross_entropy_with_logits
from torch_geometric.nn import to_hetero
from torch_geometric.nn import Linear, SAGEConv
from torch.optim import SGD
from torch_geometric.datasets import AmazonBook
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics import classification_report, roc_auc_score
from torch_geometric.utils import negative_sampling
from torch.utils.data import DataLoader
from torch_geometric.nn import LightGCN
from torch.optim import Adam

In [82]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels, start, to):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

        self.start = start
        self.to = to

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict[self.start][row], z_dict[self.to][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
  def __init__(self, hidden_channels, data, start, to):
      super().__init__()
      self.encoder = GNNEncoder(hidden_channels, hidden_channels)
      self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
      self.decoder = EdgeDecoder(hidden_channels=hidden_channels, start=start, to=to)

  def forward(self, x_dict, edge_index_dict, edge_label_index):
      z_dict = self.encoder(x_dict, edge_index_dict)
      return self.decoder(z_dict, edge_label_index)


In [83]:
def train_link_prediction(model, train_data, val_data, optimizer, start, to, epochs=5):
    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        pred = model(train_data.x_dict, train_data.edge_index_dict,
                     train_data[start,'has_sub', to].edge_label_index)

        target = train_data[start,'has_sub', to].edge_label
        loss = mse_loss(pred, target)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.inference_mode():
          pred = model(val_data.x_dict, val_data.edge_index_dict,
                      val_data[start,'has_sub', to].edge_label_index)
        pred = pred.clamp(min=0, max=1)
        target = val_data[start,'has_sub', to].edge_label.float()
        val_loss = mse_loss(pred, target).sqrt()

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

def test_link_prediction(model, test_data, start, to):
    model = model.to(device)
    model.eval()

    with torch.inference_mode():
        pred = model(test_data.x_dict, test_data.edge_index_dict,
                     test_data[start, 'has_sub', to].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = test_data[start, 'has_sub', to].edge_label.float()

    y_true = target.cpu().numpy()
    y_pred = pred.round().detach().cpu().numpy()

    print(classification_report(y_true=y_true, y_pred=y_pred, digits=4))

    # Compute ROC AUC score
    roc_auc = roc_auc_score(y_true, y_pred)
    print(f'ROC AUC: {roc_auc:.4f}')

    val_loss = mse_loss(pred, target).sqrt()
    print(f'Loss: {val_loss:.4f}')

In [84]:
# edge_index = data['ingr', 'has_sub', 'ingr'].edge_index
# ingr_num_nodes = data['ingr']['num_nodes']
# # (ingr, has_sub, ingr)={ edge_index=[2, 10931] }

# neg_edge_index = negative_sampling(edge_index, num_nodes=ingr_num_nodes)

# # Combine positive and negative edges
# edge_label_index = torch.cat([edge_index, neg_edge_index], dim=1)

# # Create labels for edges (1 for existing edges, 0 for non-existent edges)
# # num_pos_edges = edge_index.size(1)
# # num_neg_edges = neg_edge_index.size(1)
# # edge_labels = torch.cat([torch.ones(num_pos_edges), torch.zeros(num_neg_edges)], dim=0)

In [85]:
# data['ingr', 'has_sub', 'ingr'].edge_label_index = edge_label_index

In [86]:
data_copy = data.clone()

In [87]:
del data_copy['recipe']
del data_copy['recipe', 'has_ingr', 'ingr']
del data_copy['ingr', 'also_known_as', 'ingr']

In [88]:
data_copy

HeteroData(
  ingr={
    x=[8943, 768],
    num_nodes=8943,
  },
  (ingr, has_sub, ingr)={ edge_index=[2, 10931] }
)

In [89]:
start, forward_relation, to = 'ingr', 'has_sub', 'ingr'

In [90]:
train_val_test_split = RandomLinkSplit(num_val=0.2,
                                      num_test=0.2,
                                      add_negative_train_samples=True,
                                      edge_types=(start, forward_relation, to))

In [91]:
train_data, val_data, test_data = train_val_test_split(data_copy)
model_GNN = Model(hidden_channels=512, data=data_copy, start=start, to=to)
optimizer = SGD(model_GNN.parameters(), lr=0.01)

In [92]:
train_link_prediction(model_GNN.to(device), train_data.to(device), val_data.to(device), optimizer, start, to, 1000)

Epoch: 000, Loss: 0.3912, Val Loss: 0.5698
Epoch: 001, Loss: 0.3202, Val Loss: 0.5250
Epoch: 002, Loss: 0.2689, Val Loss: 0.4901
Epoch: 003, Loss: 0.2318, Val Loss: 0.4637
Epoch: 004, Loss: 0.2054, Val Loss: 0.4444
Epoch: 005, Loss: 0.1869, Val Loss: 0.4306
Epoch: 006, Loss: 0.1741, Val Loss: 0.4207
Epoch: 007, Loss: 0.1652, Val Loss: 0.4135
Epoch: 008, Loss: 0.1587, Val Loss: 0.4079
Epoch: 009, Loss: 0.1537, Val Loss: 0.4035
Epoch: 010, Loss: 0.1496, Val Loss: 0.3996
Epoch: 011, Loss: 0.1460, Val Loss: 0.3962
Epoch: 012, Loss: 0.1429, Val Loss: 0.3931
Epoch: 013, Loss: 0.1399, Val Loss: 0.3902
Epoch: 014, Loss: 0.1372, Val Loss: 0.3875
Epoch: 015, Loss: 0.1345, Val Loss: 0.3849
Epoch: 016, Loss: 0.1320, Val Loss: 0.3825
Epoch: 017, Loss: 0.1297, Val Loss: 0.3801
Epoch: 018, Loss: 0.1274, Val Loss: 0.3780
Epoch: 019, Loss: 0.1252, Val Loss: 0.3759
Epoch: 020, Loss: 0.1231, Val Loss: 0.3739
Epoch: 021, Loss: 0.1212, Val Loss: 0.3721
Epoch: 022, Loss: 0.1193, Val Loss: 0.3703
Epoch: 023,

In [93]:
test_link_prediction(model=model_GNN, test_data=test_data.to(device), start=start, to=to)

              precision    recall  f1-score   support

         0.0     0.9165    0.7681    0.8357      2186
         1.0     0.8004    0.9300    0.8603      2186

    accuracy                         0.8490      4372
   macro avg     0.8584    0.8490    0.8480      4372
weighted avg     0.8584    0.8490    0.8480      4372

ROC AUC: 0.8490
Loss: 0.3389


#LightGCN(RecSys)

In [186]:
# def generate_difficult_negatives(pos_edge_label_index, num_ingrs):
#     num_samples = pos_edge_label_index.size(1)
#     generated = torch.randint(0, num_ingrs, (num_samples,))

#     # Ensure generated indices are not the same as positive indices
#     for i in range(num_samples):
#         while generated[i] in pos_edge_label_index[1]:
#             generated[i] = torch.randint(0, num_ingrs, (1,)).item()

#     return generated


In [187]:
def train_lightgcn(dataset, train_loader, model, optimizer, num_ingrs, epochs=1):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    # Print dataset attributes for debugging
    print("Dataset attributes:", dataset.__dict__)

    for epoch in range(epochs):
        total_loss, total_examples = 0, 0

        for node_ids in train_loader:
            # Use edge_index instead of edge_label_index if it's not present
            pos_edge_label_index = dataset.edge_index[:, node_ids]
            generated = torch.randint(0, num_ingrs, (node_ids.numel(),)).to(device)

            # Ensure generated indices are within bounds
            generated = torch.clamp(generated, 0, num_ingrs - 1)

            neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                                generated],
                                               dim=0)

            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            # Check if any index in edge_label_index exceeds the bounds
            if edge_label_index.max() >= num_ingrs:
                print(f"Warning: Index out of bounds detected in edge_label_index with max value {edge_label_index.max()}")

            optimizer.zero_grad()

            pos_rank, neg_rank = model(dataset.edge_index.to(device), edge_label_index.to(device)).chunk(2)

            loss = model.recommendation_loss(pos_rank, neg_rank, node_id=edge_label_index.unique())
            loss.backward()
            optimizer.step()

            total_loss += float(loss) * pos_rank.numel()
            total_examples += pos_rank.numel()

            print(f'Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}')



In [188]:
# def train_lightgcn(dataset, train_loader, model, optimizer, num_ingrs, epochs=1):
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     model = model.to(device)

#     for epoch in range(epochs):
#         total_loss, total_examples = 0, 0

#         for node_ids in train_loader:
#             pos_edge_label_index = dataset.edge_index[:, node_ids]

#             # Use the new generate_difficult_negatives function here
#             generated = generate_difficult_negatives(pos_edge_label_index, num_ingrs).to(device)

#             neg_edge_label_index = torch.stack([pos_edge_label_index[0], generated], dim=0)
#             edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

#             optimizer.zero_grad()
#             pos_rank, neg_rank = model(dataset.edge_index.to(device), edge_label_index.to(device)).chunk(2)

#             loss = model.recommendation_loss(pos_rank, neg_rank, node_id=edge_label_index.unique())
#             loss.backward()
#             optimizer.step()

#             total_loss += float(loss) * pos_rank.numel()
#             total_examples += pos_rank.numel()

#         print(f'Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}')

In [189]:
data_copy = data_copy.to_homogeneous()
data_copy.to(device)

Data(edge_index=[2, 10931], x=[8943, 768], node_type=[8943], edge_type=[10931])

In [190]:
train_test_split = RandomLinkSplit(num_val=0.2,
                                           num_test=0.2)
train_data, val_data, test_data = train_test_split(data_copy)

In [191]:
data_loader = DataLoader(range(train_data.edge_index.size(1)),
                             shuffle=True,
                             batch_size=64)

In [192]:
num_nodes = data_copy.num_nodes
num_ingrs = data['ingr'].x.shape[0]

In [193]:
num_ingrs

8943

In [194]:
data_copy

Data(edge_index=[2, 10931], x=[8943, 768], node_type=[8943], edge_type=[10931])

In [195]:
model_lightGCN = LightGCN(num_nodes=num_nodes, embedding_dim=512, num_layers=2)  # Increase from 1 to 3 layers

optimizer = Adam(model_lightGCN.parameters(), lr=0.005)

train_lightgcn(data_copy, data_loader, model_lightGCN, optimizer, num_ingrs, 200)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0167
Epoch: 151, Loss: 0.0174
Epoch: 151, Loss: 0.0173
Epoch: 151, Loss: 0.0172
Epoch: 151, Loss: 0.0171
Epoch: 151, Loss: 0.0170
Epoch: 151, Loss: 0.0169
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0167
Epoch: 151, Loss: 0.0169
Epoch: 151, Loss: 0.0169
Epoch: 151, Loss: 0.0169
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0167
Epoch: 151, Loss: 0.0166
Epoch: 151, Loss: 0.0166
Epoch: 151, Loss: 0.0165
Epoch: 151, Loss: 0.0168
Epoch: 151, Loss: 0.0167
Epoch: 151, Loss: 0.0167
Epoch: 151, Loss: 0.0166
Epoch: 151, Loss: 0.0174
Epoch: 151, Loss: 0.0173
Epoch: 151, Loss: 0.0172
Epoch: 151, Loss: 0.0173
Epoch: 151, Loss: 0.0172
Epoch: 151, Loss: 0.0177
Epoch: 151, Loss: 0.0177
Epoch: 151, Loss: 0.0176
Epoch: 151, Loss: 0.0175
Epoch: 151, Loss: 0.0178
Epoch: 151, Loss: 0.0177
Epoch: 151, Loss: 0.0177
Epoch: 151, Loss: 0.0177
Epoch: 151

In [196]:
test_loader = DataLoader(range(test_data.edge_index.size(1)),
                             shuffle=True,
                             batch_size=32)

In [197]:
def test_lightgcn(model, test_loader, num_ingrs):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.eval()

    all_true_labels = []
    all_pred_scores = []

    with torch.no_grad():
        for node_ids in test_loader:
            pos_edge_label_index = data_copy.edge_index[:, node_ids]
            generated = torch.randint(num_ingrs, (node_ids.numel(),)).to(device)
            neg_edge_label_index = torch.stack([pos_edge_label_index[0], generated], dim=0)
            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            pos_rank, neg_rank = model(data_copy.edge_index, edge_label_index).chunk(2)

            true_labels = torch.cat([torch.ones(pos_rank.size(0)), torch.zeros(neg_rank.size(0))], dim=0)
            pred_scores = torch.cat([pos_rank, neg_rank], dim=0).sigmoid()

            all_true_labels.extend(true_labels.cpu().numpy())
            all_pred_scores.extend(pred_scores.cpu().numpy())

    y_true = all_true_labels
    y_pred = [1 if score >= 0.5 else 0 for score in all_pred_scores]

    print(classification_report(y_true=y_true, y_pred=y_pred, digits=4))

    roc_auc = roc_auc_score(y_true, all_pred_scores)
    print(f'ROC AUC: {roc_auc:.4f}')

In [198]:
test_lightgcn(model_lightGCN, test_loader, num_ingrs)

              precision    recall  f1-score   support

         0.0     0.9262    0.6384    0.7558      8745
         1.0     0.7241    0.9491    0.8215      8745

    accuracy                         0.7938     17490
   macro avg     0.8252    0.7938    0.7887     17490
weighted avg     0.8252    0.7938    0.7887     17490

ROC AUC: 0.9433


In [199]:
data_copy.to(device)

Data(edge_index=[2, 10931], x=[8943, 768], node_type=[8943], edge_type=[10931])

In [224]:
data_copy = data.clone()
data_copy.to(device)

HeteroData(
  ingr={
    x=[8943, 768],
    num_nodes=8943,
  },
  recipe={
    x=[995, 768],
    num_nodes=995,
  },
  (recipe, has_ingr, ingr)={ edge_index=[2, 2393] },
  (ingr, also_known_as, ingr)={ edge_index=[2, 5618] },
  (ingr, has_sub, ingr)={ edge_index=[2, 10931] }
)

In [225]:
del data_copy['recipe']
del data_copy['recipe', 'has_ingr', 'ingr']
del data_copy['ingr', 'also_known_as', 'ingr']

In [202]:
edge_index = data_copy['ingr', 'has_sub', 'ingr'].edge_index
edge_index.to(device)

src_index = torch.tensor([123], dtype=torch.int64)
src_index.to(device)
k = 5

In [203]:
edge_index

tensor([[5851, 6536, 5851,  ..., 7406, 2146, 7406],
        [3014, 7406, 8005,  ..., 8413, 4405, 4385]], device='cuda:0')

In [204]:
src_index

tensor([123])

In [205]:
model = model_lightGCN
model.to(device)
data_copy.to(device)
top_k_recommendations = model.recommend(edge_index=edge_index,
                                        src_index=src_index,
                                        k=k,
                                        sorted=True)

In [206]:
top_k_recommendations

tensor([[2184, 3739, 7752, 2407, 1052]], device='cuda:0')

In [207]:
data_copy['ingr'].x

tensor([[-0.1285, -0.1862,  0.6529,  ...,  0.2621,  0.0449, -0.6869],
        [-0.0331, -0.6405,  0.4848,  ...,  0.4790, -0.0670, -0.6441],
        [-0.1069,  0.0093,  0.4815,  ...,  0.3131, -0.5880, -0.8691],
        ...,
        [ 0.1731, -0.5063,  0.2283,  ..., -0.1311, -0.1492, -1.1333],
        [-0.0842, -0.3332,  0.7529,  ...,  0.3031,  0.1545, -0.1913],
        [ 0.0303, -0.4077,  0.5416,  ...,  0.0311,  0.5649, -0.6668]],
       device='cuda:0')

In [208]:
nodes_df.query("type=='ingr'")

Unnamed: 0,Node,type,index
0,,ingr,0
6620,manila bean,ingr,1
6621,manila clams,ingr,2
6622,manioc,ingr,3
6623,manioc flour,ingr,4
...,...,...,...
4865,elvers,ingr,8938
4864,elk,ingr,8939
3593,buckwheat cereal,ingr,8940
4863,elixer longæ vitæ,ingr,8941


In [209]:
for recommendation in top_k_recommendations.detach().cpu().numpy().squeeze():
  print(nodes_df.query(f"type=='ingr' and index=={recommendation}")['Node'].values)

['hazelnut flour']
['rambutan']
['cockle']
['honey']
['mussels']


In [210]:
print(nodes_df.query(f"type=='ingr' and index=={src_index.item()}")['Node'].values)

['malt sugar']


#Top k recommendations for GNN

In [231]:
hidden_channels = 64
src_node_type = 'ingr'  # Ensure consistency with the node type in your data
dst_node_type = 'ingr'  # Same node type for source and destination

# Ensure that num_nodes is explicitly set for the 'ingr' node type
data_copy['ingr'].num_nodes = data_copy.x_dict['ingr'].size(0)

# Initialize your model
model = Model(hidden_channels=hidden_channels, data=data_copy, start=src_node_type, to=dst_node_type)
model.to(device)
data_copy.to(device)

# Get node embeddings using the encoder
z_dict = model.encoder(data_copy.x_dict, data_copy.edge_index_dict)

# Access the number of nodes in the target node type
num_dst_nodes = data_copy[src_node_type].num_nodes

# Generate edge indices for the node with all possible target nodes (for recommendations)
src_node_id = src_index.item()  # Extract the scalar value if src_index is a tensor
possible_target_nodes = torch.arange(num_dst_nodes, device=device)
edge_label_index = torch.stack([torch.full((possible_target_nodes.size(0),), src_node_id, device=device), possible_target_nodes])

# Score the edges
edge_scores = model.decoder(z_dict, edge_label_index)

# Get the top-k recommendations (highest scoring edges)
top_k_indices = edge_scores.topk(k, largest=True).indices
top_k_recommendations_gnn = edge_label_index[:, top_k_indices]


In [235]:
print(top_k_recommendations_gnn)

tensor([[ 123,  123,  123,  123,  123],
        [2902, 8336, 6076, 7287, 1943]], device='cuda:0')


In [238]:
for recommendation in top_k_recommendations_gnn.detach().cpu().numpy().squeeze()[1]:
  print(nodes_df.query(f"type=='ingr' and index=={recommendation}")['Node'].values)

['sorghum starch']
['beef eye round roast']
['Castelvetrano olives']
['caper berry']
['jasmine tea']


In [241]:
print(nodes_df.query(f"type=='ingr' and index=={top_k_recommendations_gnn.detach().cpu().numpy().squeeze()[0][0]}")['Node'].values)

['malt sugar']
