In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch import nn
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from recformer.utils import predict_on_batch, measure_accuracy
from recformer.dataset import Dataset, EmbDataset, pad_tensor, Padder
from recformer.train import train, train_multitask
from recformer.recformer import RecFormer, MiltitaskRecFormer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse train.csv as plain comma-separated text, one recipe per line.
# pd.read_fwf is a fixed-width reader: it infers the column width from the
# first rows only, so a longer line later in the file can be cut off before
# its final field. Reading the lines directly keeps every field intact.
# Rows are ragged lists (no None padding); the last element of each row is
# still the last field of the line.
with open('train.csv', encoding='utf-8') as train_file:
    data = [line.strip().split(',') for line in train_file if line.strip()]
len(data)

23547

In [3]:
from collections import Counter

# Tally how often each cuisine label occurs. The label is the last real field
# of a row: the one just before the first None, or the final element when the
# row contains no None at all.
cusine_counter = Counter()
for recipe in data:
    end = recipe.index(None) if None in recipe else len(recipe)
    cusine_counter[recipe[end - 1]] += 1

# Assign a class index (in first-seen order) to every label that occurs more
# than 100 times; rarer labels are left out of the vocabulary.
cusine_vocab = {}
for cuisine, occurrences in cusine_counter.items():
    if occurrences <= 100:
        continue
    cusine_vocab[cuisine] = len(cusine_vocab)
cusine_num = len(cusine_vocab)

In [4]:
# Show the cuisine -> class-index mapping built in the previous cell.
print(cusine_vocab)

{'greek': 0, 'filipino': 1, 'indian': 2, 'jamaican': 3, 'spanish': 4, 'italian': 5, 'mexican': 6, 'vietnamese': 7, 'thai': 8, 'southern_us': 9, 'chinese': 10, 'cajun_creole': 11, 'brazilian': 12, 'french': 13, 'japanese': 14, 'irish': 15, 'moroccan': 16, 'korean': 17, 'british': 18, 'russian': 19}


Derrick's reading

In [5]:
# Load the ragged training rows into a fixed 61-column frame.
# - sep=',' replaces the regex '\,' (an invalid escape sequence in a normal
#   string literal), so the python engine is no longer required.
# - dtype=object keeps every field as the text read from the file. Missing
#   slots are then filled with the integer 0, which can no longer be confused
#   with a real ingredient id of 0 (that stays the string '0').
df = pd.read_csv('train.csv', sep=',', names=list(range(61)), dtype=object)
df1 = df.fillna(0)
df_2 = df1.values.tolist()

In [6]:
# Split every training row into its ingredient ids and its cuisine class index.
train_ingredients, train_labels = [], []
for row in df_2:
    # Drop the fields equal to 0, the value fillna put into the empty slots.
    fields = [field for field in row if field != 0]
    # Everything but the last remaining field is an ingredient id ...
    train_ingredients.append([int(field) for field in fields[:-1]])
    # ... and the last one is the cuisine name, mapped to its class index.
    train_labels.append(cusine_vocab[fields[-1]])

In [7]:
# Load the classification validation questions and answers as 60-column frames.
# - sep=',' replaces the regex '\,' (an invalid escape sequence in a normal
#   string literal), so the python engine is no longer required.
# - dtype=object keeps every field as text, so the integer 0 used to fill the
#   missing slots cannot be confused with a real ingredient id of 0.
df = pd.read_csv('validation_classification_question.csv', sep=',', names=list(range(60)), dtype=object)
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_classification_answer.csv', sep=',', names=list(range(60)), dtype=object)
df1 = df.fillna(0)
val_y = df1.values.tolist()

In [8]:
val_ingredients_c, val_labels_c = [], []

# Each question row yields its ingredient ids: all fields that are not 0,
# except the last one. The class index is looked up from the first field of
# the matching answer row.
# NOTE(review): fields[:-1] discards the last field of every question row;
# confirm that field is not an ingredient.
for row_idx, question in enumerate(val_x):
    fields = [field for field in question if field != 0]
    val_ingredients_c.append([int(field) for field in fields[:-1]])
    val_labels_c.append(cusine_vocab[val_y[row_idx][0]])

print(len(val_ingredients_c), len(val_labels_c))

7848 7848


In [9]:
# Training-loop settings: batch size for every DataLoader and the epoch count
# passed to train / train_multitask.
batch_size = 64
epochs = 32

# Number of cuisine classes kept in cusine_vocab.
num_labels = len(cusine_vocab)
# Same value as the number of rows read from node_ingredient.csv (6714) —
# presumably the ingredient vocabulary size; keep the two in sync.
num_tokens = 6714
# Remaining values are passed positionally to the RecFormer constructors.
dim_model = 128
num_heads = 4
num_encoder_layers = 3
num_decoder_layers = 1
dropout_p = 0.3

In [11]:
from torch.utils.data import DataLoader

# Classification data: recipes as inputs, cuisine class indices as targets.
padder = Padder(dim=1, pad_symbol=-1)
train_dataset = Dataset(train_ingredients, train_labels)
validation_dataset = Dataset(val_ingredients_c, val_labels_c)

# Both loaders share the batch size and use the Padder instance as collate_fn;
# only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=padder)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
validation_loader = DataLoader(dataset=validation_dataset, **loader_options)

In [12]:
# Classification model: the second constructor argument is num_labels, the
# number of cuisines in cusine_vocab.
model = RecFormer(
    num_tokens,
    num_labels,
    dim_model,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout_p,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train the classification model for `epochs` epochs on `device`, using the
# classification train/validation loaders.
train(model, criterion, optimizer, train_loader, validation_loader, epochs, device=device)

In [None]:
#torch.save(model.state_dict(), "RecFormer_classification.pth")

In [13]:
# Restore the saved classification weights. map_location places the loaded
# tensors on `device`, so the checkpoint also loads when the device it was
# written from is not available.
model.load_state_dict(torch.load("weights/RecFormer_classification.pth", map_location=device))
model.to(device)
# Accuracy on the classification training and validation loaders.
print(measure_accuracy(model, train_loader))
print(measure_accuracy(model, validation_loader))

tensor(0.9040, device='cuda:0')
tensor(0.7429, device='cuda:0')


# Completion task

In [15]:
completion_data, completion_labels = [], []

# Leave-one-out expansion: for every position in every training recipe, emit
# the recipe without that ingredient as the input and the removed ingredient
# as the target.
for recipe in train_ingredients:
    for position, held_out in enumerate(recipe):
        completion_data.append(recipe[:position] + recipe[position + 1:])
        completion_labels.append(held_out)

print(len(completion_data), len(completion_labels))

253453 253453


In [16]:
# Load the completion validation questions and answers as 60-column frames.
# - sep=',' replaces the regex '\,' (an invalid escape sequence in a normal
#   string literal), so the python engine is no longer required.
# - dtype=object keeps every field as text, so the integer 0 used to fill the
#   missing slots cannot be confused with a real ingredient id of 0.
df = pd.read_csv('validation_completion_question.csv', sep=',', names=list(range(60)), dtype=object)
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_completion_answer.csv', sep=',', names=list(range(60)), dtype=object)
df1 = df.fillna(0)
val_y = df1.values.tolist()

print(len(val_x), len(val_y))

7848 7848


In [17]:
val_ingredients, val_labels = [], []

# Inputs: all fields of a question row that are not 0, except the last one,
# converted to int. Target: the first field of the matching answer row,
# converted to int.
for row_idx, question in enumerate(val_x):
    fields = [field for field in question if field != 0]
    val_ingredients.append([int(field) for field in fields[:-1]])
    val_labels.append(int(val_y[row_idx][0]))

print(len(val_ingredients), len(val_labels))

7848 7848


In [17]:
from torch.utils.data import DataLoader

# Completion data: recipes with one ingredient removed as inputs, the removed
# ingredient id as the target.
padder = Padder(dim=1, pad_symbol=-1)
train_dataset = Dataset(completion_data, completion_labels)
validation_dataset = Dataset(val_ingredients, val_labels)

# Both loaders share the batch size and use the Padder instance as collate_fn;
# only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=padder)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
validation_loader = DataLoader(dataset=validation_dataset, **loader_options)

In [18]:
# Completion model: num_tokens is passed as both the first and the second
# constructor argument (the classification model passes num_labels second).
model = RecFormer(
    num_tokens,
    num_tokens,
    dim_model,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout_p,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train the completion model. The device is passed explicitly, exactly as in
# the classification training call, instead of relying on train's default.
train(model, criterion, optimizer, train_loader, validation_loader, epochs, device=device)

In [None]:
#torch.save(model.state_dict(), "RecFormer_completion.pth")

In [19]:
# Restore the saved completion weights. map_location places the loaded
# tensors on `device`, so the checkpoint also loads when the device it was
# written from is not available.
model.load_state_dict(torch.load("weights/RecFormer_completion.pth", map_location=device))
model.to(device)

# Accuracy on the completion training and validation loaders.
print(measure_accuracy(model, train_loader))
print(measure_accuracy(model, validation_loader))

tensor(0.1541, device='cuda:0')
tensor(0.1138, device='cuda:0')


# Multi-task experiments

In [10]:
multitask_data, multitask_labels = [], []

# Same leave-one-out expansion as for the completion task, but every sample
# carries two targets: [cuisine class index, removed ingredient id].
for recipe, cuisine_idx in zip(train_ingredients, train_labels):
    for position, held_out in enumerate(recipe):
        multitask_data.append(recipe[:position] + recipe[position + 1:])
        multitask_labels.append([cuisine_idx, held_out])

print(len(multitask_data), len(multitask_labels))

253453 253453


In [16]:
# Multi-task training data plus one validation set per task.
padder = Padder(dim=1, pad_symbol=-1)
train_dataset = Dataset(multitask_data, multitask_labels)
validation_dataset_cuisine = Dataset(val_ingredients_c, val_labels_c)
validation_dataset_ingredients = Dataset(val_ingredients, val_labels)

# All loaders share the batch size and use the Padder instance as collate_fn;
# only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=padder)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
validation_loader_cuisine = DataLoader(dataset=validation_dataset_cuisine, **loader_options)
validation_loader_ingredients = DataLoader(dataset=validation_dataset_ingredients, **loader_options)

In [23]:
# Multi-task model, built from the same hyperparameters as the single-task
# models.
model = MiltitaskRecFormer(
    num_tokens,
    num_labels,
    dim_model,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout_p,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train on both tasks at once, validating on the cuisine and ingredient
# loaders separately.
# NOTE(review): loss_weights presumably follows the [cuisine, ingredient]
# order of the labels — confirm against train_multitask.
train_multitask(model, criterion, optimizer, train_loader, validation_loader_cuisine, validation_loader_ingredients, epochs, loss_weights=[0.5, 2])

In [None]:
#torch.save(model.state_dict(), "RecFormer_multitask.pth")

In [24]:
# Restore the saved multi-task weights. map_location places the loaded
# tensors on `device`, so the checkpoint also loads when the device it was
# written from is not available.
model.load_state_dict(torch.load("weights/RecFormer_multitask.pth", map_location=device))
model.to(device)
# Accuracy per task: multitask_switch selects which output is evaluated.
print(measure_accuracy(model, validation_loader_cuisine, multitask_switch="cuisine"))
print(measure_accuracy(model, validation_loader_ingredients, multitask_switch="ingredients"))

tensor(0.7608, device='cuda:0')
tensor(0.1244, device='cuda:0')


In [25]:
# Collect the predicted cuisine class index for every sample in the cuisine
# validation loader. The targets are not needed here (they are already in
# val_labels_c), so no per-batch target tensor is moved to the device.
model.eval()
classification_preds = []
with torch.no_grad():
    for batch in validation_loader_cuisine:
        preds = predict_on_batch(model, batch, "cuisine")
        # Index of the highest score along dimension 1 for each row of preds.
        classification_preds.extend(torch.argmax(preds, dim=1).tolist())
print(len(classification_preds))

7848


In [26]:
# Inverse of cusine_vocab: class index -> cuisine name.
id_to_cus = {y: x for x, y in cusine_vocab.items()}

In [27]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix

# Translate class indices back to cuisine names so the report rows are readable.
true_names = [id_to_cus[label] for label in val_labels_c]
predicted_names = [id_to_cus[label] for label in classification_preds]
print(classification_report(true_names, predicted_names))

              precision    recall  f1-score   support

   brazilian       0.70      0.61      0.65        85
     british       0.47      0.43      0.45       161
cajun_creole       0.76      0.66      0.71       295
     chinese       0.78      0.85      0.81       516
    filipino       0.71      0.60      0.65       141
      french       0.55      0.62      0.58       538
       greek       0.68      0.65      0.67       222
      indian       0.85      0.88      0.86       624
       irish       0.50      0.53      0.52       122
     italian       0.83      0.84      0.83      1558
    jamaican       0.69      0.61      0.65       113
    japanese       0.80      0.65      0.71       290
      korean       0.78      0.69      0.74       167
     mexican       0.90      0.90      0.90      1273
    moroccan       0.72      0.76      0.74       160
     russian       0.55      0.51      0.53        92
 southern_us       0.69      0.74      0.71       839
     spanish       0.51    

## Word embeddings multi-task experiment

In [None]:
# Download the GloVe 6B vectors (skipped if the zip is already present) and
# extract them into glove.6B/, the directory the loading cell reads from.
# NOTE(review): assumes the archive stores its .txt files at the top level — confirm.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip -d glove.6B

In [11]:
import numpy as np

emb_length = 100
glove_vocab = {}
glove_path = 'glove.6B/glove.6B.{}d.txt'.format(emb_length)

# Each line is split on whitespace: the first piece is the word, the remaining
# pieces are parsed as its float32 vector.
with open(glove_path, encoding='utf-8') as glove_file:
    for entry in glove_file:
        pieces = entry.split()
        glove_vocab[pieces[0]] = np.asarray(pieces[1:], dtype='float32')

In [12]:
# PAD_embedding is an all-zero torch tensor of length emb_length;
# UNK_embedding is the element-wise mean of all GloVe vectors (a NumPy array).
PAD_embedding = torch.zeros(emb_length)
UNK_embedding = np.asarray(list(glove_vocab.values())).mean(axis=0)

In [13]:
# Read one entry per non-blank line. pd.read_fwf is a fixed-width reader: it
# infers column boundaries from the first rows only, so a longer entry later
# in the file can be truncated, and aligned whitespace can split an entry
# into several columns. Reading the lines directly keeps each entry whole.
with open('node_ingredient.csv', encoding='utf-8') as ingredient_file:
    node_ingredient = [line.strip() for line in ingredient_file if line.strip()]
print(len(node_ingredient))
# The position of an entry in the file is used as its ingredient id.
ing_id_to_str = dict(enumerate(node_ingredient))

6714


In [18]:
# For the embedding datasets the Padder is built with dim=0 and the zero
# vector as the pad symbol.
padder = Padder(dim=0, pad_symbol=PAD_embedding)

# The three embedding datasets share the same trailing lookup arguments.
embedding_args = (glove_vocab, UNK_embedding, ing_id_to_str)
train_dataset = EmbDataset(multitask_data, multitask_labels, *embedding_args)
validation_dataset_cuisine = EmbDataset(val_ingredients_c, val_labels_c, *embedding_args)
validation_dataset_ingredients = EmbDataset(val_ingredients, val_labels, *embedding_args)

# All loaders share the batch size and collate_fn; only training shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=padder)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
validation_loader_cuisine = DataLoader(dataset=validation_dataset_cuisine, **loader_options)
validation_loader_ingredients = DataLoader(dataset=validation_dataset_ingredients, **loader_options)

In [19]:
# Multi-task model fed with pretrained embeddings. The third argument used to
# be the literal 100, the same value as emb_length; tying it to emb_length
# keeps the two from drifting apart if another GloVe size is loaded.
model = MiltitaskRecFormer(num_tokens, num_labels, emb_length, num_heads, num_encoder_layers, num_decoder_layers, dropout_p, use_pretrained_embeddings=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the embedding-based multi-task model; note the loss weights differ
# from the earlier multi-task run ([1, 2] here versus [0.5, 2]).
train_multitask(model, criterion, optimizer, train_loader, validation_loader_cuisine, validation_loader_ingredients, epochs, loss_weights=[1,2])

In [21]:
# Evaluate both tasks. Unlike the earlier evaluation cells, no checkpoint is
# loaded here, so this measures whatever weights `model` currently holds.
model.to(device)
print(measure_accuracy(model, validation_loader_cuisine, multitask_switch="cuisine"))
print(measure_accuracy(model, validation_loader_ingredients, multitask_switch="ingredients"))

tensor(0.7598, device='cuda:0')
tensor(0.1095, device='cuda:0')
