In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
import itertools
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
import pickle

In [2]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch import nn
# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from recformer.utils import predict_on_batch, measure_accuracy
from recformer.dataset import Dataset, EmbDataset, pad_tensor, Padder
from recformer.train import train, train_multitask
from recformer.recformer import RecFormer, MiltitaskRecFormer


  from .autonotebook import tqdm as notebook_tqdm


# Classification

## SVC

In [3]:
# SECURITY: unpickling executes arbitrary code -- only load model files that
# come from a trusted source.
# NOTE(review): the saved output of this cell links to scikit-learn's
# model-persistence limitations page, which suggests a version-mismatch warning
# was raised on load -- confirm the installed scikit-learn version.
with open('baselines/LinearSVCModel.sav', 'rb') as model_file:
    LoadedSVC = pickle.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [4]:
def ToRecipeString(RecipeIDs: list):
    """Render zero-padded rows of ingredient IDs as space-separated ID strings.

    Every entry equal to 0 is treated as padding and skipped; each remaining
    value is truncated to ``int`` and written as decimal text.

    NOTE(review): a genuine ingredient whose ID is 0 cannot be told apart from
    padding here and is dropped as well.

    Args:
        RecipeIDs: iterable of rows, each row an iterable of numeric IDs.

    Returns:
        A list with one string per input row, in input order.
    """
    def _row_to_text(row):
        # Skip the padding zeros, then join the surviving IDs with single spaces.
        return ' '.join(str(int(value)) for value in row if value != 0)

    return [_row_to_text(row) for row in RecipeIDs]

In [5]:
#ingredient_name = pd.read_csv('node_ingredient.txt', engine='python', delimiter=',,', header=None)
# The separator is a regex (python engine). It is written as a raw string
# because '\,' in a normal string literal is an invalid escape sequence that
# newer Python versions warn about; the runtime value is unchanged.
# names=range(60) forces a fixed width of 60 columns; shorter rows are padded with NaN.
df = pd.read_csv('train.csv', engine='python', sep=r'\,', names=list(range(60)))

# Validation data
# NOTE(review): delimiter=',,' presumably never occurs in the answer file, so
# each line is read as a single label column -- confirm against the file.
VLabels = pd.read_csv('validation_classification_answer.csv', engine='python', delimiter=',,', header=None)
VRecipe = pd.read_csv('validation_classification_question.csv', engine='python', sep=r'\,', names=list(range(59)), dtype='float32')

In [6]:
# Cuisine name -> integer id, ids assigned in the sorted order returned by np.unique.
cuisine_vocab = {
    name: idx
    for idx, name in enumerate(np.unique(VLabels))
}
#cuisine_vocab = {'greek': 0, 'filipino': 1, 'indian': 2, 'jamaican': 3, 'spanish': 4, 'italian': 5, 'mexican': 6, 'vietnamese': 7, 'thai': 8, 'southern_us': 9, 'chinese': 10, 'cajun_creole': 11, 'brazilian': 12, 'french': 13, 'japanese': 14, 'irish': 15, 'moroccan': 16, 'korean': 17, 'british': 18, 'russian': 19}
# Inverse lookup: integer id -> cuisine name.
id_to_cus = {idx: name for name, idx in cuisine_vocab.items()}
print(cuisine_vocab)

{'brazilian': 0, 'british': 1, 'cajun_creole': 2, 'chinese': 3, 'filipino': 4, 'french': 5, 'greek': 6, 'indian': 7, 'irish': 8, 'italian': 9, 'jamaican': 10, 'japanese': 11, 'korean': 12, 'mexican': 13, 'moroccan': 14, 'russian': 15, 'southern_us': 16, 'spanish': 17, 'thai': 18, 'vietnamese': 19}


In [7]:
#IngredientList = (np.squeeze(ingredient_name.values)).tolist()
# Replace the NaN padding with 0 so later cells can filter it out with `!= 0`,
# then convert the training frame to plain Python rows.
df1 = df.fillna(0)
df_2 = df1.values.tolist()

# validation data
ValReci = VRecipe.fillna(0)
VRecipes = ValReci.values.tolist()
# NOTE: VLabels is rebound here from a DataFrame to a flat list, so this cell
# cannot be re-run without first re-running the cell that reads the CSV.
VLabels = (np.squeeze(VLabels.values)).tolist()
# Integer ids of the validation labels (not referenced again in the visible notebook).
label_ids = [cuisine_vocab[label] for label in VLabels]

In [8]:
# Split every training row into its ingredient IDs and its cuisine label,
# which is the last non-padding cell of the row.
TrainRecipeList = []
Cuisines = []
for row in df_2:
    kept = [cell for cell in row if cell != 0]
    TrainRecipeList.append(kept[:-1])
    Cuisines.append(kept[-1])

RecipeStrings = ToRecipeString(TrainRecipeList)
ValRecipeStrings = ToRecipeString(VRecipes)

# The vectorizer is re-fitted on the training recipes rather than loaded from disk.
# NOTE(review): this assumes the pickled baselines were trained on an identically
# fitted vectorizer (same data, same settings) -- confirm.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of two
# or more characters, so single-digit ingredient IDs would be ignored -- confirm
# this matches how the baselines were trained before changing it.
vectorizer_classification = TfidfVectorizer(ngram_range=(1, 1))
matrix_train = vectorizer_classification.fit_transform(RecipeStrings)
matrix_val = vectorizer_classification.transform(ValRecipeStrings)

In [9]:
# Per-class probabilities of the SVC baseline on the validation TF-IDF matrix.
# NOTE(review): LinearSVC itself has no predict_proba; the pickled object is
# presumably a CalibratedClassifierCV wrapper (imported above) -- verify.
pred_probs = LoadedSVC.predict_proba(matrix_val)
pred_probs.shape

(7848, 20)

In [10]:
# Validation accuracy of the SVC baseline: the arg-max column of each row is
# mapped back to a cuisine name through id_to_cus.
# NOTE(review): assumes the model's column order equals the sorted order used
# to build cuisine_vocab -- confirm against LoadedSVC.classes_.
accuracy_score(
    VLabels,
    [id_to_cus[class_idx] for class_idx in pred_probs.argmax(axis=1)],
)

0.7804536187563711

## LogReg model

In [11]:
# SECURITY: unpickling executes arbitrary code -- only load model files that
# come from a trusted source.
with open('baselines/LogRegressionModel.sav', 'rb') as model_file:
    loadedLogR = pickle.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [12]:
# Per-class probabilities of the logistic-regression baseline on the validation
# TF-IDF matrix. The result is only inspected for its shape here; it is not
# referenced again in the visible part of the notebook.
pred_probs_lr = loadedLogR.predict_proba(matrix_val)
pred_probs_lr.shape

(7848, 20)

## RecFormer

In [13]:
# Cuisine name -> output index used when decoding the RecFormer cuisine head.
# NOTE(review): presumably this is the label order the model was trained with -- verify.
cuisine_vocab_recformer = {
    name: idx
    for idx, name in enumerate([
        'greek', 'filipino', 'indian', 'jamaican', 'spanish',
        'italian', 'mexican', 'vietnamese', 'thai', 'southern_us',
        'chinese', 'cajun_creole', 'brazilian', 'french', 'japanese',
        'irish', 'moroccan', 'korean', 'british', 'russian',
    ])
}
# Inverse lookup: output index -> cuisine name (keys iterate in index order).
id_to_cus_recformer = dict(enumerate(cuisine_vocab_recformer))

In [14]:
# Re-read the validation files for the RecFormer pipeline as 60-column frames.
# The regex separator is written as a raw string because '\,' in a normal
# string literal is an invalid escape sequence; the runtime value is unchanged.
# NOTE: `df` / `df1` are rebound here, replacing the training frames loaded earlier.
df = pd.read_csv('validation_classification_question.csv', engine='python', sep=r'\,', names=list(range(60)))
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_classification_answer.csv', engine='python', sep=r'\,', names=list(range(60)))
df1 = df.fillna(0)
val_y = df1.values.tolist()

In [15]:
# Build the RecFormer inputs (integer ingredient IDs) and labels for the
# cuisine-classification validation set.
val_ingredients_c = []
val_labels_c = []

for row_idx, row in enumerate(val_x):
    non_padding = [cell for cell in row if cell != 0]
    # NOTE(review): `[:-1]` drops the last non-padding cell. An earlier cell
    # reads this same file entirely as float32, so it seems to hold no label
    # column and this may be discarding a real ingredient -- confirm.
    val_ingredients_c.append([int(cell) for cell in non_padding[:-1]])
    # Labels are encoded with cuisine_vocab, not cuisine_vocab_recformer.
    val_labels_c.append(cuisine_vocab[val_y[row_idx][0]])

print(len(val_ingredients_c), len(val_labels_c))

7848 7848


In [16]:
# Load the pre-trained GloVe vectors into a word -> float32 vector dictionary.
# (numpy is already imported in the notebook's import cells, so the redundant
# mid-notebook re-import was removed.)
emb_length = 300
glove_vocab = {}
with open(f'glove.6B/glove.6B.{emb_length}d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_vocab[word] = coefs

In [17]:
# Padding vector: all zeros, created as a torch tensor of length emb_length.
PAD_embedding = torch.zeros(emb_length)
# Unknown-word vector: element-wise mean of every GloVe vector (a numpy array,
# unlike PAD_embedding).
UNK_embedding = np.mean(list(glove_vocab.values()), axis=0)

In [18]:
# Ingredient names, one per line; the position of a name in the file is its ID.
# NOTE(review): read_fwf infers fixed column widths from the first rows of the
# file, so long names or names containing spaces may be truncated or split
# across columns -- confirm that column 0 holds the full names.
df = pd.read_fwf('node_ingredient.csv', header=None)
node_ingredient = df[0].values.tolist()
print(len(node_ingredient))
# Ingredient ID -> ingredient name.
ing_id_to_str = dict(enumerate(node_ingredient))

6714


In [19]:
# Collate function used by every DataLoader below; it is given the zero
# PAD_embedding as its padding symbol.
# NOTE(review): presumably pads each sequence in a batch to a common length along dim 0 -- verify in recformer.dataset.
padder = Padder(dim=0, pad_symbol=PAD_embedding)

# Validation set for the cuisine task, batched in groups of 64.
# NOTE(review): EmbDataset presumably maps ingredient IDs to names via ing_id_to_str and then to GloVe vectors -- verify.
validation_dataset_cuisine = EmbDataset(val_ingredients_c, val_labels_c, glove_vocab, UNK_embedding, ing_id_to_str)
validation_loader_cuisine = DataLoader(dataset=validation_dataset_cuisine, batch_size=64, collate_fn = padder)

In [20]:
# padder = Padder(dim=1, pad_symbol=-1)
# validation_dataset_cuisine = Dataset(val_ingredients_c, val_labels_c)
# validation_loader_cuisine = DataLoader(dataset=validation_dataset_cuisine, batch_size=1024, collate_fn = padder)

In [21]:
# Multitask RecFormer with the hyper-parameters of the stored checkpoint.
# num_tokens equals the ingredient count printed above (6714); dim_model equals emb_length (300).
model = MiltitaskRecFormer(
    num_tokens=6714,
    num_labels=20,
    dim_model=300,
    num_heads=4,
    num_encoder_layers=3,
    num_decoder_layers=1,
    dropout_p=0.3,
    use_pretrained_embeddings=True,
)

In [22]:
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can also be loaded on a machine where `device` is the CPU.
model.load_state_dict(
    torch.load("weights/RecFormer_multitask_emb.pth", map_location=device)
)
model.to(device)
# Ending the cell with print() keeps the notebook from echoing a return value.
print()




In [23]:
# Run the RecFormer cuisine head over the validation set, in evaluation mode
# and without gradient tracking, and collect one output row per recipe.
# NOTE(review): predict_on_batch presumably returns per-class probabilities -- verify.
model.eval()
with torch.no_grad():
    pred_probs_recformer = [
        row
        for batch in validation_loader_cuisine
        for row in predict_on_batch(model, batch, "cuisine").tolist()
    ]
pred_probs_recformer = np.array(pred_probs_recformer)
print(pred_probs_recformer.shape)


(7848, 20)


In [24]:
# RecFormer validation accuracy, decoding arg-max indices with RecFormer's own label order.
accuracy_score(
    VLabels,
    [id_to_cus_recformer[class_idx] for class_idx in pred_probs_recformer.argmax(axis=1)],
)

0.7663098878695209

In [25]:
# For each cuisine in cuisine_vocab order, the RecFormer output column holding
# that cuisine. Indexing RecFormer outputs with [:, dim_order] therefore
# re-arranges their columns into cuisine_vocab order.
dim_order = [cuisine_vocab_recformer[name] for name in cuisine_vocab]
print(dim_order)
    

[12, 18, 11, 10, 1, 13, 0, 2, 15, 5, 3, 14, 17, 6, 16, 19, 9, 4, 8, 7]


In [26]:
# Sanity check: after re-ordering the columns with dim_order and decoding with
# id_to_cus, the accuracy must equal the one obtained with RecFormer's own vocabulary.
accuracy_score(
    VLabels,
    [id_to_cus[class_idx] for class_idx in pred_probs_recformer[:, dim_order].argmax(axis=1)],
)

0.7663098878695209

## Stacking

In [27]:
# Grid search over the blending weight w in [0, 1] (step 0.01):
#   stacked = w * RecFormer + (1 - w) * SVC
# The first weight reaching the highest validation accuracy is kept.
# NOTE: the weight is tuned and scored on the same validation set, so the
# printed accuracy is an optimistic estimate.
w_max, acc_max = 0, 0
for w in np.linspace(0, 1, 101):
    pred_probs_stack = w * pred_probs_recformer[:, dim_order] + (1 - w) * pred_probs
    acc = accuracy_score(
        VLabels,
        [id_to_cus[class_idx] for class_idx in pred_probs_stack.argmax(axis=1)],
    )
    if acc <= acc_max:
        continue
    acc_max, w_max = acc, w
print(acc_max, w_max)

0.7920489296636085 0.31


In [28]:
# Re-compute the blended probabilities with the best weight and confirm the accuracy.
recformer_share = w_max
svc_share = 1 - w_max
pred_probs_stack = recformer_share * pred_probs_recformer[:, dim_order] + svc_share * pred_probs
accuracy_score(
    VLabels,
    [id_to_cus[class_idx] for class_idx in pred_probs_stack.argmax(axis=1)],
)

0.7920489296636085

# Completion

## SVC

In [29]:
# SECURITY: unpickling executes arbitrary code -- only load model files that
# come from a trusted source.
with open('baselines/LinearSVCModel_completion.sav', 'rb') as model_file:
    loadedSVC_completion = pickle.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [30]:
# Completion validation data: answers (read as one column per line) and the
# incomplete recipes (58 float32 columns, NaN-padded).
# The regex separator is written as a raw string because '\,' in a normal
# string literal is an invalid escape sequence; the runtime value is unchanged.
VCAns = pd.read_csv('validation_completion_answer.csv', engine='python', delimiter=',,', header=None)
VTrain = pd.read_csv('validation_completion_question.csv', engine='python', sep=r'\,', names=list(range(58)), dtype='float32')

In [31]:
# completion task training data creation
# Completion-task training data: leave-one-out expansion of every training
# recipe. Each ingredient in turn becomes the label, and the remaining
# ingredients (original order preserved) become the input.
CompData = []
CompLabel = []
for recipe in TrainRecipeList:
    for held_out_pos, held_out in enumerate(recipe):
        CompData.append(recipe[:held_out_pos] + recipe[held_out_pos + 1:])
        CompLabel.append(held_out)
print(len(CompData), len(CompLabel))

TCompStrings = ToRecipeString(CompData)

# Validation questions: NaN padding -> 0, then rows -> ID strings.
VTrain = VTrain.fillna(0)
VCompData = VTrain.values.tolist()
VCompStrings = ToRecipeString(VCompData)
def ConvertLabels(LabelIDs):
    """Convert numeric ingredient IDs to their decimal string form.

    Each value is truncated to ``int`` first, so ``12.0`` becomes ``'12'``.

    Args:
        LabelIDs: iterable of values accepted by ``int()``.

    Returns:
        A list of strings, one per input value, in input order.
    """
    return [str(as_int) for as_int in map(int, LabelIDs)]
# Flatten the single-column answer frame and render all labels as ID strings.
VCAnsL = (np.squeeze(VCAns.values)).tolist() 
VCompLabels = ConvertLabels(VCAnsL)
# Training labels as strings (not referenced again in the visible notebook).
TCompLabels = ConvertLabels(CompLabel)
vectorizer_completion = TfidfVectorizer(ngram_range=(1, 1))

# tokenize and build vocab
# NOTE: matrix_train / matrix_val are overwritten here, replacing the
# classification matrices built earlier. Re-running the classification
# prediction cells after this point would feed them completion features.
matrix_train = vectorizer_completion.fit_transform(TCompStrings)
matrix_val = vectorizer_completion.transform(VCompStrings)

253453 253453


In [32]:
# Per-class probabilities of the completion SVC on the validation questions.
# Columns follow loadedSVC_completion.classes_; the saved output shows 5858
# classes, fewer than the 6714 ingredients in the vocabulary.
pred_probs_completion_svc = loadedSVC_completion.predict_proba(matrix_val)
pred_probs_completion_svc.shape

(7848, 5858)

In [33]:
# Validation accuracy of the completion SVC: arg-max column -> class label via classes_.
accuracy_score(
    VCompLabels,
    loadedSVC_completion.classes_[pred_probs_completion_svc.argmax(axis=1)],
)

0.1365953109072375

## RecFormer

In [34]:
# Integer ingredient ID of every SVC class, in classes_ order. Used to pick the
# matching RecFormer output columns so both models share one column layout.
svc_map = list(map(int, loadedSVC_completion.classes_))

In [35]:
# Re-read the completion validation files for the RecFormer pipeline as
# 60-column frames.
# The regex separator is written as a raw string because '\,' in a normal
# string literal is an invalid escape sequence; the runtime value is unchanged.
# NOTE: val_x / val_y (and df / df1) are rebound here, replacing the
# classification validation data.
df = pd.read_csv('validation_completion_question.csv', engine='python', sep=r'\,', names=list(range(60)))
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_completion_answer.csv', engine='python', sep=r'\,', names=list(range(60)))
df1 = df.fillna(0)
val_y = df1.values.tolist()

print(len(val_x), len(val_y))

7848 7848


In [36]:
# Build the RecFormer inputs (integer ingredient IDs) and integer labels for
# the completion validation set.
val_ingredients = []
val_labels = []

for row_idx, row in enumerate(val_x):
    non_padding = [cell for cell in row if cell != 0]
    # NOTE(review): `[:-1]` drops the last non-padding cell. An earlier cell
    # reads this same file entirely as float32, so it seems to hold no label
    # column and this may be discarding a real ingredient -- confirm.
    val_ingredients.append([int(cell) for cell in non_padding[:-1]])
    val_labels.append(int(val_y[row_idx][0]))

print(len(val_ingredients), len(val_labels))

7848 7848


In [37]:
# Validation set for the ingredient-completion task, batched in groups of 64
# and collated with the `padder` created for the classification loader.
validation_dataset_ingredients = EmbDataset(val_ingredients, val_labels, glove_vocab, UNK_embedding, ing_id_to_str)
validation_loader_ingredients = DataLoader(dataset=validation_dataset_ingredients, batch_size=64, collate_fn = padder)

In [38]:
# validation_dataset_ingredients = Dataset(val_ingredients, val_labels)
# validation_loader_ingredients = DataLoader(dataset=validation_dataset_ingredients, batch_size=64, collate_fn = padder)

In [39]:
# Run the RecFormer ingredient head over the completion validation set, in
# evaluation mode and without gradient tracking; one output row per recipe.
model.eval()
with torch.no_grad():
    completion_preds = [
        row
        for batch in validation_loader_ingredients
        for row in predict_on_batch(model, batch, "ingredients").tolist()
    ]
completion_preds = np.array(completion_preds)
print(completion_preds.shape)

(7848, 6714)


In [40]:
# RecFormer completion accuracy, restricted to the ingredients the SVC knows:
# svc_map selects those columns, and the arg-max position is decoded via classes_.
print(
    accuracy_score(
        VCompLabels,
        loadedSVC_completion.classes_[completion_preds[:, svc_map].argmax(axis=1)],
    )
)

0.12703873598369012


## Stacking

In [41]:
# Grid search over the blending weight w in [0, 1] (step 0.01):
#   stacked = w * RecFormer + (1 - w) * SVC
# RecFormer columns are first aligned to the SVC class order via svc_map.
# NOTE: w_max / acc_max are reused here and overwrite the classification results.
w_max, acc_max = 0, 0
completion_preds_mapped = completion_preds[:, svc_map]
for w in np.linspace(0, 1, 101):
    pred_probs_stack = w * completion_preds_mapped + (1 - w) * pred_probs_completion_svc
    acc = accuracy_score(
        VCompLabels,
        loadedSVC_completion.classes_[pred_probs_stack.argmax(axis=1)],
    )
    if acc <= acc_max:
        continue
    acc_max, w_max = acc, w
print(acc_max, w_max)

0.14742609582059124 0.2


In [42]:
# Re-compute the blended completion scores with the best weight and confirm the accuracy.
recformer_share = w_max
svc_share = 1 - w_max
pred_probs_stack = recformer_share * completion_preds_mapped + svc_share * pred_probs_completion_svc
accuracy_score(
    VCompLabels,
    loadedSVC_completion.classes_[pred_probs_stack.argmax(axis=1)],
)

0.14742609582059124

# Test predictions generation

## Classification

### SVC prediction

In [43]:
# Test recipes for the classification task: 65 float32 columns, NaN padding
# replaced by 0, converted to plain Python rows.
# The regex separator is written as a raw string because '\,' in a normal
# string literal is an invalid escape sequence; the runtime value is unchanged.
test_data_classification = pd.read_csv('test_classification_question.csv', engine='python', sep=r'\,', names=list(range(65)), dtype='float32')
test_data_classification = test_data_classification.fillna(0)
test_data_classification = test_data_classification.values.tolist()


# TF-IDF features from the vectorizer fitted on the training recipes.
test_data_strings_classification = ToRecipeString(test_data_classification)
matrix_test_classification = vectorizer_classification.transform(test_data_strings_classification)

In [44]:
# SVC class probabilities for the classification test set.
test_probs_svc_classification = LoadedSVC.predict_proba(matrix_test_classification)
test_probs_svc_classification.shape

(3924, 20)

### RecFormer prediction

In [45]:
# RecFormer inputs for the classification test set: integer ingredient IDs per recipe.
test_ingredients_classification = []

for row in test_data_classification:
    non_padding = [cell for cell in row if cell != 0]
    # NOTE(review): `[:-1]` drops the last non-padding cell, while the SVC path
    # above uses the full row. The test file is read entirely as float32, so
    # this may be discarding a real ingredient -- confirm.
    test_ingredients_classification.append([int(cell) for cell in non_padding[:-1]])

print(len(test_ingredients_classification))

3924


In [46]:
# Test set for the cuisine task. No labels exist for the test data, so an
# all-zero array is passed in the labels position as a placeholder.
test_dataset_classification = EmbDataset(test_ingredients_classification, np.zeros(len(test_ingredients_classification)), glove_vocab, UNK_embedding, ing_id_to_str)
test_loader_classification = DataLoader(dataset=test_dataset_classification, batch_size=64, collate_fn = padder)

In [47]:
# test_dataset_classification = Dataset(test_ingredients_classification, np.zeros(len(test_ingredients_classification)))
# test_loader_classification = DataLoader(dataset=test_dataset_classification, batch_size=1024, collate_fn = padder)

In [48]:
# RecFormer cuisine-head outputs for the test set (evaluation mode, no gradients).
model.eval()
with torch.no_grad():
    test_probs_recformer_classification = [
        row
        for batch in test_loader_classification
        for row in predict_on_batch(model, batch, "cuisine").tolist()
    ]
test_probs_recformer_classification = np.array(test_probs_recformer_classification)
print(test_probs_recformer_classification.shape)

(3924, 20)


### Stacking

In [49]:
# Blending weight for classification. 0.31 is the value printed by the
# validation grid search above; it is copied by hand because w_max has been
# overwritten by the completion search by this point. Re-tune it whenever
# either model or the validation data changes.
w_cl = 0.31
# RecFormer columns are re-ordered with dim_order to match the SVC / cuisine_vocab layout.
test_probs_stack_classification = w_cl * test_probs_recformer_classification[:, dim_order] + (1 - w_cl) * test_probs_svc_classification
print(test_probs_stack_classification.shape)

(3924, 20)


In [50]:
# Decode the arg-max column of every blended row into a cuisine name.
test_preds_stack_classification = [
    id_to_cus[class_idx]
    for class_idx in test_probs_stack_classification.argmax(axis=1)
]
len(test_preds_stack_classification)

3924

In [51]:
# Write one predicted cuisine per line, without index or header row.
# NOTE: `df` is rebound again here.
df = pd.DataFrame(test_preds_stack_classification)
df.to_csv('test_classification_answer.csv', index=False, header=False)

## Completion

### SVC prediction

In [52]:
# Test recipes for the completion task: 35 float32 columns, NaN padding
# replaced by 0, converted to plain Python rows.
# The regex separator is written as a raw string because '\,' in a normal
# string literal is an invalid escape sequence; the runtime value is unchanged.
test_data_completion = pd.read_csv('test_completion_question.csv', engine='python', sep=r'\,', names=list(range(35)), dtype='float32')
test_data_completion = test_data_completion.fillna(0)
test_data_completion = test_data_completion.values.tolist()


# TF-IDF features from the vectorizer fitted on the leave-one-out training data.
test_data_strings_completion = ToRecipeString(test_data_completion)
matrix_test_completion = vectorizer_completion.transform(test_data_strings_completion)

In [53]:
# SVC class probabilities for the completion test set (columns follow classes_).
test_probs_svc_completion = loadedSVC_completion.predict_proba(matrix_test_completion)
test_probs_svc_completion.shape

(3924, 5858)

### RecFormer prediction

In [54]:
# RecFormer inputs for the completion test set: integer ingredient IDs per recipe.
test_ingredients_completion = []

for row in test_data_completion:
    non_padding = [cell for cell in row if cell != 0]
    # NOTE(review): `[:-1]` drops the last non-padding cell, while the SVC path
    # above uses the full row. The test file is read entirely as float32, so
    # this may be discarding a real ingredient -- confirm.
    test_ingredients_completion.append([int(cell) for cell in non_padding[:-1]])

print(len(test_ingredients_completion))

3924


In [55]:
# Test set for the completion task. No labels exist for the test data, so an
# all-zero array is passed in the labels position as a placeholder.
test_dataset_completion = EmbDataset(test_ingredients_completion, np.zeros(len(test_ingredients_completion)), glove_vocab, UNK_embedding, ing_id_to_str)
test_loader_completion = DataLoader(dataset=test_dataset_completion, batch_size=64, collate_fn = padder)

In [56]:
# test_dataset_completion = Dataset(test_ingredients_completion, np.zeros(len(test_ingredients_completion)))
# test_loader_completion = DataLoader(dataset=test_dataset_completion, batch_size=64, collate_fn = padder)

In [57]:
# RecFormer ingredient-head outputs for the test set (evaluation mode, no gradients).
model.eval()
with torch.no_grad():
    test_probs_recformer_completion = [
        row
        for batch in test_loader_completion
        for row in predict_on_batch(model, batch, "ingredients").tolist()
    ]
test_probs_recformer_completion = np.array(test_probs_recformer_completion)
print(test_probs_recformer_completion.shape)

(3924, 6714)


### Stacking

In [58]:
# Blending weight for completion. 0.2 is the value printed by the validation
# grid search above, copied by hand; re-tune it whenever either model or the
# validation data changes.
w_comp = 0.2
# svc_map selects the RecFormer columns that correspond to the SVC classes, in classes_ order.
test_probs_stack_completion = w_comp * test_probs_recformer_completion[:,svc_map] + (1-w_comp) * test_probs_svc_completion
print(test_probs_stack_completion.shape)

(3924, 5858)


In [59]:
# Decode the arg-max column of every blended row into the SVC's class label
# (the ingredient ID).
test_preds_stack_completion = loadedSVC_completion.classes_[np.argmax(test_probs_stack_completion, axis=1)]
len(test_preds_stack_completion)

3924

In [60]:
# Write one predicted ingredient ID per line, without index or header row.
df = pd.DataFrame(test_preds_stack_completion)
df.to_csv('test_completion_answer.csv', index=False, header=False)