In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
import itertools
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
import pickle

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch import nn
device = torch.device('cuda')

from utils import predict_on_batch, measure_accuracy
from dataset import Dataset, EmbDataset, pad_tensor, Padder
from train import train, train_multitask
from recformer import RecFormer, MiltitaskRecFormer


In [4]:
LoadedSVC = pickle.load(open('LinearSVCModel.sav', 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [5]:
def ToRecipeString(RecipeIDs: list):
    # Removes 0s padded to the recipe in document read
    RecipeList = []
    for i, IDs in enumerate(RecipeIDs):
        R_l = [str(int(v)) for v in IDs if v !=0]
        RecipeList.append(R_l) 
    
    # combines IDs in Recipe as one string
    RecipeStrings = []
    for r in RecipeList:
        RecipeStrings.append(' '.join(r))
        
    return RecipeStrings

In [6]:
#ingredient_name = pd.read_csv('node_ingredient.txt', engine='python', delimiter=',,', header=None)
df = pd.read_csv('train.csv', engine= 'python', sep='\,',  names=list(range(60)))

# Validation data
VLabels = pd.read_csv('validation_classification_answer.csv', engine='python', delimiter=',,', header=None)
VRecipe = pd.read_csv('validation_classification_question.csv', engine= 'python', sep='\,',  names=list(range(59)), dtype='float32')

In [7]:
cuisine_vocab = {cuisine: id for id, cuisine in enumerate(np.unique(VLabels))}
#cuisine_vocab = {'greek': 0, 'filipino': 1, 'indian': 2, 'jamaican': 3, 'spanish': 4, 'italian': 5, 'mexican': 6, 'vietnamese': 7, 'thai': 8, 'southern_us': 9, 'chinese': 10, 'cajun_creole': 11, 'brazilian': 12, 'french': 13, 'japanese': 14, 'irish': 15, 'moroccan': 16, 'korean': 17, 'british': 18, 'russian': 19}
id_to_cus = {y: x for x, y in cuisine_vocab.items()}
print(cuisine_vocab)

{'brazilian': 0, 'british': 1, 'cajun_creole': 2, 'chinese': 3, 'filipino': 4, 'french': 5, 'greek': 6, 'indian': 7, 'irish': 8, 'italian': 9, 'jamaican': 10, 'japanese': 11, 'korean': 12, 'mexican': 13, 'moroccan': 14, 'russian': 15, 'southern_us': 16, 'spanish': 17, 'thai': 18, 'vietnamese': 19}


In [8]:
#IngredientList = (np.squeeze(ingredient_name.values)).tolist()
df1 = df.fillna(0)
df_2 = df1.values.tolist()

# validation data
ValReci = VRecipe.fillna(0)
VRecipes = ValReci.values.tolist()
VLabels = (np.squeeze(VLabels.values)).tolist()
label_ids = [cuisine_vocab[label] for label in VLabels]

In [9]:
TrainRecipeList = []
Cuisines = []
for i, val in enumerate(df_2):
    R_l = [v for v in val if v !=0]
    TrainRecipeList.append(R_l[:-1]) 
    Cuisines.append(R_l[-1])

RecipeStrings = ToRecipeString(TrainRecipeList)
ValRecipeStrings = ToRecipeString(VRecipes)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
matrix_train=vectorizer.fit_transform(RecipeStrings)
matrix_val = vectorizer.transform(ValRecipeStrings)

In [10]:
pred_probs = LoadedSVC.predict_proba(matrix_val)
pred_probs.shape

(7848, 20)

In [11]:
accuracy_score(VLabels, [id_to_cus[id] for id in np.argmax(pred_probs, axis=1)])

0.7804536187563711

# LogReg model

In [12]:
loadedLogR = pickle.load(open('LogRegressionModel.sav', 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [13]:
pred_probs_lr = loadedLogR.predict_proba(matrix_val)
pred_probs_lr.shape

(7848, 20)

# RecFormer

In [14]:
cuisine_vocab_recformer = {'greek': 0, 'filipino': 1, 'indian': 2, 'jamaican': 3, 'spanish': 4, 'italian': 5, 'mexican': 6, 'vietnamese': 7, 'thai': 8, 'southern_us': 9, 'chinese': 10, 'cajun_creole': 11, 'brazilian': 12, 'french': 13, 'japanese': 14, 'irish': 15, 'moroccan': 16, 'korean': 17, 'british': 18, 'russian': 19}
id_to_cus_recformer = {y: x for x, y in cuisine_vocab_recformer.items()}

In [15]:
df = pd.read_csv('validation_classification_question.csv', engine= 'python', sep='\,',  names=list(range(60)))
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_classification_answer.csv', engine= 'python', sep='\,',  names=list(range(60)))
df1 = df.fillna(0)
val_y = df1.values.tolist()

In [16]:
val_ingredients_c = []
val_labels_c = []

for i in range(len(val_x)):
  R_l = [v for v in val_x[i] if v !=0]
  val_ingredients_c.append(list(map(int, R_l[:-1]))) 
  val_labels_c.append(cuisine_vocab[val_y[i][0]])

print(len(val_ingredients_c), len(val_labels_c))

7848 7848


In [17]:
padder = Padder(dim=1, pad_symbol=-1)
validation_dataset_cuisine = Dataset(val_ingredients_c, val_labels_c)
validation_loader_cuisine = DataLoader(dataset=validation_dataset_cuisine, batch_size=1024, collate_fn = padder)

In [18]:
model = MiltitaskRecFormer(num_tokens=6714, num_labels=20, dim_model=128, num_heads=4, num_encoder_layers=3, num_decoder_layers=1, dropout_p=0.3)

In [19]:
model.load_state_dict(torch.load("weights/RecFormer_multitask.pth"))
model.to(device)
print()




In [20]:
model.eval()
pred_probs_recformer = []
with torch.no_grad():
    for batch in validation_loader_cuisine:
        preds = predict_on_batch(model, batch, "cuisine")
        pred_probs_recformer.extend(preds.tolist())
pred_probs_recformer = np.array(pred_probs_recformer)
print(pred_probs_recformer.shape)


(7848, 20)


In [21]:
accuracy_score(VLabels, [id_to_cus_recformer[id] for id in np.argmax(pred_probs_recformer, axis=1)])

0.761085626911315

In [22]:
dim_order = []
for key, value in cuisine_vocab.items():
    dim_order.append(cuisine_vocab_recformer[key])
print(dim_order)
    

[12, 18, 11, 10, 1, 13, 0, 2, 15, 5, 3, 14, 17, 6, 16, 19, 9, 4, 8, 7]


In [23]:
accuracy_score(VLabels, [id_to_cus[id] for id in np.argmax(pred_probs_recformer[:, dim_order], axis=1)])

0.761085626911315

In [24]:
w_max = 0
acc_max = 0
for w in np.linspace(0,1,101):
    pred_probs_stack = w * pred_probs_recformer[:, dim_order] + (1-w) * pred_probs
    acc = accuracy_score(VLabels, [id_to_cus[id] for id in np.argmax(pred_probs_stack, axis=1)])
    if acc > acc_max:
        acc_max = acc
        w_max = w
print(acc_max, w_max)

0.7872069317023446 0.39


In [25]:
pred_probs_stack = w_max * pred_probs_recformer[:, dim_order] + (1 - w_max) * pred_probs
accuracy_score(VLabels, [id_to_cus[id] for id in np.argmax(pred_probs_stack, axis=1)])

0.7872069317023446

# Completion

In [26]:
loadedSVC_completion = pickle.load(open('LinearSVCModel_completion.sav', 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [27]:
VCAns = pd.read_csv('validation_completion_answer.csv', engine='python', delimiter=',,', header=None)
VTrain = pd.read_csv('validation_completion_question.csv', engine= 'python', sep='\,',  names=list(range(58)), dtype='float32')

In [28]:
# completion task training data creation
CompData = []
CompLabel = []
for i, inda in enumerate(TrainRecipeList):
    for a in range(len(inda)):
        compy = inda.copy()
        label = compy.pop(a)
        
        CompData.append(compy)
        CompLabel.append(label)
print(len(CompData), len(CompLabel))
TCompStrings = ToRecipeString(CompData)
VTrain = VTrain.fillna(0)
VCompData = VTrain.values.tolist()
VCompStrings = ToRecipeString(VCompData)
def ConvertLabels(LabelIDs):
    # Conversion of IDs to ingredients
    LReci = [str(int(v)) for v in LabelIDs]
    return LReci
VCAnsL = (np.squeeze(VCAns.values)).tolist() 
VCompLabels = ConvertLabels(VCAnsL)
TCompLabels = ConvertLabels(CompLabel)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# tokenize and build vocab
matrix_train = vectorizer.fit_transform(TCompStrings)
matrix_val = vectorizer.transform(VCompStrings)

253453 253453


In [29]:
pred_probs_completion_svc = loadedSVC_completion.predict_proba(matrix_val)
pred_probs_completion_svc.shape

(7848, 5858)

In [30]:
accuracy_score(VCompLabels, loadedSVC_completion.classes_[np.argmax(pred_probs_completion_svc, axis=1)])

0.1365953109072375

## RecFormer

In [32]:
svc_map = [int(i) for i in loadedSVC_completion.classes_]

In [34]:
df = pd.read_csv('validation_completion_question.csv', engine= 'python', sep='\,',  names=list(range(60)))
df1 = df.fillna(0)
val_x = df1.values.tolist()

df = pd.read_csv('validation_completion_answer.csv', engine= 'python', sep='\,',  names=list(range(60)))
df1 = df.fillna(0)
val_y = df1.values.tolist()

print(len(val_x), len(val_y))

7848 7848


In [35]:
val_ingredients = []
val_labels = []

for i in range(len(val_x)):
  R_l = [v for v in val_x[i] if v !=0]
  val_ingredients.append(list(map(int, R_l[:-1]))) 
  val_labels.append(int(val_y[i][0]))

print(len(val_ingredients), len(val_labels))

7848 7848


In [36]:
validation_dataset_ingredients = Dataset(val_ingredients, val_labels)
validation_loader_ingredients = DataLoader(dataset=validation_dataset_ingredients, batch_size=64, collate_fn = padder)

In [37]:
model.eval()
completion_preds = []
with torch.no_grad():
    for batch in validation_loader_ingredients:
        preds = predict_on_batch(model, batch, "ingredients")
        completion_preds.extend(preds.tolist())
completion_preds = np.array(completion_preds)
print(completion_preds.shape)

(7848, 6714)


In [39]:
print(accuracy_score(VCompLabels, loadedSVC_completion.classes_[np.argmax(completion_preds[:,svc_map], axis=1)]))

0.12436289500509684


In [40]:
w_max = 0
acc_max = 0
completion_preds_mapped = completion_preds[:,svc_map]
for w in np.linspace(0,1,101):
    pred_probs_stack = w * completion_preds_mapped + (1-w) * pred_probs_completion_svc
    acc = accuracy_score(VCompLabels, loadedSVC_completion.classes_[np.argmax(pred_probs_stack, axis=1)])
    if acc > acc_max:
        acc_max = acc
        w_max = w
print(acc_max, w_max)

0.14475025484199797 0.26


In [41]:
pred_probs_stack = w_max * completion_preds_mapped + (1-w_max) * pred_probs_completion_svc
accuracy_score(VCompLabels, loadedSVC_completion.classes_[np.argmax(pred_probs_stack, axis=1)])

0.14475025484199797

# Test predictions generation