In [1]:
import pandas as pd
import pickle
import numpy as np
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tqdm import tqdm
import tensorflow as tf
import random
import os
from pathlib import Path
from tqdm import tqdm

In [2]:
def get_average_embeddings(path):
    """Read one pickled bundle and hand back its first three entries.

    The unpickled object is indexed positionally: index 0 and index 1 are
    returned as the two views, index 2 as the ids.

    Args:
        path: Location of the pickle file to read.

    Returns:
        Tuple ``(view1, view2, ids)`` built from indices 0, 1 and 2 of the
        unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, 'rb') as handle:
        bundle = pickle.load(handle)

    return (bundle[0], bundle[1], bundle[2])
    

def get_other_embeddings(path_view1, path_view2):
    """Combine entry 0 of two pickled bundles with the ids of the first one.

    Args:
        path_view1: Pickle whose index 0 becomes the first view and whose
            index 2 supplies the ids.
        path_view2: Pickle whose index 0 becomes the second view.

    Returns:
        Tuple ``(view1, view2, ids)``.
    """
    def _unpickle(location):
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with open(location, 'rb') as handle:
            return pickle.load(handle)

    first_bundle = _unpickle(path_view1)
    shared_ids = first_bundle[2]
    second_bundle = _unpickle(path_view2)

    return (first_bundle[0], second_bundle[0], shared_ids)
        

def map_classes(ids_to_keep, classes_path='/common/home/aj780/machine_learning/CCA Data/classes1M.pkl'):
    """Build id -> class and class -> ids lookups restricted to the given ids.

    Args:
        ids_to_keep: Collection of ids to retain. Anything that is not already
            a set/frozenset/dict is copied into a set so that each membership
            test is a hash lookup instead of a linear scan.
        classes_path: Pickle file holding a mapping from id to class. Defaults
            to the location this notebook has always used.

    Returns:
        Tuple ``(id2classes, classes2id, all_classes)`` where
        ``id2classes`` maps each retained id to its class,
        ``classes2id`` maps each class to the list of retained ids in it, and
        ``all_classes`` is the list of classes that have at least one
        retained id.
    """
    if not isinstance(ids_to_keep, (set, frozenset, dict)):
        ids_to_keep = set(ids_to_keep)

    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(classes_path, 'rb') as f:
        stored_id2classes = pickle.load(f)

    id2classes = {k: v for k, v in stored_id2classes.items() if k in ids_to_keep}

    # Invert the mapping; ids keep the order in which they appear in id2classes.
    classes2id = {}
    for id_, class_ in id2classes.items():
        classes2id.setdefault(class_, []).append(id_)

    all_classes = list(classes2id)

    return id2classes, classes2id, all_classes
    
    
    

In [3]:
def triplet_pairs(embeds_view1, embeds_view2, id2classes, classes2id, all_classes):
    """Assemble (anchor, positive, negative) embedding triplets.

    For every id in ``embeds_view1`` the anchor is its view-1 embedding, the
    positive is the view-2 embedding of the same id, and the negative is the
    view-2 embedding of a random id drawn from a uniformly chosen *different*
    class.

    Args:
        embeds_view1: Mapping id -> view-1 embedding (anchors).
        embeds_view2: Mapping id -> view-2 embedding (positives/negatives).
        id2classes: Mapping id -> class.
        classes2id: Mapping class -> list of ids belonging to it.
        all_classes: Iterable of the classes negatives may be drawn from.
            It is only read; the caller's object is never modified.

    Returns:
        Tuple ``(anchors, positives, negatives)`` of ``np.array`` objects, one
        row per id in ``embeds_view1`` (in its iteration order).

    Raises:
        ValueError: If an anchor's class is missing from ``all_classes`` or
            there is no other class to draw a negative from.
    """
    # Work on a private, de-duplicated snapshot so the caller's list keeps its
    # content and order, and so excluding the anchor's class is O(1).
    class_pool = list(dict.fromkeys(all_classes))
    slot_of = {class_: slot for slot, class_ in enumerate(class_pool)}
    n_classes = len(class_pool)

    anchors = []
    positives = []
    negatives = []
    for id_, anchor in embeds_view1.items():
        class_id = id2classes[id_]
        if class_id not in slot_of:
            raise ValueError("class %r of id %r is not in all_classes" % (class_id, id_))
        if n_classes < 2:
            raise ValueError("need at least two classes to sample a negative")

        # Draw uniformly from the n-1 other classes by skipping the own slot.
        pick = random.randrange(n_classes - 1)
        if pick >= slot_of[class_id]:
            pick += 1
        negative_class = class_pool[pick]
        negative_id = random.choice(classes2id[negative_class])

        anchors.append(anchor)
        positives.append(embeds_view2[id_])
        negatives.append(embeds_view2[negative_id])

    return np.array(anchors), np.array(positives), np.array(negatives)

In [4]:
def get_data(view1_path, view2_path):
    """Load both views and turn them into triplet training arrays.

    Args:
        view1_path: Pickle providing the first view (and the ids). When
            ``view2_path`` is ``None`` it provides the second view as well.
        view2_path: Pickle providing the second view, or ``None`` to take both
            views from ``view1_path``.

    Returns:
        Tuple ``(ids, anchors, positives, negatives)`` where ``ids`` is the id
        collection exactly as loaded and the three arrays come from
        ``triplet_pairs`` (anchors from view 1, positives and negatives from
        view 2).
    """
    if view2_path is not None:
        loaded = get_other_embeddings(view1_path, view2_path)
    else:
        loaded = get_average_embeddings(view1_path)
    image_temp_data, text_temp_data, ids = loaded

    id2classes, classes2id, all_classes = map_classes(set(ids))

    # id -> embedding lookups; a repeated id keeps its last embedding.
    embeds_view1 = dict(zip(ids, image_temp_data))
    embeds_view2 = dict(zip(ids, text_temp_data))

    anchors, positives, negatives = triplet_pairs(
        embeds_view1, embeds_view2, id2classes, classes2id, all_classes
    )

    return ids, anchors, positives, negatives
    



In [5]:
def triplet_cca_loss(latent_size, eps):
    """Create a hinge-style triplet loss over a concatenated prediction.

    Args:
        latent_size: Width of each of the three embedding blocks inside
            ``y_pred``.
        eps: Constant added to ``positive_dist - negative_dist`` before
            clamping at zero (acts as the margin).

    Returns:
        A function ``(y_true, y_pred) -> per-row loss`` suitable for
        ``model.compile(loss=...)``.
    """
    def inner_triplet_cca_loss(y_true, y_pred):
        # y_true is ignored; everything needed lives in y_pred, laid out as
        # [anchor | positive | negative] along axis 1.
        first_cut = latent_size
        second_cut = 2 * latent_size
        third_cut = 3 * latent_size

        anchor_block = y_pred[:, 0:first_cut]
        positive_block = y_pred[:, first_cut:second_cut]
        negative_block = y_pred[:, second_cut:third_cut]

        def squared_distance(other_block):
            # Squared Euclidean distance to the anchor, one value per row.
            return K.sum(K.square(anchor_block - other_block), axis=1)

        hinge_input = squared_distance(positive_block) - squared_distance(negative_block) + eps
        return K.maximum(hinge_input, 0.)
    return inner_triplet_cca_loss
        
        
        
        
    

In [6]:
def get_base_model(layer_sizes, reg_lambda):
    """Stack Dense layers into a Sequential encoder.

    Every layer uses ReLU except the final one, which is linear. Each layer
    gets its own L2 kernel regulariser with strength ``reg_lambda``.

    Args:
        layer_sizes: Unit count of each Dense layer, in order.
        reg_lambda: L2 regularisation factor.

    Returns:
        The assembled ``keras.models.Sequential`` (not yet built or compiled).
    """
    network = keras.models.Sequential()
    last_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
        nonlinearity = 'linear' if position == last_position else 'relu'
        network.add(
            keras.layers.Dense(
                units,
                activation=nonlinearity,
                kernel_regularizer=keras.regularizers.l2(reg_lambda),
            )
        )

    return network

def get_model(layer_sizes, input_size1, input_size2, input_size3, reg_lambda):
    """Build the three-input network that shares one encoder.

    The same base model instance is applied to all three inputs, so its
    weights are shared; the three encodings are concatenated into one output.

    Args:
        layer_sizes: Layer widths forwarded to ``get_base_model``.
        input_size1: Feature count of the first input.
        input_size2: Feature count of the second input.
        input_size3: Feature count of the third input.
        reg_lambda: L2 factor forwarded to ``get_base_model``.

    Returns:
        A ``keras.Model`` taking ``[input1, input2, input3]`` and producing
        the concatenated encodings.
    """
    # Inputs are created before the encoder, in argument order.
    input_tensors = [
        keras.layers.Input(shape=(width,))
        for width in (input_size1, input_size2, input_size3)
    ]

    shared_encoder = get_base_model(layer_sizes, reg_lambda)
    encoded = [shared_encoder(tensor) for tensor in input_tensors]

    merged = keras.layers.Concatenate()(encoded)
    return keras.Model(inputs=input_tensors, outputs=merged)

def train_model(output_dir, view1_train_data, view2_train_data, view3_train_data, view1_val_data, view2_val_data, view3_val_data,
                layer_sizes, latent_size, input_size1, input_size2, input_size3, learning_rate, margin, epochs, batch_size, reg_lambda):
    """Build, compile and fit the triplet network.

    Args:
        output_dir: Directory (as a string) that receives ``model.h5``.
        view1_train_data, view2_train_data, view3_train_data: Training inputs
            fed to the three model inputs, in that order.
        view1_val_data, view2_val_data, view3_val_data: Validation inputs, in
            the same order.
        layer_sizes: Encoder layer widths.
        latent_size: Width of each embedding block, used by the loss to slice
            the concatenated output.
        input_size1, input_size2, input_size3: Feature counts of the inputs.
        learning_rate: SGD learning rate.
        margin: Margin handed to ``triplet_cca_loss``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        reg_lambda: L2 regularisation factor.

    Returns:
        The fitted model object as it stands after ``fit`` returns. The
        checkpoint callback separately writes ``model.h5`` with
        ``save_best_only=True``; that file is not reloaded here.
    """
    network = get_model(layer_sizes, input_size1, input_size2, input_size3, reg_lambda)
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)
    network.compile(loss=triplet_cca_loss(latent_size, margin), optimizer=optimizer)

    checkpoint_path = output_dir + "/" + 'model.h5'
    save_best = keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, save_best_only=True, save_weights_only=False
    )

    # The loss ignores y_true, so all-zero targets of matching length are used.
    train_inputs = [view1_train_data, view2_train_data, view3_train_data]
    train_targets = np.zeros(len(view1_train_data))
    val_inputs = [view1_val_data, view2_val_data, view3_val_data]
    val_targets = np.zeros(len(view1_val_data))

    network.fit(
        x=train_inputs,
        y=train_targets,
        validation_data=(val_inputs, val_targets),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[save_best],
    )

    return network
    

In [7]:
def predict_model(model, view1_test_data, view2_test_data, view3_test_negatives):
    """Run the model and split off the first two embedding blocks.

    Args:
        model: Object exposing ``predict`` that accepts a list of three inputs
            and returns a 2-D array.
        view1_test_data: First model input.
        view2_test_data: Second model input.
        view3_test_negatives: Third model input; its block of the output is
            not returned.

    Returns:
        Tuple ``(image_embeds, text_embeds)``: the first and second thirds of
        the prediction's columns.
    """
    stacked = model.predict([view1_test_data, view2_test_data, view3_test_negatives])

    # Width of one block; any remainder columns beyond 3 * block are ignored.
    block = int(stacked.shape[1] / 3)

    return (stacked[:, 0:block], stacked[:, block:2 * block])
    
    

In [8]:
def evaluation(data_X, data_Y, data_ids, im2recipe, samples_to_draw, time_sample=10):
    """Estimate median rank and recall@{1,5,10} for cross-view retrieval.

    The paired rows of ``data_X`` / ``data_Y`` are ordered by id, then
    ``time_sample`` times a random subset of ``samples_to_draw`` pairs is
    drawn. Within each subset every query is scored against all candidates
    with a dot product, and the rank of its true partner is recorded.

    Args:
        data_X: 2-D array-like of image embeddings, one row per id.
        data_Y: 2-D array-like of text embeddings, row-aligned with data_X.
        data_ids: Ids, row-aligned with the embeddings. Any array-like works.
        im2recipe: If true, images are the queries and texts the candidates;
            otherwise the roles are swapped.
        samples_to_draw: Number of pairs per random subset.
        time_sample: Number of random subsets to average over.

    Returns:
        Tuple ``(mean_of_median_ranks, recall)`` where ``recall`` maps each of
        1, 5 and 10 to the recall at that cutoff averaged over the subsets.

    Raises:
        ValueError: If ``time_sample`` is smaller than 1 or
            ``samples_to_draw`` is not within ``1..len(data_ids)``.
    """
    # Accept plain lists as well as arrays.
    data_ids = np.asarray(data_ids)
    data_X = np.asarray(data_X)
    data_Y = np.asarray(data_Y)

    total = len(data_ids)
    if time_sample < 1:
        raise ValueError("time_sample must be at least 1")
    if not 0 < samples_to_draw <= total:
        raise ValueError("samples_to_draw must be between 1 and %d" % total)

    order = np.argsort(data_ids)
    image_vecs = data_X[order]
    text_vecs = data_Y[order]

    cutoffs = (1, 5, 10)
    # Column vector of row numbers: the true partner of query r is candidate r.
    partner = np.arange(samples_to_draw)[:, None]

    glob_rank = []
    glob_recall = {cutoff: 0.0 for cutoff in cutoffs}
    for _ in range(time_sample):
        ids = random.sample(range(0, total), samples_to_draw)
        im_sub = image_vecs[ids, :]
        instr_sub = text_vecs[ids, :]

        # NOTE(review): similarity is a raw (unnormalised) dot product while
        # triplet_cca_loss in this notebook uses squared Euclidean distance;
        # confirm this ranking metric is the intended one.
        if im2recipe:
            sims = np.dot(im_sub, instr_sub.T)  # for im2recipe
        else:
            sims = np.dot(instr_sub, im_sub.T)  # for recipe2im

        # Candidates per query in descending similarity, then the 1-based
        # position at which each query's own partner shows up.
        ranking = np.argsort(sims, axis=1)[:, ::-1]
        ranks = np.argmax(ranking == partner, axis=1) + 1

        for cutoff in cutoffs:
            glob_recall[cutoff] += np.count_nonzero(ranks <= cutoff) / samples_to_draw
        glob_rank.append(np.median(ranks))

    for cutoff in cutoffs:
        glob_recall[cutoff] = glob_recall[cutoff] / time_sample

    return np.average(glob_rank), glob_recall
    

### Running models on average embeddings

In [9]:
# Experiment: sweep the latent size on the "average" embeddings and record
# im2recipe / recipe2im retrieval metrics for every size.
#
# get_data is called with view2_path=None, so both views are read from the
# same pickle. It also draws the negatives with the `random` module; no seed
# is set in the visible cells, so the triplets differ between runs.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl', None)

print("Train Data reading done..!")
val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl', None)
print("Val Data reading done..!")

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl', None)
print("Test Data reading done..!")




# Latent sizes to try; `i` is the row of the metric tables filled next.
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
i = 0
# One pre-allocated row per latent size, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
metric_data_recipe2im = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
embeddings_type = "averageEmbeddings"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type 


# Hyperparameters held fixed during the sweep.
learning_rate = 0.001
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
for n_comp in n_components:
    
    # A single Dense layer whose width is the latent size.
    layer_sizes = [n_comp]
    model_out_dir = embeddings_dir +  "/" + str(n_comp)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)
    
    
    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives, 
                       image_val_data, text_val_data, text_val_negatives,
                       layer_sizes, n_comp, 1024, 1024, 1024, 
                        learning_rate, margin, epochs, batch_size, reg_lambda)
    
    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    
    print("n_comp: " + str(n_comp) + " done..!")
    i+=1
    
# The CSVs are only written after the whole sweep has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Train Data reading done..!
Val Data reading done..!
Test Data reading done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 5 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 10 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 15 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 20 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 50 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 100 done..!
Epoch 1/10
Ep

### Running model on ingredients embeddings

In [14]:
# Experiment: latent-size sweep with ingredient embeddings as the text view.
#
# With two paths, get_data takes view 1 and the ids from the first pickle and
# view 2 from the second one -- presumably both files list their rows in the
# same id order; verify against how the pickles were produced.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/ingredients_embeddings_train.pkl')

val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/ingredients_embeddings_val.pkl')

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/ingredients_embeddings_test.pkl')




# Latent sizes to try; `i` is the row of the metric tables filled next.
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
i = 0
# One pre-allocated row per latent size, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
metric_data_recipe2im = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
embeddings_type = "Ingredients"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type 


# Hyperparameters held fixed during the sweep.
learning_rate = 0.001
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
for n_comp in n_components:
    
    # A single Dense layer whose width is the latent size.
    layer_sizes = [n_comp]
    model_out_dir = embeddings_dir +  "/" + str(n_comp)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)
    
    
    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives, 
                       image_val_data, text_val_data, text_val_negatives,
                       layer_sizes, n_comp, 1024, 1024, 1024, 
                        learning_rate, margin, epochs, batch_size, reg_lambda)
    
    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    
    print("n_comp: " + str(n_comp) + " done..!")
    i+=1
    
# The CSVs are only written after the whole sweep has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 5 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 10 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 15 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 20 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 50 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 100 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epo

### Running model on instructions embeddings

In [15]:
# Experiment: latent-size sweep with instruction embeddings as the text view.
#
# With two paths, get_data takes view 1 and the ids from the first pickle and
# view 2 from the second one -- presumably both files list their rows in the
# same id order; verify against how the pickles were produced.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/instructions_embeddings_train.pkl')

val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/instructions_embeddings_val.pkl')

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/instructions_embeddings_test.pkl')






# Latent sizes to try; `i` is the row of the metric tables filled next.
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
i = 0
# One pre-allocated row per latent size, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
metric_data_recipe2im = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
embeddings_type = "Instructions"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type 


# Hyperparameters held fixed during the sweep.
learning_rate = 0.001
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
for n_comp in n_components:
    
    # A single Dense layer whose width is the latent size.
    layer_sizes = [n_comp]
    model_out_dir = embeddings_dir +  "/" + str(n_comp)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)
    
    
    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives, 
                       image_val_data, text_val_data, text_val_negatives,
                       layer_sizes, n_comp, 1024, 1024, 1024, 
                        learning_rate, margin, epochs, batch_size, reg_lambda)
    
    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    
    print("n_comp: " + str(n_comp) + " done..!")
    i+=1
    
# The CSVs are only written after the whole sweep has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 5 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 10 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 15 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 20 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 50 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 100 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epo

### Running model on title embeddings

In [16]:
# Experiment: latent-size sweep with title embeddings as the text view.
#
# With two paths, get_data takes view 1 and the ids from the first pickle and
# view 2 from the second one -- presumably both files list their rows in the
# same id order; verify against how the pickles were produced.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/title_embeddings_train.pkl')

val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/title_embeddings_val.pkl')

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl',
                                                                  '/common/home/aj780/machine_learning/CCA Data/title_embeddings_test.pkl')





# Latent sizes to try; `i` is the row of the metric tables filled next.
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
i = 0
# One pre-allocated row per latent size, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
metric_data_recipe2im = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
embeddings_type = "Title"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type 


# Hyperparameters held fixed during the sweep.
learning_rate = 0.001
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
for n_comp in n_components:
    
    # A single Dense layer whose width is the latent size.
    layer_sizes = [n_comp]
    model_out_dir = embeddings_dir +  "/" + str(n_comp)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)
    
    
    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives, 
                       image_val_data, text_val_data, text_val_negatives,
                       layer_sizes, n_comp, 1024, 1024, 1024, 
                        learning_rate, margin, epochs, batch_size, reg_lambda)
    
    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
    
    print("n_comp: " + str(n_comp) + " done..!")
    i+=1
    
# The CSVs are only written after the whole sweep has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 5 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 10 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 15 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 20 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 50 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
n_comp: 100 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epo

### Running ablation studies on hyperparameters - learning rate

In [12]:
# Ablation: vary the SGD learning rate with the latent size fixed at 200,
# using the "average" embeddings (view2_path=None reads both views from the
# same pickle).
#
# The data is reloaded here, which also re-draws the random negatives; no seed
# is set in the visible cells, so triplets differ from the earlier cells.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl', None)

print("Train Data reading done..!")
val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl', None)
print("Val Data reading done..!")

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl', None)
print("Test Data reading done..!")




# Learning rates are tried in the order listed (not sorted); `i` is the row
# of the metric tables filled next.
learning_rates = [0.1, 0.01, 0.05, 0.001,0.005, 0.0001, 0.0005]
i = 0
# One pre-allocated row per learning rate, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['learning_rate','medR','r@1', 'r@5','r@10'], index=range(len(learning_rates)))
metric_data_recipe2im = pd.DataFrame(columns = ['learning_rate','medR','r@1', 'r@5','r@10'], index=range(len(learning_rates)))
embeddings_type = "learningrate"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type 


# Hyperparameters held fixed during the ablation.
n_component = 200
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
# A single Dense layer of width n_component for every run.
layer_sizes = [n_component]
for lr in learning_rates:
    
    # One output directory per learning rate, named after its str() form.
    model_out_dir = embeddings_dir +  "/" + str(lr)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)
    
    
    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives, 
                       image_val_data, text_val_data, text_val_negatives,
                       layer_sizes, n_component, 1024, 1024, 1024, 
                        lr, margin, epochs, batch_size, reg_lambda)
    
    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [lr, medR, recall[1], recall[5], recall[10]]
    
    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [lr, medR, recall[1], recall[5], recall[10]]
    
    
    print("learning rate: " + str(lr) + " done..!")
    i+=1
    
# The CSVs are only written after the whole ablation has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/lr_metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/lr_metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Train Data reading done..!
Val Data reading done..!
Test Data reading done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.01 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.05 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.001 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.005 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
learning rate: 0.0001 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/

### Running ablation studies on hyperparameters - Number of layers

In [14]:
# Ablation: vary the encoder depth with the latent size fixed at 200, using
# the "average" embeddings (view2_path=None reads both views from the same
# pickle).
#
# The data is reloaded here, which also re-draws the random negatives; no seed
# is set in the visible cells, so triplets differ from the earlier cells.
train_ids, image_train_data, text_train_data, text_train_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_train1.pkl', None)

print("Train Data reading done..!")
val_ids, image_val_data, text_val_data, text_val_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_val1.pkl', None)
print("Val Data reading done..!")

test_ids, image_test_data, text_test_data, text_test_negatives = get_data('/common/home/aj780/machine_learning/CCA Data/embeddings_test1.pkl', None)
print("Test Data reading done..!")


# Hidden-layer widths per configuration; the latent layer is appended below.
layer_sizes = [[],[512], [512, 256], [512, 256, 128], [512, 256, 128, 64]]
# `i` is the row of the metric tables filled next.
i = 0
# One pre-allocated row per configuration, one table per retrieval direction.
metric_data_im2recipe = pd.DataFrame(columns = ['number of layers','medR','r@1', 'r@5','r@10'], index=range(len(layer_sizes)))
metric_data_recipe2im = pd.DataFrame(columns = ['number of layers','medR','r@1', 'r@5','r@10'], index=range(len(layer_sizes)))
embeddings_type = "num_layers"
output_dir = "CCA_Final_Part2_Triplet"

# Relative path: results land under the notebook's working directory.
embeddings_dir = output_dir + "/" + embeddings_type


# Hyperparameters held fixed during the ablation.
learning_rate = 0.001
n_component = 200
margin = 0.4
epochs = 10
batch_size = 512
reg_lambda = 1e-5
for hidden_sizes in layer_sizes:

    # Build a new list instead of appending in place, so the entries of
    # `layer_sizes` still describe only the hidden layers afterwards.
    ls = hidden_sizes + [n_component]
    # Total Dense layers in this configuration (hidden layers + latent layer).
    # The same number labels the directory, the metric row and the log line.
    num_layers = len(ls)
    model_out_dir = embeddings_dir +  "/" + str(num_layers)
    Path(model_out_dir).mkdir(parents=True, exist_ok=True)

    # The three 1024s are the input sizes of the anchor/positive/negative
    # inputs -- assumes every embedding is 1024-dimensional; TODO confirm.
    model = train_model(model_out_dir, image_train_data, text_train_data, text_train_negatives,
                       image_val_data, text_val_data, text_val_negatives,
                       ls, n_component, 1024, 1024, 1024,
                        learning_rate, margin, epochs, batch_size, reg_lambda)

    # `model` is the object returned by train_model after fitting, not the
    # model.h5 checkpoint written to model_out_dir.
    dataX, dataY = predict_model(model,image_test_data, text_test_data, text_test_negatives)
    # evaluation(..., im2recipe, samples_to_draw=1000, time_sample=10)
    medR, recall = evaluation(dataX, dataY, test_ids, True , 1000, 10)
    metric_data_im2recipe.loc[i] = [num_layers, medR, recall[1], recall[5], recall[10]]

    medR, recall = evaluation(dataX, dataY, test_ids, False , 1000, 10)
    metric_data_recipe2im.loc[i] = [num_layers, medR, recall[1], recall[5], recall[10]]

    # Previously this printed the zero-based row index, one less than the
    # layer count stored in the tables and used for the directory name.
    print("number of layers: " + str(num_layers) + " done..!")
    i+=1

# The CSVs are only written after the whole ablation has finished.
metric_data_im2recipe.to_csv(embeddings_dir + "/numlayers_metrics_" + embeddings_type + "_im2recipe.csv", index=False)
metric_data_recipe2im.to_csv(embeddings_dir + "/numlayers_metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

Train Data reading done..!
Val Data reading done..!
Test Data reading done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of layers: 0 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of layers: 1 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of layers: 2 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of layers: 3 done..!
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
number of layers: 4 done..!
