In [2]:
import copy
import os
import pickle
import random

import numpy as np
import pandas as pd
from cca_zoo.models import CCA
from sklearn.preprocessing import normalize

In [4]:
def train(data_X, data_Y, n_components, output_dir, embeddings_type= "ingredients"):
    """Fit a CCA model on two paired views and pickle the fitted model to disk.

    Parameters
    ----------
    data_X, data_Y :
        The two views, handed to ``CCA.fit`` as ``[data_X, data_Y]``.
        Presumably row-aligned 2-D arrays -- confirm against the caller.
    n_components : int
        Forwarded to ``CCA(latent_dims=...)`` and appended to the file name.
    output_dir : str
        Prefix of the model file. It is concatenated verbatim (no path
        separator is inserted), so pass a trailing slash to write inside a
        directory.
    embeddings_type : str
        Label embedded in the file name.

    Returns
    -------
    The fitted model object.
    """
    # File name layout is "<output_dir>_<embeddings_type>_<n_components>".
    model_file = output_dir + "_" + embeddings_type + "_" + str(n_components)

    # Make sure the target directory exists *before* fitting, so that a
    # missing folder cannot throw away a finished fit at save time.
    target_dir = os.path.dirname(model_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    model = CCA(latent_dims=n_components)
    model.fit([data_X, data_Y])

    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

    return model

In [5]:
def evaluation(model, data_X, data_Y, data_ids, im2recipe, samples_to_draw, time_sample=10, seed=None):
    """Score cross-view retrieval with median rank and recall@{1, 5, 10}.

    For each of ``time_sample`` rounds, ``samples_to_draw`` paired rows are
    drawn without replacement, a dot-product similarity matrix between the
    two views is built, and for every query the rank of its own pair among
    all candidates is recorded.

    Parameters
    ----------
    model :
        Unused; kept so existing callers keep working.
    data_X, data_Y : array-like, 2-D
        Row-aligned embeddings of the two views (``data_X`` is treated as the
        image side, ``data_Y`` as the text side).
    data_ids : array-like
        One id per row; rows are processed in sorted-id order.
    im2recipe : bool
        True: rows of ``data_X`` are the queries and ``data_Y`` the candidates.
        False: the roles are swapped.
    samples_to_draw : int
        Number of pairs per round (>= 1 and <= number of rows).
    time_sample : int
        Number of rounds (>= 1).
    seed : int or None
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible. When None, the global ``random`` module state
        is used, exactly as before.

    Returns
    -------
    (float, dict)
        Mean over rounds of the per-round median rank, and a dict
        ``{1: r@1, 5: r@5, 10: r@10}`` of recalls averaged over rounds.

    Raises
    ------
    ValueError
        If ``samples_to_draw`` or ``time_sample`` is smaller than 1, or if
        more samples are requested than there are rows.
    """
    if samples_to_draw < 1:
        raise ValueError("samples_to_draw must be at least 1")
    if time_sample < 1:
        raise ValueError("time_sample must be at least 1")

    # Accept plain lists as well as arrays; fancy indexing below needs ndarrays.
    data_ids = np.asarray(data_ids)
    data_X = np.asarray(data_X)
    data_Y = np.asarray(data_Y)
    if samples_to_draw > len(data_ids):
        raise ValueError("samples_to_draw exceeds the number of available samples")

    # Draw from the global generator unless the caller asked for a fixed seed.
    rng = random if seed is None else random.Random(seed)

    # Put both views in sorted-id order so a given seed always picks the same rows.
    order = np.argsort(data_ids)
    names = data_ids[order]
    image_vecs = data_X[order]
    text_vecs = data_Y[order]

    recall_levels = (1, 5, 10)
    glob_rank = []
    glob_recall = {level: 0.0 for level in recall_levels}

    for _ in range(time_sample):
        ids = rng.sample(range(0, len(names)), samples_to_draw)
        im_sub = image_vecs[ids, :]
        instr_sub = text_vecs[ids, :]

        if im2recipe:
            sims = np.dot(im_sub, instr_sub.T)  # image queries vs. text candidates
        else:
            sims = np.dot(instr_sub, im_sub.T)  # text queries vs. image candidates

        ranks = []
        for query in range(samples_to_draw):
            # Row `query` holds the similarities of this query to every candidate.
            # Sort candidate indices by descending similarity and find where the
            # true pair (same index as the query) ended up; ranks are 1-based.
            sorting = np.argsort(sims[query, :])[::-1].tolist()
            ranks.append(sorting.index(query) + 1)

        ranks = np.asarray(ranks)
        for level in recall_levels:
            glob_recall[level] += int(np.count_nonzero(ranks <= level)) / samples_to_draw
        glob_rank.append(np.median(ranks))

    for level in recall_levels:
        glob_recall[level] = glob_recall[level] / time_sample

    return np.average(glob_rank), glob_recall
    

In [4]:
# Experiment: CCA on the "averageEmbeddings" data, scored with image queries
# against text candidates (im2recipe = True).

# --- Load data --------------------------------------------------------------
# Both pickles are indexed positionally: [0] is used as the image view, [1] as
# the text view and, in the test file, [2] as the sample ids.
with open("CCA Data/embeddings_train1.pkl",'rb') as f:
    image_data = pickle.load(f)

image_train_data = image_data[0]
ingre_train_data = image_data[1]

with open("CCA Data/embeddings_test1.pkl",'rb') as f:
    image_data = pickle.load(f)

test_ids = image_data[2]
image_test_data = image_data[0]
ingre_test_data = image_data[1]

# --- Configuration ----------------------------------------------------------
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
embeddings_type = "averageEmbeddings"
im2recipe = True
output_dir = "CCA_Final_Data/"
RANDOM_SEED = 42

# train() and the CSV export both write under output_dir; create it up front.
os.makedirs(output_dir, exist_ok=True)

# evaluation() samples with the `random` module; seed it so that re-running
# this cell reproduces the same metrics table.
random.seed(RANDOM_SEED)

# --- Train / evaluate one model per latent dimensionality -------------------
metric_data = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
for i, n_comp in enumerate(n_components):
    model = train(image_train_data, ingre_train_data, n_comp, output_dir, embeddings_type=embeddings_type)

    dataX, dataY = model.transform([image_test_data, ingre_test_data])
    # 10 rounds of 1000 randomly drawn test pairs each.
    medR, recall = evaluation(model, dataX, dataY, test_ids, im2recipe , 1000, 10)
    metric_data.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]

    print("n_comp: " + str(n_comp) + " done..!")

# --- Persist the metrics table ----------------------------------------------
direction = "im2recipe" if im2recipe else "recipe2im"
metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_" + direction + ".csv", index=False)

    

n_comp: 1 done..!
n_comp: 5 done..!
n_comp: 10 done..!
n_comp: 15 done..!
n_comp: 20 done..!
n_comp: 50 done..!
n_comp: 100 done..!
n_comp: 200 done..!
n_comp: 500 done..!
n_comp: 1000 done..!


In [5]:
# Experiment: CCA on the "averageEmbeddings" data, scored with text queries
# against image candidates (im2recipe = False).

# --- Load data --------------------------------------------------------------
# Both pickles are indexed positionally: [0] is used as the image view, [1] as
# the text view and, in the test file, [2] as the sample ids.
with open("CCA Data/embeddings_train1.pkl",'rb') as f:
    image_data = pickle.load(f)

image_train_data = image_data[0]
ingre_train_data = image_data[1]

with open("CCA Data/embeddings_test1.pkl",'rb') as f:
    image_data = pickle.load(f)

test_ids = image_data[2]
image_test_data = image_data[0]
ingre_test_data = image_data[1]

# --- Configuration ----------------------------------------------------------
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
embeddings_type = "averageEmbeddings"
im2recipe = False
output_dir = "CCA_Final_Data_recipe2im/"
RANDOM_SEED = 42

# train() and the CSV export both write under output_dir; create it up front.
os.makedirs(output_dir, exist_ok=True)

# evaluation() samples with the `random` module; seed it so that re-running
# this cell reproduces the same metrics table.
random.seed(RANDOM_SEED)

# --- Train / evaluate one model per latent dimensionality -------------------
metric_data = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
for i, n_comp in enumerate(n_components):
    model = train(image_train_data, ingre_train_data, n_comp, output_dir, embeddings_type=embeddings_type)

    dataX, dataY = model.transform([image_test_data, ingre_test_data])
    # 10 rounds of 1000 randomly drawn test pairs each.
    medR, recall = evaluation(model, dataX, dataY, test_ids, im2recipe , 1000, 10)
    metric_data.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]

    print("n_comp: " + str(n_comp) + " done..!")

# --- Persist the metrics table ----------------------------------------------
direction = "im2recipe" if im2recipe else "recipe2im"
metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_" + direction + ".csv", index=False)

    

n_comp: 1 done..!
n_comp: 5 done..!
n_comp: 10 done..!
n_comp: 15 done..!
n_comp: 20 done..!
n_comp: 50 done..!
n_comp: 100 done..!
n_comp: 200 done..!
n_comp: 500 done..!
n_comp: 1000 done..!


In [4]:
# NOTE: disabled data-loading step for the "ingredients" experiment; kept for reference only.
# with open("CCA Data/embeddings_train1.pkl",'rb') as f:
#     image_data = pickle.load(f)
    
# with open("CCA Data/ingredients_embeddings_train.pkl",'rb') as f:
#     ingre_data = pickle.load(f)

# image_train_data = image_data[0]
# ingre_train_data = ingre_data[0]


# with open("CCA Data/embeddings_test1.pkl",'rb') as f:
#     image_data = pickle.load(f)
    
# with open("CCA Data/ingredients_embeddings_test.pkl",'rb') as f:
#     ingre_data = pickle.load(f)

# test_ids = image_data[2]
# image_test_data = image_data[0]
# ingre_test_data = ingre_data[0]

In [5]:
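# NOTE: this experiment (embeddings_type = "ingredients") is disabled. Any output still
# shown under this cell was produced before the code was commented out.
# n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]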
# i = 0
# metric_data = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
# embeddings_type = "ingredients"
# im2recipe = False
# output_dir = "CCA_Final_Data_recipe2im/"
# for n_comp in n_components:
#     model = train(image_train_data, ingre_train_data, n_comp, output_dir, embeddings_type=embeddings_type)
    
#     dataX, dataY = model.transform([image_test_data, ingre_test_data])
#     medR, recall = evaluation(model, dataX, dataY, test_ids, im2recipe , 1000, 10)
#     metric_data.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]
    
#     print("n_comp: " + str(n_comp) + " done..!")
#     i+=1
    
# if im2recipe:
#     metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_im2recipe.csv", index=False)

# else:
#     metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_recipe2im.csv", index=False)

    

n_comp: 1 done..!
n_comp: 5 done..!
n_comp: 10 done..!
n_comp: 15 done..!
n_comp: 20 done..!
n_comp: 50 done..!
n_comp: 100 done..!
n_comp: 200 done..!
n_comp: 500 done..!
n_comp: 1000 done..!


In [6]:
# Display whatever metrics table `metric_data` currently holds (it is rebuilt by
# each experiment cell, with one row per entry of `n_components`).
metric_data

Unnamed: 0,n_comp,medR,r@1,r@5,r@10
0,1,264.7,0.0019,0.0097,0.0194
1,5,65.35,0.0127,0.0583,0.1103
2,10,27.4,0.0432,0.1659,0.2804
3,15,14.55,0.0906,0.2959,0.4251
4,20,9.75,0.1355,0.3718,0.5194
5,50,5.0,0.2336,0.5205,0.6597
6,100,3.5,0.2949,0.5886,0.706
7,200,3.0,0.3375,0.6237,0.7164
8,500,2.8,0.3732,0.6095,0.6781
9,1000,3.75,0.3438,0.5439,0.6025


In [7]:
# Experiment: CCA between the image embeddings and the "instructions" text
# embeddings, scored with text queries against image candidates
# (im2recipe = False).

# --- Load data --------------------------------------------------------------
# Element [0] of each pickle is used as that file's embedding matrix; element
# [2] of the image test pickle is used as the sample ids.
# NOTE: the ingre_* names are reused here for the instruction embeddings.
with open("CCA Data/embeddings_train1.pkl",'rb') as f:
    image_data = pickle.load(f)

with open("CCA Data/instructions_embeddings_train.pkl",'rb') as f:
    ingre_data = pickle.load(f)

image_train_data = image_data[0]
ingre_train_data = ingre_data[0]

with open("CCA Data/embeddings_test1.pkl",'rb') as f:
    image_data = pickle.load(f)

with open("CCA Data/instructions_embeddings_test.pkl",'rb') as f:
    ingre_data = pickle.load(f)

test_ids = image_data[2]
image_test_data = image_data[0]
ingre_test_data = ingre_data[0]

# --- Configuration ----------------------------------------------------------
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
embeddings_type = "instructions"
im2recipe = False
output_dir = "CCA_Final_Data_recipe2im/"
RANDOM_SEED = 42

# train() and the CSV export both write under output_dir; create it up front.
os.makedirs(output_dir, exist_ok=True)

# evaluation() samples with the `random` module; seed it so that re-running
# this cell reproduces the same metrics table.
random.seed(RANDOM_SEED)

# --- Train / evaluate one model per latent dimensionality -------------------
metric_data = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
for i, n_comp in enumerate(n_components):
    model = train(image_train_data, ingre_train_data, n_comp, output_dir, embeddings_type=embeddings_type)

    dataX, dataY = model.transform([image_test_data, ingre_test_data])
    # 10 rounds of 1000 randomly drawn test pairs each.
    medR, recall = evaluation(model, dataX, dataY, test_ids, im2recipe , 1000, 10)
    metric_data.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]

    print("n_comp: " + str(n_comp) + " done..!")

# --- Persist the metrics table ----------------------------------------------
direction = "im2recipe" if im2recipe else "recipe2im"
metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_" + direction + ".csv", index=False)

    

n_comp: 1 done..!
n_comp: 5 done..!
n_comp: 10 done..!
n_comp: 15 done..!
n_comp: 20 done..!
n_comp: 50 done..!
n_comp: 100 done..!
n_comp: 200 done..!
n_comp: 500 done..!
n_comp: 1000 done..!


In [8]:
# Display whatever metrics table `metric_data` currently holds (it is rebuilt by
# each experiment cell, with one row per entry of `n_components`).
metric_data

Unnamed: 0,n_comp,medR,r@1,r@5,r@10
0,1,267.0,0.0017,0.0096,0.0195
1,5,85.0,0.0105,0.0495,0.094
2,10,32.75,0.0373,0.1499,0.2487
3,15,20.1,0.0653,0.2284,0.3466
4,20,12.8,0.1044,0.3165,0.4585
5,50,5.0,0.231,0.5262,0.6697
6,100,3.6,0.2749,0.5952,0.7199
7,200,3.05,0.3154,0.6203,0.7223
8,500,3.1,0.3532,0.6053,0.683
9,1000,3.5,0.3403,0.5593,0.6222


In [9]:
# Experiment: CCA between the image embeddings and the "title" text
# embeddings, scored with text queries against image candidates
# (im2recipe = False).

# --- Load data --------------------------------------------------------------
# Element [0] of each pickle is used as that file's embedding matrix; element
# [2] of the image test pickle is used as the sample ids.
# NOTE: the ingre_* names are reused here for the title embeddings.
with open("CCA Data/embeddings_train1.pkl",'rb') as f:
    image_data = pickle.load(f)

with open("CCA Data/title_embeddings_train.pkl",'rb') as f:
    ingre_data = pickle.load(f)

image_train_data = image_data[0]
ingre_train_data = ingre_data[0]

with open("CCA Data/embeddings_test1.pkl",'rb') as f:
    image_data = pickle.load(f)

with open("CCA Data/title_embeddings_test.pkl",'rb') as f:
    ingre_data = pickle.load(f)

test_ids = image_data[2]
image_test_data = image_data[0]
ingre_test_data = ingre_data[0]

# --- Configuration ----------------------------------------------------------
n_components = [1,5, 10, 15, 20,50, 100,200,500, 1000]
embeddings_type = "title"
im2recipe = False
output_dir = "CCA_Final_Data_recipe2im/"
RANDOM_SEED = 42

# train() and the CSV export both write under output_dir; create it up front.
os.makedirs(output_dir, exist_ok=True)

# evaluation() samples with the `random` module; seed it so that re-running
# this cell reproduces the same metrics table.
random.seed(RANDOM_SEED)

# --- Train / evaluate one model per latent dimensionality -------------------
metric_data = pd.DataFrame(columns = ['n_comp','medR','r@1', 'r@5','r@10'], index=range(len(n_components)))
for i, n_comp in enumerate(n_components):
    model = train(image_train_data, ingre_train_data, n_comp, output_dir, embeddings_type=embeddings_type)

    dataX, dataY = model.transform([image_test_data, ingre_test_data])
    # 10 rounds of 1000 randomly drawn test pairs each.
    medR, recall = evaluation(model, dataX, dataY, test_ids, im2recipe , 1000, 10)
    metric_data.loc[i] = [n_comp, medR, recall[1], recall[5], recall[10]]

    print("n_comp: " + str(n_comp) + " done..!")

# --- Persist the metrics table ----------------------------------------------
direction = "im2recipe" if im2recipe else "recipe2im"
metric_data.to_csv(output_dir + "metrics_" + embeddings_type + "_" + direction + ".csv", index=False)

    

n_comp: 1 done..!
n_comp: 5 done..!
n_comp: 10 done..!
n_comp: 15 done..!
n_comp: 20 done..!
n_comp: 50 done..!
n_comp: 100 done..!
n_comp: 200 done..!
n_comp: 500 done..!
n_comp: 1000 done..!


In [10]:
# Display whatever metrics table `metric_data` currently holds (it is rebuilt by
# each experiment cell, with one row per entry of `n_components`).
metric_data

Unnamed: 0,n_comp,medR,r@1,r@5,r@10
0,1,284.9,0.002,0.0099,0.0196
1,5,122.25,0.0077,0.0387,0.0752
2,10,55.1,0.0194,0.0921,0.1641
3,15,38.75,0.0356,0.1345,0.2256
4,20,25.1,0.0515,0.1926,0.3078
5,50,10.1,0.129,0.3683,0.5076
6,100,8.6,0.1573,0.4148,0.5371
7,200,7.55,0.1914,0.443,0.5489
8,500,10.95,0.2161,0.43,0.5008
9,1000,27.45,0.1925,0.3621,0.4171


In [1]:
import pickle

In [3]:
# Reload a previously pickled model from disk. The file name follows train()'s
# "<output_dir>_<embeddings_type>_<n_components>" layout.
# NOTE(review): no visible cell writes "instructions" models under
# CCA_Final_Data/ -- confirm this file comes from an earlier run.
with open('CCA_Final_Data/_instructions_200', mode='rb') as model_fh:
    model = pickle.load(model_fh)

In [9]:
# Inspect the shape of the second entry of the loaded model's `weights`.
# Presumably this is the projection for the second (text) view, laid out as
# (input_dim, latent_dims) -- confirm against the cca_zoo documentation.
model.weights[1].shape

(1024, 200)