In [46]:
import numpy as np
from pathlib import Path
import pandas as pd

In [47]:
# Read each JSON-lines split, index it by `id`, and record which split every
# row came from so the splits can be recovered after concatenation.
train_jsonl = (
    pd.read_json('data/train.jsonl', lines=True)
    .set_index('id')
    .assign(split='train')
)
dev_jsonl = (
    pd.read_json('data/dev.jsonl', lines=True)
    .set_index('id')
    .assign(split='dev')
)
test_jsonl = (
    pd.read_json('data/test.jsonl', lines=True)
    .set_index('id')
    .assign(split='test')
)

# Stack the three splits row-wise; the `split` column keeps their provenance.
merged_df = pd.concat([train_jsonl, dev_jsonl, test_jsonl])

In [145]:
# Every file under info/scene is named after an integer id (the file stem)
# and contains one float per line.
image_vectors_folder = Path("info") / "scene"


def _read_vector(vector_path):
    """Parse a file of newline-separated floats into a 1-D numpy array."""
    with vector_path.open() as handle:
        raw_lines = handle.read().split('\n')
    # Empty strings come from trailing/blank lines and carry no value.
    return np.array([float(raw) for raw in raw_lines if raw != ''])


image_vectors = {}
for vector_path in image_vectors_folder.iterdir():
    # Parse the id first so a non-numeric file name fails before any read.
    vector_id = int(vector_path.stem)
    image_vectors[vector_id] = _read_vector(vector_path)

# Look up each row's vector by its index value (the `id`).
merged_df['image_vector'] = merged_df.index.map(image_vectors)

In [118]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk  # needed for nltk.word_tokenize below; was previously never imported
from nltk.stem import WordNetLemmatizer

# Tokenize the `text` of every row, then lemmatize each token.
# NOTE(review): word_tokenize and WordNetLemmatizer rely on NLTK data packages
# being available locally - confirm they are downloaded in this environment.
splitted = [nltk.word_tokenize(doc) for doc in merged_df['text']]
lemmatizer = WordNetLemmatizer()
lemmatized = [[lemmatizer.lemmatize(word) for word in doc] for doc in splitted]

# Tag each document with its positional index and train Doc2Vec on the corpus.
tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(lemmatized)]
doc2vec = Doc2Vec(tagged_docs)

# Infer one embedding per document; rows are in the same order as merged_df.
text_vectors = np.array([doc2vec.infer_vector(doc) for doc in lemmatized])

# A 2-D array cannot be assigned to a single column, so store one 1-D vector
# per row instead (list(...) splits the array along its first axis).
merged_df['text_vector'] = list(text_vectors)
# The dangling `merged_df['image_vector'] =` statement was removed: it was a
# syntax error, and that column is already filled from the info/scene files.

In [134]:
# Keep only the columns needed downstream, cast the label to bool, and persist
# the result as a pickle (protocol 4).
# NOTE(review): astype(bool) turns NaN into True - if any split lacks labels,
# those rows will silently read as True. Confirm against the data.
learning_columns = ['text_vector', 'image_vector', 'label', 'split']
learning_df = merged_df.reindex(columns=learning_columns)
learning_df = learning_df.assign(label=learning_df['label'].astype(bool))
learning_df.to_pickle("learning_df.pickle", protocol=4)

In [138]:
# Display the image_vector column as a sanity check (pandas truncates the repr).
learning_df.loc[:, 'image_vector']

id
42953    [9.243041745321534e-07, 0.0010814660927280784,...
23058    [4.7280764192692e-06, 0.0002732583088800311, 5...
13894    [1.44773900956352e-06, 1.3311026123119518e-05,...
37408    [2.800878974085208e-06, 3.871678563882597e-05,...
82403    [1.6485433889101841e-06, 0.0007236988167278469...
                               ...                        
3869     [3.028519017789222e-07, 6.190018211782444e-06,...
23817    [4.958599575388689e-08, 6.375514089995704e-07,...
56280    [6.673790630884469e-05, 0.006667210254818201, ...
29384    [2.6612129659042694e-05, 0.0003297853982076049...
34127    [1.7552205235915608e-06, 0.0005712925922125578...
Name: image_vector, Length: 10000, dtype: object