<a href="https://colab.research.google.com/github/smze/T3-1/blob/main/Phase03_Part01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1
## Subpart 1: Multimodal texts & images

In [1]:
# download image datasets
!pip install --upgrade --no-cache-dir gdown

!gdown "1GAZgPpTUBSfhne-Tp0GDkvSHuq6EMMbj&export=download"
!unzip "/content/train_ende.zip"

!gdown "1B9ZFmSTqfTMaqJ15nQDrRNLqBvo-B39W&export=download"
!unzip "/content/test.zip"

!gdown "12HM8uVNjFg-HRZ15ADue4oLGFAYQwvTA&export=download"
!unzip "/content/dev.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dev/1054.jpg            
  inflating: dev/1055.jpg            
  inflating: dev/1056.jpg            
  inflating: dev/1057.jpg            
  inflating: dev/1058.jpg            
  inflating: dev/1059.jpg            
  inflating: dev/106.jpg             
  inflating: dev/1060.jpg            
  inflating: dev/1061.jpg            
  inflating: dev/1062.jpg            
  inflating: dev/1063.jpg            
  inflating: dev/1064.jpg            
  inflating: dev/1065.jpg            
  inflating: dev/1066.jpg            
  inflating: dev/1067.jpg            
  inflating: dev/1068.jpg            
  inflating: dev/1069.jpg            
  inflating: dev/107.jpg             
  inflating: dev/1070.jpg            
  inflating: dev/1071.jpg            
  inflating: dev/1072.jpg            
  inflating: dev/1073.jpg            
  inflating: dev/1074.jpg            
  inflating: dev/1075.jpg            
  inflating: dev/1076.j

In [2]:
# libraries

import os
import re
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm
from pickle import dump, load

from pathlib import Path
from PIL import Image
import cv2

import tensorflow as tf
from keras.applications import DenseNet121
from tensorflow.keras import layers, activations, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import load_img, img_to_array, to_categorical

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# make dataset

img_train = Path('/content/train_ende')
img_train = list(img_train.glob(r'**/*.jpg'))
img_train = pd.Series(img_train).astype(str)
img_train = pd.Series(sorted(img_train, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_train.txt', names=['sentences'], header=None)
lbl_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_train.txt',names=['labels'], header=None)
train_set = pd.concat([img_train, txt_train, lbl_train], axis=1)

img_val = Path('/content/dev')
img_val = list(img_val.glob(r'**/*.jpg'))
img_val = pd.Series(img_val).astype(str)
img_val = pd.Series(sorted(img_val, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_dev.txt', names=['sentences'], header=None)
lbl_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_dev.txt',names=['labels'], header=None)
val_set = pd.concat([img_val, txt_val, lbl_val], axis=1)

img_test = Path('/content/test')
img_test = list(img_test.glob(r'**/*.jpg'))
img_test = pd.Series(img_test).astype(str)
img_test = pd.Series(sorted(img_test, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_test.txt', names=['sentences'], header=None)
lbl_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_test.txt',names=['labels'], header=None)
test_set = pd.concat([img_test, txt_test, lbl_test], axis=1)

In [4]:
def resize(img):
  img = plt.imread(img)
  resized_image = cv2.resize(img, (128,128), interpolation=cv2.INTER_AREA)
  resized_image = np.expand_dims(resized_image, axis=0)
  return resized_image

In [5]:
resized_train_images = np.array([resize(img) for img in train_set['images']]).reshape(len(train_set['images']), 128, 128, 3)
resized_val_images = np.array([resize(img) for img in val_set['images']]).reshape(len(val_set['images']), 128, 128, 3)
resized_test_images = np.array([resize(img) for img in test_set['images']]).reshape(len(test_set['images']), 128, 128, 3)

train_labels = train_set.labels.values
val_labels = val_set.labels.values
test_labels = test_set.labels.values

In [6]:
def clean_text(sentence):
  sentence = str(sentence).lower()
  sentence = re.sub('[^a-z]',' ',sentence)
  sentence = word_tokenize(sentence)
  stop_words = stopwords.words('english')
  sentence = [i for i in sentence if i not in stop_words]
  sentence = [i for i in sentence if len(i)>1]
  sentence = ' '.join(sentence)
  return sentence

In [7]:
train_set['sentences'] = train_set['sentences'].apply(clean_text)
val_set['sentences'] = val_set['sentences'].apply(clean_text)
test_set['sentences'] = test_set['sentences'].apply(clean_text)

train_sentences = train_set.sentences.values
val_sentences = val_set.sentences.values
test_sentences = test_set.sentences.values

In [8]:
!wget --no-check-certificate \
http://nlp.stanford.edu/data/glove.6B.zip \
-O /tmp/glove.6B.zip

with zipfile.ZipFile('/tmp/glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('/tmp/glove')

f = open('/tmp/glove/glove.6B.50d.txt')

glove_vecs = {}

for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype="float32")
    glove_vecs[word] = vec
f.close()

--2023-02-12 07:48:24--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-02-12 07:48:25--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-02-12 07:48:25--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/tmp/glove.6B.zip’


In [9]:
all_centences = np.append(train_sentences,val_sentences)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_centences)
word2idx = tokenizer.word_index

glove_embedding_matrix = np.zeros((len(word2idx) + 1, 50))
for word, i in word2idx.items():
  if i < len(word2idx):
    embedding_vector = glove_vecs.get(word)
    if embedding_vector is not None:
      glove_embedding_matrix[i] = embedding_vector

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=10)

val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, maxlen=10)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=10)

In [10]:
# texts representation

txt_input = layers.Input(shape=(None,))
embed_layer = layers.Embedding(input_dim=glove_embedding_matrix.shape[0],
                                    output_dim=glove_embedding_matrix.shape[1],
                                    mask_zero=True,
                                    weights=[glove_embedding_matrix],
                                    trainable=False)(txt_input)
txt_representation = layers.Bidirectional(layers.LSTM(512))(embed_layer)

In [11]:
# image representation

dense_layer = DenseNet121(weights='imagenet', input_shape=(128,128,3), include_top=False)
dense_layer.trainable = False
img_input = layers.Input(shape=(128,128,3))
x = dense_layer(img_input, training=False)
x = layers.Flatten()(x)
img_representation = layers.Dense(256, activation='relu', kernel_initializer='he_normal')(x)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5


In [12]:
concat = layers.concatenate([img_representation,txt_representation], axis=-1)
output = layers.Dense(3, activation='softmax')(concat)
model = models.Model(inputs=[img_input, txt_input], outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 128, 128, 3  0           []                               
                                )]                                                                
                                                                                                  
 densenet121 (Functional)       (None, 4, 4, 1024)   7037504     ['input_3[0][0]']                
                                                                                                  
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 flatten (Flatten)              (None, 16384)        0           ['densenet121[0][0]']        

In [13]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit([resized_train_images, train_padded], train_labels, epochs=10, batch_size=64, validation_data=([resized_val_images, val_padded], val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6768436e50>

In [14]:
y_pred = model.predict([resized_test_images, test_padded])
y_pred = np.argmax(y_pred, axis=1)
print('Accuracy: ', accuracy_score(test_labels, y_pred))

Accuracy:  0.4487862640615749


In [15]:
# free up RAM
%reset -f

## Subpart 2: Pre-trained Transformer Backbones

In [16]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m102.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1


In [17]:
# libraries

import os
import re
import zipfile

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

from pathlib import Path
from PIL import Image

from transformers import AutoProcessor
from tensorflow.keras import layers, activations, models
from tensorflow.keras.preprocessing.sequence import pad_sequences

# pre-trained transformer for image-text representation
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [18]:
# make dataset

img_train = Path('/content/train_ende')
img_train = list(img_train.glob(r'**/*.jpg'))
img_train = pd.Series(img_train).astype(str)
img_train = pd.Series(sorted(img_train, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_train.txt', names=['sentences'], header=None)
train_set = pd.concat([img_train, txt_train], axis=1)

img_val = Path('/content/dev')
img_val = list(img_val.glob(r'**/*.jpg'))
img_val = pd.Series(img_val).astype(str)
img_val = pd.Series(sorted(img_val, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_dev.txt', names=['sentences'], header=None)
val_set = pd.concat([img_val, txt_val], axis=1)

img_test = Path('/content/test')
img_test = list(img_test.glob(r'**/*.jpg'))
img_test = pd.Series(img_test).astype(str)
img_test = pd.Series(sorted(img_test, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_test.txt', names=['sentences'], header=None)
test_set = pd.concat([img_test, txt_test], axis=1)

In [19]:
def text_representation(dataset):
  text_representation=[]
  for i in tqdm(range(len(dataset))):
    text_representation.append(processor(text=dataset['sentences'][i], images=Image.open(dataset['images'][i]), padding=True)['input_ids'])
  text_representation = pad_sequences(text_representation, maxlen=15)
  return text_representation

train_text_representation = text_representation(train_set[:5000])
np.save('train_text_representation.npy', train_text_representation)
val_text_representation = text_representation(val_set[:1000])
np.save('val_text_representation.npy', val_text_representation)
test_text_representation = text_representation(test_set[:1000])
np.save('test_text_representation.npy', test_text_representation)

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [20]:
%reset -f

# libraries

import os
import re
import zipfile

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

from pathlib import Path
from PIL import Image

from transformers import AutoProcessor
from tensorflow.keras import layers, activations, models

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# make dataset

img_train = Path('/content/train_ende')
img_train = list(img_train.glob(r'**/*.jpg'))
img_train = pd.Series(img_train).astype(str)
img_train = pd.Series(sorted(img_train, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_train.txt', names=['sentences'], header=None)
train_set = pd.concat([img_train, txt_train], axis=1)

img_val = Path('/content/dev')
img_val = list(img_val.glob(r'**/*.jpg'))
img_val = pd.Series(img_val).astype(str)
img_val = pd.Series(sorted(img_val, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_dev.txt', names=['sentences'], header=None)
val_set = pd.concat([img_val, txt_val], axis=1)

img_test = Path('/content/test')
img_test = list(img_test.glob(r'**/*.jpg'))
img_test = pd.Series(img_test).astype(str)
img_test = pd.Series(sorted(img_test, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_test.txt', names=['sentences'], header=None)
test_set = pd.concat([img_test, txt_test], axis=1)

In [21]:
def image_representation(dataset):
  image_representation=[]
  for i in tqdm(range(len(dataset))):
    image_representation.append(np.expand_dims(processor(text=dataset['sentences'][i],
                                                         images=Image.open(dataset['images'][i]),
                                                         padding=True)['pixel_values'], axis=0))
  return image_representation

train_image_representation = image_representation(train_set[:5000])
np.save('train_image_representation.npy', train_image_representation)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [22]:
%reset -f

# libraries

import os
import re
import zipfile

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from pathlib import Path
from PIL import Image

from transformers import AutoProcessor
from tensorflow.keras import layers, activations, models

processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

# make dataset

img_train = Path('/content/train_ende')
img_train = list(img_train.glob(r'**/*.jpg'))
img_train = pd.Series(img_train).astype(str)
img_train = pd.Series(sorted(img_train, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_train.txt', names=['sentences'], header=None)
train_set = pd.concat([img_train, txt_train], axis=1)

img_val = Path('/content/dev')
img_val = list(img_val.glob(r'**/*.jpg'))
img_val = pd.Series(img_val).astype(str)
img_val = pd.Series(sorted(img_val, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_dev.txt', names=['sentences'], header=None)
val_set = pd.concat([img_val, txt_val], axis=1)

img_test = Path('/content/test')
img_test = list(img_test.glob(r'**/*.jpg'))
img_test = pd.Series(img_test).astype(str)
img_test = pd.Series(sorted(img_test, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_test.txt', names=['sentences'], header=None)
test_set = pd.concat([img_test, txt_test], axis=1)

In [23]:
def image_representation(dataset):
  image_representation=[]
  for i in tqdm(range(len(dataset))):
    image_representation.append(np.expand_dims(processor(text=dataset['sentences'][i],
                                                         images=Image.open(dataset['images'][i]),
                                                         padding=True)['pixel_values'], axis=0))
  return image_representation

val_image_representation = image_representation(val_set[:1000])
np.save('val_image_representation.npy', val_image_representation)
test_image_representation = image_representation(test_set[:1000])
np.save('test_image_representation.npy', test_image_representation)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
%reset -f

# libraries

import os
import re
import zipfile

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

from PIL import Image
from pathlib import Path
from tqdm.notebook import tqdm

from transformers import AutoProcessor
from tensorflow.keras import layers, activations, models

# make dataset

img_train = Path('/content/train_ende')
img_train = list(img_train.glob(r'**/*.jpg'))
img_train = pd.Series(img_train).astype(str)
img_train = pd.Series(sorted(img_train, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_train.txt', names=['sentences'], header=None)
lbl_train = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_train.txt',names=['labels'], header=None)
train_set = pd.concat([img_train, txt_train, lbl_train], axis=1)

img_val = Path('/content/dev')
img_val = list(img_val.glob(r'**/*.jpg'))
img_val = pd.Series(img_val).astype(str)
img_val = pd.Series(sorted(img_val, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_dev.txt', names=['sentences'], header=None)
lbl_val = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_dev.txt',names=['labels'], header=None)
val_set = pd.concat([img_val, txt_val, lbl_val], axis=1)

img_test = Path('/content/test')
img_test = list(img_test.glob(r'**/*.jpg'))
img_test = pd.Series(img_test).astype(str)
img_test = pd.Series(sorted(img_test, key=lambda x:int(re.findall('(\d+)', x)[0])), name='images')
txt_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/english_test.txt', names=['sentences'], header=None)
lbl_test = pd.read_fwf('https://github.com/XL2248/MSCTD/raw/main/MSCTD_data/ende/sentiment_test.txt',names=['labels'], header=None)
test_set = pd.concat([img_test, txt_test, lbl_test], axis=1)

In [25]:
train_text_representation = np.load('/content/train_text_representation.npy', allow_pickle=True)
val_text_representation = np.load('/content/val_text_representation.npy', allow_pickle=True)
test_text_representation = np.load('/content/test_text_representation.npy', allow_pickle=True)

train_image_representation = np.load('/content/train_image_representation.npy', allow_pickle=True)
train_image_representation = np.array(train_image_representation, dtype='float16').reshape(len(train_image_representation), 224, 224, 3)
val_image_representation = np.load('/content/val_image_representation.npy', allow_pickle=True)
val_image_representation = np.array(val_image_representation, dtype='float16').reshape(len(val_image_representation), 224, 224, 3)
test_image_representation = np.load('/content/test_image_representation.npy', allow_pickle=True)
test_image_representation = np.array(test_image_representation, dtype='float16').reshape(len(test_image_representation), 224, 224, 3)

train_labels = train_set['labels'][:5000].values
val_labels = val_set['labels'][:1000].values
test_labels = test_set['labels'][:1000].values

In [26]:
text_input = layers.Input(shape=(15,))
text_layer = layers.Dense(64, activation="relu", kernel_initializer='he_normal')(text_input)

image_input = layers.Input(shape=(224,224,3))
image_layer = layers.Flatten()(image_input)

concat = layers.concatenate([image_layer,text_layer])

fc = layers.Dense(256, activation="relu", kernel_initializer='he_normal')(concat)
total_output = layers.Dense(3, activation="softmax")(fc)

model = models.Model(inputs=[text_input,image_input], outputs=total_output)

In [27]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit([train_text_representation, train_image_representation], train_labels, epochs=15, batch_size=128,
          validation_data=([val_text_representation, val_image_representation], val_labels))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f66c41eaa00>

In [28]:
y_pred = model.predict([test_text_representation, test_image_representation])
y_pred = np.argmax(y_pred, axis=1)
print('Accuracy: ', accuracy_score(test_labels, y_pred))

Accuracy:  0.368
