In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2

In [None]:
# constants
# Target size handed to cv2.resize; also the height/width of the model's image input.
IMG_SIZE = (256, 256)
# NOTE(review): not referenced by the visible code; training() is called with batch=256.
BATCH_SIZE = 32
# NOTE(review): not referenced by the visible code; training() is called with epochs=5.
EPOCHS = 100
# output_sequence_length of the TextVectorization layer (ids per vectorized code string).
SEQUENCE_LENGTH = 75
# Placeholder only: reassigned to len(code_vocab) after the vectorizer has been adapted.
VOCAB_SIZE = 20
# Number of context tokens fed to the language model for each training/inference step.
TOKEN_LEN = 48

In [None]:
# Gather the screenshot (.png) and markup (.gui) file names from the dataset folder.
# Both lists are sorted; later cells pair entries by position.
dataset_path = '../input/pix2code/android/all_data/'
_entries = os.listdir(dataset_path)
images_paths = sorted(name for name in _entries if name.endswith('.png'))
codes_paths = sorted(name for name in _entries if name.endswith('.gui'))
print(len(images_paths), len(codes_paths))

### Reading the Dataset

In [None]:
# Not populated by this cell; kept so the module-level name stays defined.
vocab_set = set()

def code_preprocess(c):
    """Turn raw `.gui` markup into a single-space-separated token string.

    Braces and commas are surrounded with spaces so that they become
    standalone tokens, then every run of whitespace (newlines, tabs,
    carriage returns, repeated spaces) is collapsed to exactly one space
    and leading/trailing whitespace is dropped.

    Args:
        c: Raw markup text read from a `.gui` file.

    Returns:
        The normalized string; an empty or whitespace-only input yields ''.
    """
    for symbol in ('{', '}', ','):
        c = c.replace(symbol, f' {symbol} ')
    # A single `replace('  ', ' ')` pass only halves runs of spaces, so longer
    # runs survived; split/join collapses runs of any length in one go.
    return ' '.join(c.split())

# Load every screenshot and its markup file into memory.
# image_data: resized images scaled to [0, 1]; code_data: '<START> ... <END>' strings.
image_data = []
code_data = []

# Files are paired purely by their position in the two sorted lists, so a count
# mismatch means at least one pair would be wrong; fail before loading anything.
# NOTE(review): matching base names are assumed but not checked here.
if len(images_paths) != len(codes_paths):
    raise ValueError(
        f"Found {len(images_paths)} images but {len(codes_paths)} code files in {dataset_path}"
    )

for image_name, code_name in zip(images_paths, codes_paths):
    image_path = os.path.join(dataset_path, image_name)
    img = cv2.imread(image_path)
    # cv2.imread signals an unreadable/missing file by returning None instead of raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.resize(img, IMG_SIZE)
    # Stored as float32 (the dtype the training loop casts to) to halve memory use.
    # NOTE(review): cv2.imread yields BGR channel order; the channels are not reordered here.
    image_data.append((img / 255.0).astype(np.float32))
    with open(os.path.join(dataset_path, code_name), 'r') as f:
        code_data.append('<START> ' + code_preprocess(f.read()) + ' <END>')


In [None]:
# Tokenizer for the markup strings. standardize=None disables the layer's text
# normalization, and output_sequence_length fixes every output to
# SEQUENCE_LENGTH integer ids.
code_vectorizer = keras.layers.TextVectorization(
    output_mode='int',
    standardize=None,
    output_sequence_length=SEQUENCE_LENGTH
)
# Build the vocabulary from the full set of '<START> ... <END>' strings.
code_vectorizer.adapt(code_data)

# code_data is rebound from raw strings to their id sequences.
# NOTE(review): re-running this cell alone would feed ids back into the vectorizer.
code_data = code_vectorizer(code_data)

print(len(image_data), len(code_data))

In [None]:
# Spot-check one sample: its vectorized code and the matching screenshot.
# NOTE(review): the image came from cv2.imread (BGR order), so imshow colors may look swapped — confirm.
print(code_data[200])
plt.imshow(image_data[200])

In [None]:
# Token list learned by the vectorizer; a token's position in the list is its id.
code_vocab = code_vectorizer.get_vocabulary()
# Overrides the placeholder VOCAB_SIZE from the constants cell with the real size.
VOCAB_SIZE = len(code_vocab)
print(code_vocab, VOCAB_SIZE)

In [None]:
def prepare_dataset(image_data, code_data, token_len=None, vocab=None):
    """Expand (image, token sequence) pairs into next-token training samples.

    For every target position ``t >= 1`` of a sequence, one sample is emitted
    whose context is the up-to-``token_len`` tokens preceding ``t`` (zero-padded
    on the right to exactly ``token_len``) and whose label is the token at ``t``.
    Generation for a sequence stops right after the sample whose label is
    '<END>', and no sample is ever emitted with a padding (id 0) label.

    The earlier two-loop version tested '<END>' on the first token of the
    sliding window rather than on the label, and ran the sliding-window loop
    even when '<END>' had already been reached, which produced samples that
    taught the model to predict padding.

    Args:
        image_data: Sequence of images; ``image_data[i]`` is stored by
            reference in every sample derived from ``code_data[i]``.
        code_data: Sequence of integer token-id sequences (one per image).
        token_len: Context length; defaults to the module-level ``TOKEN_LEN``.
        vocab: Id-to-token list; defaults to the module-level ``code_vocab``.

    Returns:
        A list of ``({"img": image, "code": ids of shape (token_len, 1)}, label)``
        tuples, where ``code`` is a NumPy integer array and ``label`` an integer id.
    """
    token_len = TOKEN_LEN if token_len is None else token_len
    vocab = code_vocab if vocab is None else vocab
    pad_id = 0  # id used for right-padding, matching Masking(mask_value=0) in the model

    dataset = []
    for i in range(len(image_data)):
        # One conversion per sequence avoids a tensor op for every single sample.
        tokens = np.asarray(code_data[i])
        for t in range(1, len(tokens)):
            target = tokens[t]
            if target == pad_id:
                # Only padding follows; nothing meaningful is left to predict.
                break
            context = tokens[max(0, t - token_len):t]
            missing = token_len - len(context)
            if missing > 0:
                context = np.concatenate([context, np.zeros(missing, dtype=tokens.dtype)])
            dataset.append((
                {
                    "img": image_data[i],
                    "code": context[:, None],
                },
                target,
            ))
            if vocab[target] == '<END>':
                break
    return dataset

# Expand every (image, token sequence) pair into per-token training samples.
dataset = prepare_dataset(image_data, code_data)

In [None]:
# Total number of training samples produced by prepare_dataset.
print(len(dataset))

### Creating the Model

In [None]:
# Vision model: three Conv2D + MaxPool stages, flattened into two 1024-unit dense layers.
# NOTE(review): the Conv2D layers are created without an activation argument.
image_input = layers.Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3), name='img')
x = layers.Conv2D(32, 3, strides=1)(image_input)
x = layers.MaxPool2D(2)(x)
x = layers.Conv2D(64, 3, strides=1)(x)
x = layers.MaxPool2D(2)(x)
x = layers.Conv2D(128, 3, strides=1)(x)
x = layers.MaxPool2D(2)(x)
x = layers.Flatten()(x)
x = layers.Dense(1024, activation='relu')(x)
p = layers.Dense(1024, activation='relu')(x)

# Language model: two stacked LSTMs over the context tokens.
# Token ids enter as one scalar feature per time step (no Embedding layer);
# Masking skips the zero-padded positions.
code_input = layers.Input(shape=(TOKEN_LEN, 1))
x = layers.Masking(mask_value=0)(code_input)
x = layers.LSTM(128, return_sequences=True)(x)
qt = layers.LSTM(128)(x)

# Decoder: image and language features are joined and passed, as a sequence
# of length one, through a final LSTM.
rt = layers.Concatenate()([p, qt])
# A Reshape layer adds the time axis. Calling tf.expand_dims directly on a
# symbolic Keras tensor is rejected by Keras 3; a Keras layer is accepted
# by the functional API regardless of version.
rt = layers.Reshape((1, rt.shape[-1]))(rt)
x = layers.LSTM(512)(rt)
# The output width is VOCAB_SIZE+2, matching the one-hot depth used by the training loop.
xt = layers.Dense(VOCAB_SIZE+2, activation='softmax')(x)

# Model
pix2code = keras.Model(inputs=[image_input, code_input], outputs=xt)


In [None]:
# Print the layer-by-layer summary of the assembled model.
pix2code.summary()

### Custom Training loop

In [None]:
# Optimizer used by the custom training loop to apply the gradients.
optimizer = keras.optimizers.RMSprop(learning_rate=0.0001)

In [None]:
import random

def training(epochs=10, batch=32):
    """Run the custom training loop over the module-level `dataset`.

    Each epoch shuffles `dataset` in place, walks through it in full batches
    (a trailing partial batch is skipped), performs one gradient step per
    batch on `pix2code` using `optimizer`, prints the batch loss, and saves
    the model to 'pix2code.keras' when the epoch is finished.

    Args:
        epochs: Number of passes over the dataset.
        batch: Number of samples per gradient step.
    """
    for epoch in range(epochs):
        random.shuffle(dataset)
        print(f"Epoch {epoch+1}/{epochs}")

        for step in range(len(dataset)//batch):
            samples = dataset[step*batch:(step+1)*batch]

            # Stack the per-sample inputs and one-hot labels into float32 batches.
            img = np.array([features['img'] for features, _ in samples], dtype=np.float32)
            code = np.array([features['code'] for features, _ in samples], dtype=np.float32)
            outputs = np.array(
                [tf.one_hot(label, VOCAB_SIZE+2) for _, label in samples],
                dtype=np.float32,
            )

            with tf.GradientTape() as tape:
                predictions = pix2code([img, code], training=True)
                loss = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(outputs, predictions)
                )
            print(f"\t\tBatch: {step+1}  ||  Loss: {loss}")

            weights = pix2code.trainable_variables
            gradients = tape.gradient(loss, weights)
            optimizer.apply_gradients(zip(gradients, weights))

        # Checkpoint after every epoch.
        pix2code.save('pix2code.keras')

training(epochs=5, batch=256)

In [None]:
# Save the model to disk; training() also writes this same file after every epoch.
pix2code.save('pix2code.keras')

In [None]:
# Lookup tables between token ids and token strings, derived from code_vocab.
index_to_code = dict(enumerate(code_vocab))
code_to_index = {token: idx for idx, token in enumerate(code_vocab)}


def generate_code(img, max_length=80):
    """Greedily decode markup tokens for one image with `pix2code`.

    Decoding starts from '<START>' and repeatedly feeds the model the most
    recent TOKEN_LEN token ids (zero-padded on the right, shaped
    (1, TOKEN_LEN, 1)), appending the most probable next token until
    '<END>' is produced or `max_length` tokens have been generated.

    The generated ids are tracked directly instead of re-vectorizing the
    growing string: slicing the last TOKEN_LEN entries of the vectorizer
    output returned mostly padding, because that output is always padded
    to SEQUENCE_LENGTH.

    Args:
        img: A single image array shaped like the model's image input.
        max_length: Maximum number of tokens to generate after '<START>'.

    Returns:
        The generated token string (also printed), starting with '<START>'.
    """
    img_batch = np.array([img], dtype=np.float32)
    end_id = code_to_index['<END>']
    token_ids = [code_to_index['<START>']]

    for _ in range(max_length):
        window = token_ids[-TOKEN_LEN:]
        context = np.zeros((1, TOKEN_LEN, 1), dtype=np.float32)
        context[0, :len(window), 0] = window

        probabilities = np.asarray(pix2code([img_batch, context], training=False))[0]
        # The output layer is wider than the vocabulary; only ids that map to a
        # token are considered so the lookup below cannot fail.
        next_id = int(np.argmax(probabilities[:len(index_to_code)]))
        token_ids.append(next_id)
        if next_id == end_id:
            break

    final_code = ' '.join(index_to_code[token_id] for token_id in token_ids)
    print(final_code)
    return final_code


# Compare the generated markup with the ground-truth file for one sample.
idx = 2
generate_code(image_data[idx])
with open(os.path.join(dataset_path, codes_paths[idx])) as f:
    print(f.read())
plt.imshow(image_data[idx])

In [None]:
import pickle

# Persist both vocabulary lookup tables next to the notebook.
# The vectorizer and the dataset are not pickled here.
for _filename, _table in (
    ('index_to_code.pickle', index_to_code),
    ('code_to_index.pickle', code_to_index),
):
    with open(_filename, 'wb') as f:
        pickle.dump(_table, f)

In [None]:
# Display the vocabulary list as the cell output.
code_vocab

In [None]:
# Total epochs trained so far: 10 + 3 + 5 + 3 + 5 + 5 = 31