In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert
import re
import math
from keras.models import Sequential
from keras.layers import Input



In [2]:
# !pip install tensorflow
# !pip install tensorflow_hub
# !pip install bert-for-tf2
# !pip install sentencepiece

# Data Import & Preprocssing

In [3]:
#import dataset x: tweets y:gif
df=pd.read_csv("tgif-v1.0.tsv", sep='\t') 
df.head()

Unnamed: 0,y,x
0,https://38.media.tumblr.com/9f6c25cc350f12aa74...,"a man is glaring, and someone with sunglasses ..."
1,https://38.media.tumblr.com/9ead028ef62004ef6a...,a cat tries to catch a mouse on a tablet
2,https://38.media.tumblr.com/9f43dc410be85b1159...,a man dressed in red is dancing.
3,https://38.media.tumblr.com/9f659499c8754e40cf...,an animal comes close to another in the jungle
4,https://38.media.tumblr.com/9ed1c99afa7d714118...,a man in a hat adjusts his tie and makes a wei...


In [4]:
df.isnull().values.any() #check if there's any null value
df.shape #check the shape


(125782, 2)

In [5]:
def text_preprocess(tweet):
    # Remove punctuations and numbers
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    
    # Single character removal
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)

    # Removing multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

In [6]:
x = []
for t in list(df['x']):
    x.append(text_preprocess(t))


# Tokenize Text to Vector W/ Bert

In [7]:
#create an object of the FullTokenizer class
BertTokenizer = bert.bert_tokenization.FullTokenizer

#create a BERT embedding layer by importing the BERT model from hub.KerasLayer 
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# create a BERT vocabulary file in the form a numpy array
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

#ref: https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/

In [8]:
tokenizer.tokenize(x[1])
#ids of the tokens
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x[1])) 

[1037, 4937, 5363, 2000, 4608, 8000, 2006, 13855]

In [9]:
#tokenize all x 
# @warning each x are of different length
x_tokenized = []
for t in x:
    x_tokenized.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(t)))


In [10]:
x

['a man is glaring and someone with sunglasses appears ',
 'a cat tries to catch mouse on tablet',
 'a man dressed in red is dancing ',
 'an animal comes close to another in the jungle',
 'a man in hat adjusts his tie and makes weird face ',
 'someone puts cat on wrapping paper then wraps it up and puts on bow',
 'a brunette woman is looking at the man',
 'a man on bicycle is jumping over fence ',
 'a group of men are standing and staring in the same direction ',
 'a man with black clothes is dancing sexy',
 'a boy is happy parking and see another boy',
 'a man winks and man with fork motions him to be quiet ',
 'a man walks into room and sees girl floating above the bed',
 'the vehicle is moving fast into the tunnel',
 'a beatles show paul is on the front singing very enthusiastically',
 'a sport car is swinging on the race playground',
 'a woman is laughing and holding man the man is not laughing',
 'a man with lights on his jacket watching large screen tv',
 'a gloved hand opens to 

In [11]:
x_tokenized

[[1037, 2158, 2003, 16124, 1998, 2619, 2007, 17072, 3544],
 [1037, 4937, 5363, 2000, 4608, 8000, 2006, 13855],
 [1037, 2158, 5102, 1999, 2417, 2003, 5613],
 [2019, 4111, 3310, 2485, 2000, 2178, 1999, 1996, 8894],
 [1037, 2158, 1999, 6045, 14171, 2015, 2010, 5495, 1998, 3084, 6881, 2227],
 [2619,
  8509,
  4937,
  2006,
  12252,
  3259,
  2059,
  19735,
  2009,
  2039,
  1998,
  8509,
  2006,
  6812],
 [1037, 27261, 2450, 2003, 2559, 2012, 1996, 2158],
 [1037, 2158, 2006, 10165, 2003, 8660, 2058, 8638],
 [1037, 2177, 1997, 2273, 2024, 3061, 1998, 4582, 1999, 1996, 2168, 3257],
 [1037, 2158, 2007, 2304, 4253, 2003, 5613, 7916],
 [1037, 2879, 2003, 3407, 5581, 1998, 2156, 2178, 2879],
 [1037,
  2158,
  16837,
  2015,
  1998,
  2158,
  2007,
  9292,
  15323,
  2032,
  2000,
  2022,
  4251],
 [1037, 2158, 7365, 2046, 2282, 1998, 5927, 2611, 8274, 2682, 1996, 2793],
 [1996, 4316, 2003, 3048, 3435, 2046, 1996, 5234],
 [1037, 11555, 2265, 2703, 2003, 2006, 1996, 2392, 4823, 2200, 24935],
 [103

# Gif Preprocessing

train a subset of data with batch size 50

In [12]:
import requests
import os
os.chdir('/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/gifs')
!pwd
# print current path 
# print(os.getcwd())

/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/gifs


In [13]:
y = []
for g in list(df['y']):
    y.append(g)

In [14]:
def gif_downloader(image_urls, status=[], filenames = []):
    
    for index, img in enumerate(image_urls):
        # We can split the file based upon / and extract the last split within the python list below:
        file_name = img.split('/')[-1]
        #print("fThis is the file name: {file_name}")
        filenames.append(file_name) 
        # Now let's send a request to the image URL:
        r = requests.get(img, stream=True)
        # We can check that the status code is 200 before doing anything else:
        if r.status_code == 200:
            # This command below will allow us to write the data to a file as binary:
            with open(file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            status.append(True)
        else:
            # We will write all of the images back to the broken_images list:
            status.append(False)
    return filenames, status

In [15]:
def load_gif(image_path, sess):
    #load gif using tf.io.readfile. make sure dir is correct if file not found
    image = tf.io.read_file(image_path)
    #use tf.decode to convert gif to a matrix
    image = tf.image.decode_gif(image)
    return sess.run(image)

In [16]:
q=[]
filenames=[]
filenames, q = gif_downloader(y[1:50], status=q)
q

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [17]:
#load a single gif and convert it to uint 8 tensor
y_raw = []
y_train = []
for file in filenames:
    with tf.compat.v1.Session() as sess:
        curr = load_gif(file, sess)
#         print(curr.shape)
        #y_train.append(tf.math.reduce_mean(tf.reshape(load_gif(file, sess), [-1]) , axis=None, keepdims=False, name=None)) #flatten the image to rank 1 for keras Input
        #print(tf.reshape(load_gif(file, sess), [-1]))
        #print(tf.reshape(tf.math.reduce_mean(tf.reshape(load_gif(file, sess), [-1]) , axis=None, keepdims=False, name=None))
#         tf.keras.layers.Flatten(curr)
#         print(curr.shape)
      
    tensor = tf.convert_to_tensor(
        curr, dtype='float32', dtype_hint=None, name=None
    )
    tensor
    y_raw.append(tensor)
    #take log to prevent overflow error
    print(tf.math.log(tf.math.reduce_mean(tf.image.total_variation(tensor))))
    y_train.append(tf.math.log(tf.math.reduce_mean(tf.image.total_variation(tensor))))
    #y_train.append(tf.math.reduce_mean(tf.image.total_variation(tensor)))

tf.Tensor(16.47508, shape=(), dtype=float32)
tf.Tensor(14.842497, shape=(), dtype=float32)
tf.Tensor(15.917022, shape=(), dtype=float32)
tf.Tensor(17.598543, shape=(), dtype=float32)
tf.Tensor(15.55175, shape=(), dtype=float32)
tf.Tensor(15.879038, shape=(), dtype=float32)
tf.Tensor(15.711447, shape=(), dtype=float32)
tf.Tensor(16.891788, shape=(), dtype=float32)
tf.Tensor(15.871132, shape=(), dtype=float32)
tf.Tensor(14.4411125, shape=(), dtype=float32)
tf.Tensor(14.918451, shape=(), dtype=float32)
tf.Tensor(14.231726, shape=(), dtype=float32)
tf.Tensor(13.980014, shape=(), dtype=float32)
tf.Tensor(15.479442, shape=(), dtype=float32)
tf.Tensor(14.531035, shape=(), dtype=float32)
tf.Tensor(15.694867, shape=(), dtype=float32)
tf.Tensor(17.308651, shape=(), dtype=float32)
tf.Tensor(16.117285, shape=(), dtype=float32)
tf.Tensor(15.567862, shape=(), dtype=float32)
tf.Tensor(13.883634, shape=(), dtype=float32)
tf.Tensor(16.39345, shape=(), dtype=float32)
tf.Tensor(14.626097, shape=(), dtype

In [18]:
with tf.compat.v1.Session() as sess:
    orginial = load_gif('tumblr_msij5q4Run1qd76t9o1_500.gif', sess)

    
tensor = tf.convert_to_tensor(
    orginial, dtype='float32', dtype_hint=None, name=None
)

print(orginial)#print original np array
print(tensor) #print converted tensor 
#dim is (frame,h,w,channel)
#print(tf.math.reduce_std(tensor, dtype=uint8))
#compute the pixel noise of every frame and take the noise average 
print(tf.math.log(tf.math.reduce_mean(tf.image.total_variation(tensor))))

[[[[249 251 242]
   [247 247 239]
   [249 251 242]
   ...
   [255 247 247]
   [251 253 243]
   [255 247 247]]

  [[252 253 243]
   [252 253 242]
   [252 253 242]
   ...
   [255 255 255]
   [255 255 247]
   [255 255 255]]

  [[252 253 242]
   [252 253 243]
   [252 253 242]
   ...
   [255 255 247]
   [255 255 247]
   [255 255 247]]

  ...

  [[206 198 198]
   [206 198 198]
   [206 198 198]
   ...
   [198 181 173]
   [189 181 173]
   [198 181 181]]

  [[206 198 198]
   [206 198 198]
   [214 198 198]
   ...
   [189 181 173]
   [189 181 173]
   [189 181 173]]

  [[222 214 214]
   [231 222 222]
   [222 214 214]
   ...
   [214 198 198]
   [206 198 198]
   [214 198 198]]]


 [[[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255

tf.Tensor(15.871132, shape=(), dtype=float32)


In [19]:
rank_3_tensor = tf.constant([
  [[0, 1, 2, 3, 4],
   [5, 6, 7, 8, 9]],
  [[10, 11, 12, 13, 14],
   [15, 16, 17, 18, 19]],
  [[20, 21, 22, 23, 24],
   [25, 26, 27, 28, 29]],])

#print(tf.reshape(rank_3_tensor, [-1]))
print(tf.math.reduce_sum(
    rank_3_tensor))

tf.Tensor(435, shape=(), dtype=int32)


In [20]:
x_train = x_tokenized[1:50] 
# x_train[2]
print(y_train[2])

tf.Tensor(15.917022, shape=(), dtype=float32)


# Training Data Final Preparation

In [21]:
#create a copy of x_train and y_train
#may not be actually working. Check python data mutability for insurancwe
x_train_cp = x_train
y_train_cp = y_train

In [22]:
# x_train
# tockenied tweet 

In [23]:
# y_train
# tensor gif 

In [24]:
#merge x and y and add x length 
x_train_with_len = [[x, y_train_cp[i], len(x_train_cp), 1]
                 for i, x in enumerate(x_train_cp)]
x_train_with_len

[[[1037, 4937, 5363, 2000, 4608, 8000, 2006, 13855],
  <tf.Tensor: shape=(), dtype=float32, numpy=16.47508>,
  49,
  1],
 [[1037, 2158, 5102, 1999, 2417, 2003, 5613],
  <tf.Tensor: shape=(), dtype=float32, numpy=14.842497>,
  49,
  1],
 [[2019, 4111, 3310, 2485, 2000, 2178, 1999, 1996, 8894],
  <tf.Tensor: shape=(), dtype=float32, numpy=15.917022>,
  49,
  1],
 [[1037, 2158, 1999, 6045, 14171, 2015, 2010, 5495, 1998, 3084, 6881, 2227],
  <tf.Tensor: shape=(), dtype=float32, numpy=17.598543>,
  49,
  1],
 [[2619,
   8509,
   4937,
   2006,
   12252,
   3259,
   2059,
   19735,
   2009,
   2039,
   1998,
   8509,
   2006,
   6812],
  <tf.Tensor: shape=(), dtype=float32, numpy=15.55175>,
  49,
  1],
 [[1037, 27261, 2450, 2003, 2559, 2012, 1996, 2158],
  <tf.Tensor: shape=(), dtype=float32, numpy=15.879038>,
  49,
  1],
 [[1037, 2158, 2006, 10165, 2003, 8660, 2058, 8638],
  <tf.Tensor: shape=(), dtype=float32, numpy=15.711447>,
  49,
  1],
 [[1037, 2177, 1997, 2273, 2024, 3061, 1998, 4582,

In [25]:
#sort dataset based on x len (the length of the tweet)
x_train_with_len.sort(key=lambda x: x[2])
sorted_data = [(data[0], data[1], data[3]) for data in x_train_with_len]

In [26]:
#convert them into tensorflow dataset
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_data, output_types=(tf.int32, tf.float32, tf.int32))

In [27]:
BATCH_SIZE = 10
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), (), ()))


In [28]:
# next(iter(batched_dataset))

In [29]:
TOTAL_BATCHES = math.ceil(len(sorted_data) / BATCH_SIZE)
# TEST_BATCHES = TOTAL_BATCHES // 10
TEST_BATCHES = 1
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)



In [30]:
batched_dataset

<PaddedBatchDataset shapes: ((None, None), (None,), (None,)), types: (tf.int32, tf.float32, tf.int32)>

In [31]:
# for element in test_data:
#     print(element)

In [32]:
train_data

<SkipDataset shapes: ((None, None), (None,), (None,)), types: (tf.int32, tf.float32, tf.int32)>

In [33]:
#normalizer = preprocessing.Normalization(input_shape=[1,])


# Multi Input 

In [34]:
model = Model(inputs=[model1.input, model2.input, model3.input], outputs=y)


NameError: name 'Model' is not defined

# Model

In [None]:
# https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/ 
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
#         self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
#                                          kernel_size=2,
#                                          padding="valid",
#                                          activation="relu")
#         self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
#                                          kernel_size=3,
#                                          padding="valid",
#                                          activation="relu")
#         self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
#                                          kernel_size=4,
#                                          padding="valid",
#                                          activation="relu")
#         self.pool = layers.GlobalMaxPool1D()

#         self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
#         self.dropout = layers.Dropout(rate=dropout_rate)
#         if model_output_classes == 2:
        self.last_dense = layers.Dense(units = 1)
#         self.last_dense = layers.Dense(units=1,
#                                             activation="sigmoid")
#         else:
#             self.last_dense = layers.Dense(units=model_output_classes,
#                                            activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
#         l_1 = self.cnn_layer1(l) 
#         l_1 = self.pool(l_1) 
#         l_2 = self.cnn_layer2(l) 
#         l_2 = self.pool(l_2)
#         l_3 = self.cnn_layer3(l)
#         l_3 = self.pool(l_3) 
        
#         concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
#         concatenated = self.dense_1(concatenated)
#         concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(l)
        
        return model_output

In [None]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 20
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)
text_model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1), loss='mean_absolute_error',metrics=["accuracy"])

In [None]:
history = text_model.fit(train_data, epochs=NB_EPOCHS)

In [None]:
import matplotlib.pyplot as plt
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.ylim([0, 10])
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    
plot_loss(history)

In [None]:
results = text_model.evaluate(test_data)
print(results)


In [None]:
#

In [None]:
!pip install plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
print(history.history.keys())

In [None]:
fig = go.Figure()
fig.add_trace(go.Scattergl(y=history.history['loss'],
                    name='Train'))

fig.add_trace(go.Scattergl(y=history.history['val_loss'],
                     name='Valid'))

fig.update_layout(height=500, width=700,
                  xaxis_title='Epoch',
                  yaxis_title='Loss')
fig.show()

In [None]:
results = model.evaluate(test_data)

print(results)


In [None]:
test_data

In [None]:
batched_dataset

In [None]:
# model.fit_generator(
#     generator=itr_train, validation_data=itr_valid, validation_steps=batch_size,
#     epochs=epochs, steps_per_epoch=num_batches, callbacks=cbs, verbose=1, workers=0)

