In [1]:
import tensorflow as tf 
import tensorflow_hub as hub 
from tensorflow.keras import layers 
import bert 
import re 
# re — Regular expression operations
import math
import csv
import pandas as pd                     
import cv2
from PIL import Image


In [2]:
data = pd.read_csv("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/tgif-v1.0.tsv", sep='\t')
data.isnull().values.any()
data.shape

(125782, 2)

In [3]:
# data.head()

In [4]:
gif_links = list(data.y.values)
raw_tweets = list(data.x.values)

In [5]:
# gif_links

In [6]:
# raw_tweets

# Tweet Pre-Process 

## Remove Special Char

In [7]:
# definition for function for removing html tags 
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
# definition for function for remove any punctuations and special characters
def preprocess_text(raw_tweaet):
    # Removing html tags
    tweet = remove_tags(raw_tweaet)
    # Removing html tags
    tweet = re.sub('[^a-zA-Z]', '', tweet)
    # Removing html tags
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)
    # Removing multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

In [9]:
# run the preprocess_text function to clean tweets list 
tweets = [] 
for tweet in raw_tweets[:50]:
    tweets.append(preprocess_text(tweet))

## Tokenizing 

In [10]:
# Create a tokenizer 
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# .numpy(): converts a tensor object into an numpy.ndarray
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
# Definition for function for convert tweet to ids 
def tokenize_tweets(text_tweets):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_tweets))

In [12]:
# run the tokenize_tweets on tweets 
tokenized_tweets = [tokenize_tweets(tweet) for tweet in tweets]

# GIF Pre-Process

In [13]:
import requests
# Requests is an elegant and simple HTTP library for Python
import os 
# os — Miscellaneous operating system interfaces¶
os.chdir('/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/gifs')
# !pwd

In [14]:
def gif_downloader(image_urls, status=[], filenames = []):
    
    for index, img in enumerate(image_urls):
        # We can split the file based upon / and extract the last split within the python list below:
        file_name = img.split('/')[-1]
        #print("fThis is the file name: {file_name}")
        filenames.append(file_name) 
        # Now let's send a request to the image URL:
        r = requests.get(img, stream=True)
        # We can check that the status code is 200 before doing anything else:
        if r.status_code == 200:
            # This command below will allow us to write the data to a file as binary:
            with open(file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            status.append(True)
        else:
            # We will write all of the images back to the broken_images list:
            status.append(False)
    return filenames, status

In [15]:
# def load_gif(image_path, sess):
#     image = tf.io.read_file(image_file)
#     image = tf.io.decode_gif(image)
#     return sess.run(image)

In [16]:
download_status = []
filenames = []
filenames, download_status = gif_downloader(gif_links[:50], status=download_status)
# if any gif was not downloaded successfully 
print(any(status == False for status in download_status))

False


In [17]:
# filenames

In [18]:
def get_avg_fps(PIL_Image_object):
    """ Returns the average framerate of a PIL Image object """
    PIL_Image_object.seek(0)
    frames = duration = 0
    while True:
        try:
            frames += 1
            duration += PIL_Image_object.info['duration']
            PIL_Image_object.seek(PIL_Image_object.tell() + 1)
        except EOFError:
            return frames / duration * 1000
    return None

In [19]:
# In GIF files, each frame has its own duration. So there is no general fps for a GIF file. 
framerates = []
for gif_file_path in filenames: 
#     cap=cv2.VideoCapture("gif_file_path")
#     fps = cap.get(cv2.CAP_PROP_FPS)
    gif_obj = Image.open(gif_file_path)
    framerates.append(get_avg_fps(gif_obj))

In [20]:
# framerates

# Prerparing Data For Training


In [21]:
x_train = tokenized_tweets
y_train = framerates

In [26]:
# len(x_train)

In [25]:
tweets_with_len = [[tweet, y_train[i], len(tweet)]
                 for i, tweet in enumerate(x_train)]

In [27]:
#sort the data by tweet length 
tweets_with_len.sort(key=lambda x: x[2])
#remove the tweet length attribute from dataset 
sorted_tweets_labels = [(tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len]

In [29]:
# sorted_tweets_labels

In [33]:
# convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.float32))


In [34]:
BATCH_SIZE = 10 
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [39]:
# next(iter(batched_dataset))


In [45]:
TOTAL_BATCHES = math.ceil(len(sorted_tweets_labels) / BATCH_SIZE)
TEST_BATCHES = 1 
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

# Creating the Model


In [59]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.last_dense = layers.Dense(units = 1)
#         self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
#                                         kernel_size=2,
#                                         padding="valid",
#                                         activation="relu")
#         self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
#                                         kernel_size=3,
#                                         padding="valid",
#                                         activation="relu")
#         self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
#                                         kernel_size=4,
#                                         padding="valid",
#                                         activation="relu")
#         self.pool = layers.GlobalMaxPool1D()
        
#         self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
#         self.dropout = layers.Dropout(rate=dropout_rate)
#         if model_output_classes == 2:
#             self.last_dense = layers.Dense(units=1,
#                                            activation="sigmoid")
#         else:
#             self.last_dense = layers.Dense(units=model_output_classes,
#                                            activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
#         l_1 = self.cnn_layer1(l) 
#         l_1 = self.pool(l_1) 
#         l_2 = self.cnn_layer2(l) 
#         l_2 = self.pool(l_2)
#         l_3 = self.cnn_layer3(l)
#         l_3 = self.pool(l_3) 
        
#         concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
#         concatenated = self.dense_1(concatenated)
#         concatenated = self.dropout(concatenated, training)
#         model_output = self.last_dense(concatenated)
        model_output = self.last_dense(l)
        
        return model_output

In [60]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 20

In [61]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)


In [62]:
# if OUTPUT_CLASSES == 2:
#     text_model.compile(loss="binary_crossentropy",
#                        optimizer="adam",
#                        metrics=["accuracy"])
# else:
#     text_model.compile(loss="sparse_categorical_crossentropy",
#                        optimizer="adam",
#                        metrics=["sparse_categorical_accuracy"])
text_model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1), loss='mean_absolute_error',metrics=["accuracy"])

In [63]:
text_model.fit(train_data, epochs=NB_EPOCHS)


Epoch 1/20


NameError: in user code:

    /Users/zimingfang/.pyenv/versions/3.6.4/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    <ipython-input-54-09fd7cf9e9a0>:52 call  *
        model_output = self.last_dense(concatenated)

    NameError: name 'concatenated' is not defined


In [52]:
results = text_model.evaluate(test_data)
print(results)

[-52.69684982299805, 0.0]
