In [20]:
import tensorflow as tf 
import tensorflow_hub as hub 
from tensorflow.keras import layers 
import bert 
import re 
# re — Regular expression operations
import math
import csv
import pandas as pd                     
import cv2 as cv 
from PIL import Image
from tensorflow.keras import losses
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
data = pd.read_csv("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/tgif-v1.0.tsv", sep='\t')
data.isnull().values.any()
data.shape

(125782, 2)

In [3]:
# data.head()

In [4]:
gif_links = list(data.y.values)
raw_tweets = list(data.x.values)

In [5]:
# gif_links

In [6]:
# raw_tweets

# Tweet Pre-Process 

## Remove Special Char

In [7]:
# definition for function for removing html tags 
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
# definition for function for remove any punctuations and special characters
def preprocess_text(raw_tweaet):
    # Removing html tags
    tweet = remove_tags(raw_tweaet)
    # Removing html tags
    tweet = re.sub('[^a-zA-Z]', '', tweet)
    # Removing html tags
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)
    # Removing multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

In [9]:
# run the preprocess_text function to clean tweets list 
tweets = [] 
for tweet in raw_tweets[:10]:
    tweets.append(preprocess_text(tweet))

## Tokenizing 

In [10]:
# Create a tokenizer 
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# .numpy(): converts a tensor object into an numpy.ndarray
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
# Definition for function for convert tweet to ids 
def tokenize_tweets(text_tweets):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_tweets))

In [12]:
# run the tokenize_tweets on tweets 
tokenized_tweets = [tokenize_tweets(tweet) for tweet in tweets]

# GIF Pre-Process

In [13]:
import requests
# Requests is an elegant and simple HTTP library for Python
import os 
# os — Miscellaneous operating system interfaces¶
os.chdir('/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/gifs')
# !pwd

In [14]:
def gif_downloader(image_urls, status=[], filenames = []):
    
    for index, img in enumerate(image_urls):
        # We can split the file based upon / and extract the last split within the python list below:
        file_name = img.split('/')[-1]
        #print("fThis is the file name: {file_name}")
        filenames.append(file_name) 
        # Now let's send a request to the image URL:
        r = requests.get(img, stream=True)
        # We can check that the status code is 200 before doing anything else:
        if r.status_code == 200:
            # This command below will allow us to write the data to a file as binary:
            with open(file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            status.append(True)
        else:
            # We will write all of the images back to the broken_images list:
            status.append(False)
    return filenames, status

In [15]:
# def load_gif(image_path, sess):
#     image = tf.io.read_file(image_file)
#     image = tf.io.decode_gif(image)
#     return sess.run(image)

In [16]:
download_status = []
filenames = []
filenames, downlod_status = gif_downloader(gif_links[:10], status=download_status)
# if any gif was not downloaded successfully 
print(any(status == False for status in download_status))

False


In [17]:
# filenames

In [36]:
def get_avg_contrast(path_to_gif):
    gif_contrast = []
    capture = cv.VideoCapture(path_to_gif)
    pos_frame = capture.get(cv2.CAP_PROP_POS_FRAMES)

    while True: 
        isTrue, frame = capture.read()
        img_grey = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        gif_contrast.append(img_grey.std())
        if capture.get(cv2.CAP_PROP_POS_FRAMES) == capture.get(cv2.CAP_PROP_FRAME_COUNT):
            # If the number of captured frames is equal to the total number of frames,
            # we stop
            break
    return sum(gif_contrast) / len(gif_contrast)

In [37]:
# In GIF files, each frame has its own duration. So there is no general fps for a GIF file. 
contrast = []
for gif_file_path in filenames[:10]: 
#     cap=cv2.VideoCapture("gif_file_path")
#     fps = cap.get(cv2.CAP_PROP_FPS)
#     gif_obj = Image.open(gif_file_path)
    contrast.append(get_avg_contrast(gif_file_path))

In [38]:
contrast

[68.42556184690109,
 58.05193036668147,
 63.387818026825364,
 50.30206960661047,
 62.45907835278932,
 46.81373933565822,
 81.60186270083663,
 63.94421476526131,
 50.60991991370668,
 66.09166546706905]

# Prerparing Data For Training


In [39]:
x_train = tokenized_tweets
y_train = contrast

In [40]:
# len(x_train)

In [41]:
tweets_with_len = [[tweet, y_train[i], len(tweet)]
                 for i, tweet in enumerate(x_train)]

In [42]:
#sort the data by tweet length 
tweets_with_len.sort(key=lambda x: x[2])
#remove the tweet length attribute from dataset 
sorted_tweets_labels = [(tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len]

In [43]:
# sorted_tweets_labels

In [44]:
# convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.float32))


In [45]:
BATCH_SIZE = 10 
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [46]:
TOTAL_BATCHES = math.ceil(len(sorted_tweets_labels) / BATCH_SIZE)
# TEST_BATCHES = TOTAL_BATCHES // 10
TEST_BATCHES = 1
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [47]:
# next(iter(batched_dataset))

# Creating the Model


In [48]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200

In [49]:
#GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension.
#allows the model to handle input of variable length, in the simplest way possible

model = tf.keras.Sequential([
    layers.Embedding(VOCAB_LENGTH + 1, EMB_DIM),
    layers.Dropout(0.2),    
    layers.GlobalAveragePooling1D(), 
    layers.Dropout(0.2),
    layers.Dense(1, activation='linear') #set activation to linear for regression
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         6104600   
_________________________________________________________________
dropout (Dropout)            (None, None, 200)         0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 6,104,801
Trainable params: 6,104,801
Non-trainable params: 0
_________________________________________________________________


In [50]:
epochs = 100
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=epochs)

Epoch 1/100


ValueError: Expect x to be a non-empty array or dataset.

In [None]:
print(history.history.keys())

In [None]:
fig = go.Figure()
fig.add_trace(go.Scattergl(y=history.history['loss'],
                    name='Train'))

fig.add_trace(go.Scattergl(y=history.history['val_loss'],
                     name='Valid'))

fig.update_layout(height=500, width=700,
                  xaxis_title='Epoch',
                  yaxis_title='Loss')
fig.show()

In [None]:
results = model.evaluate(test_data)

print(results)