In [1]:
import tensorflow as tf 
import tensorflow_hub as hub 
from tensorflow.keras import layers 
import bert 
import re 
# re — Regular expression operations
import math
import csv
import pandas as pd                     
import cv2 as cv 
from PIL import Image
from tensorflow.keras import losses
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
data = pd.read_csv("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/tgif-v1.0.tsv", sep='\t')
data.isnull().values.any()
data.shape

(125782, 2)

In [3]:
# data.head()

In [4]:
gif_links = list(data.y.values)
raw_tweets = list(data.x.values)

In [5]:
# gif_links

In [6]:
# raw_tweets

# Tweet Pre-Process 

## Remove Special Char

In [7]:
# definition for function for removing html tags 
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [8]:
# definition for function for remove any punctuations and special characters
def preprocess_text(raw_tweaet):
    # Removing html tags
    tweet = remove_tags(raw_tweaet)
    # Removing html tags
    tweet = re.sub('[^a-zA-Z]', '', tweet)
    # Removing html tags
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)
    # Removing multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet

In [9]:
# run the preprocess_text function to clean tweets list 
tweets = [] 
for tweet in raw_tweets[:50]:
    tweets.append(preprocess_text(tweet))

## Tokenizing 

In [10]:
# Create a tokenizer 
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# .numpy(): converts a tensor object into an numpy.ndarray
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [11]:
# Definition for function for convert tweet to ids 
def tokenize_tweets(text_tweets):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_tweets))

In [12]:
# run the tokenize_tweets on tweets 
tokenized_tweets = [tokenize_tweets(tweet) for tweet in tweets]

# GIF Pre-Process

In [13]:
import requests
# Requests is an elegant and simple HTTP library for Python
import os 
# os — Miscellaneous operating system interfaces¶
os.chdir('/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/gifs')
# !pwd

In [14]:
filenames = pd.read_csv("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/filenames.csv", header=None)
filenames = filenames.iloc[1:]
filenames = filenames.iloc[:,1]

In [15]:
# filenames

In [16]:
# https://programmersought.com/article/1235305180/
def getImageVar(image):
    img2gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    imageVar = cv.Laplacian(img2gray, cv.CV_64F).var()
    return imageVar

In [17]:
def get_avg_sharpness(path_to_gif):
    gif_sharpness = []
    capture = cv.VideoCapture(path_to_gif)
    pos_frame = capture.get(cv.CAP_PROP_POS_FRAMES)

    while True: 
        isTrue, frame = capture.read()
        gif_sharpness.append(getImageVar(frame))
        if capture.get(cv.CAP_PROP_POS_FRAMES) == capture.get(cv.CAP_PROP_FRAME_COUNT):
            # If the number of captured frames is equal to the total number of frames,
            # we stop
            break
    return sum(gif_sharpness) / len(gif_sharpness)

In [37]:
# In GIF files, each frame has its own duration. So there is no general fps for a GIF file. 
sharpness = []
for gif_file_path in filenames[:50]: 
#     cap=cv2.VideoCapture("gif_file_path")
#     fps = cap.get(cv2.CAP_PROP_FPS)
#     gif_obj = Image.open(gif_file_path)
    sharpness.append(tf.math.log(get_avg_sharpness(gif_file_path)))

In [38]:
import numpy 
print(numpy.min(sharpness))
print(numpy.max(sharpness))

4.940601699139596
8.92384305453336


# Prerparing Data For Training


In [39]:
x_train = tokenized_tweets
y_train = sharpness

In [40]:
# len(x_train)

In [41]:
tweets_with_len = [[tweet, y_train[i], len(tweet)]
                 for i, tweet in enumerate(x_train)]

In [42]:
#sort the data by tweet length 
tweets_with_len.sort(key=lambda x: x[2])
#remove the tweet length attribute from dataset 
sorted_tweets_labels = [(tweet_lab[0], tweet_lab[1]) for tweet_lab in tweets_with_len]

In [43]:
# sorted_tweets_labels

In [44]:
# convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.float32))


In [45]:
BATCH_SIZE = 10 
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [46]:
TOTAL_BATCHES = math.ceil(len(sorted_tweets_labels) / BATCH_SIZE)
# TEST_BATCHES = TOTAL_BATCHES // 10
TEST_BATCHES = 1
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [47]:
# next(iter(batched_dataset))

# Creating the Model


In [48]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200

In [49]:
#GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension.
#allows the model to handle input of variable length, in the simplest way possible

model = tf.keras.Sequential([
    layers.Embedding(VOCAB_LENGTH + 1, EMB_DIM),
    layers.Dropout(0.2),    
    layers.GlobalAveragePooling1D(), 
    layers.Dropout(0.2),
    layers.Dense(1, activation='linear') #set activation to linear for regression
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 200)         6104600   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 200)         0         
_________________________________________________________________
global_average_pooling1d_1 ( (None, 200)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 6,104,801
Trainable params: 6,104,801
Non-trainable params: 0
_________________________________________________________________


In [50]:
epochs = 100
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [51]:
print(history.history.keys())

dict_keys(['loss', 'mae', 'val_loss', 'val_mae'])


In [52]:
fig = go.Figure()
fig.add_trace(go.Scattergl(y=history.history['loss'],
                    name='Train'))

fig.add_trace(go.Scattergl(y=history.history['val_loss'],
                     name='Valid'))

fig.update_layout(height=500, width=700,
                  xaxis_title='Epoch',
                  yaxis_title='Loss')
fig.show()

In [53]:
results = model.evaluate(test_data)

print(results)

[10.330839157104492, 2.746983528137207]


In [54]:
# Save the model 

import joblib
model.save("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/GIF models/avg_sharpness_prediction_model.h5")