In [2]:
'''
    Load the TGIF dataset from its tab-separated file.

    Column 0 of each row is stored as 'url' and column 1 as 'sentence';
    rows are kept in `data`, keyed by their zero-based row number.
'''
import csv

data = {}

# newline="" hands line-ending handling to the csv module (needed for quoted
# fields that contain newlines); the explicit encoding keeps the result
# independent of the platform's default locale.
with open("./data/tgif-v1.0.tsv", newline="", encoding="utf-8") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for idx, row in enumerate(rd):
        data[idx] = {'url': row[0], 'sentence': row[1]}


In [11]:
# Peek at the first record to check that the 'url' and 'sentence' fields loaded.
data[0]

{'url': 'https://38.media.tumblr.com/9f6c25cc350f12aa74a7dc386a5c4985/tumblr_mevmyaKtDf1rgvhr8o1_500.gif',
 'sentence': 'a man is glaring, and someone with sunglasses appears.'}

In [13]:
'''
    Removes stop words from sentences
'''

# install nltk if not present
try:
    import nltk
    import ssl

    nltk.download('stopwords')
except ImportError as e:
    !pip3 install nltk
    import nltk
    nltk.download('stopwords')

from nltk.corpus import stopwords

print(set(stopwords.words('english')))
english_stop_words = stopwords.words('english')

for key, value in data.items():
    value['sentence'] = ' '.join(word for word in value['sentence'].split() if word not in english_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'both', 'once', "aren't", 'from', 'she', 'off', "that'll", 'between', 'doesn', 'very', 'didn', 'out', 'the', 'had', 're', 'nor', 'they', 'shan', 'doing', "wouldn't", 'such', 'other', 'we', 'there', 'why', 'these', "don't", 's', "hadn't", 'it', 'but', 'all', 'will', 'ain', 'hasn', 'now', 'so', 'too', 'its', 'some', 'a', "wasn't", "weren't", 'did', 'wasn', 'itself', 'who', 'am', "you'd", 'and', 'same', 'this', 'further', 'because', 'up', 'was', 'through', 've', 'any', 'does', 'herself', 'ours', 'until', 'few', 'while', 'when', 'only', 'yourselves', "didn't", 'd', 'into', 'an', 'just', 'isn', 'yourself', 'being', 'than', "should've", 'which', 'is', 'before', 'most', 'here', 'can', 'couldn', 'whom', 'were', 'again', 'll', 'over', 'themselves', 'hadn', 'by', 'on', 'then', 'have', 'm', 'our', 'my', "shouldn't", 'o', 'against', 'ma', 'won', 'what', 'with', 'shouldn', 'after'

In [15]:
'''
    Strip punctuation from every sentence, keeping only word characters and whitespace
'''
import re

# compile once; matches any single character that is neither \w nor \s
non_word_or_space = re.compile(r'[^\w\s]')

for record in data.values():
    record['sentence'] = non_word_or_space.sub('', record['sentence'])


In [21]:
'''
  Build an inverted index: each token maps to the set of document ids whose sentence contains it
'''
from nltk.tokenize import word_tokenize
nltk.download('punkt')

index = {}
for doc_id, record in data.items():
    for term in word_tokenize(record['sentence']):
        # setdefault creates the empty id set the first time a term is seen
        index.setdefault(term, set()).add(doc_id)

print("Total keys in index: " + str(len(index)))
print("Top 50: ")
# first 50 entries in dict insertion order (not ranked by frequency)
print(list(index.items())[:50])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Total keys in index: 12148
Top 50: 
[('man', {0, 2, 4, 6, 7, 9, 11, 12, 16, 17, 20, 23, 30, 31, 32, 36, 39, 40, 46, 49, 50, 53, 57, 62, 63, 70, 71, 72, 78, 79, 80, 81, 84, 85, 94, 99, 100, 103, 104, 105, 107, 111, 114, 116, 120, 122, 127, 128, 140, 144, 145, 150, 154, 156, 157, 159, 162, 163, 164, 169, 175, 182, 186, 187, 188, 191, 195, 198, 199, 200, 201, 206, 207, 216, 222, 226, 227, 229, 231, 233, 241, 242, 243, 246, 249, 250, 251, 254, 255, 256, 257, 261, 267, 274, 275, 283, 288, 289, 291, 293, 296, 298, 299, 303, 306, 310, 311, 312, 316, 317, 321, 323, 338, 342, 344, 346, 348, 351, 352, 353, 355, 360, 362, 364, 367, 371, 375, 376, 378, 383, 384, 387, 389, 390, 392, 393, 394, 397, 407, 408, 412, 414, 416, 420, 422, 430, 431, 432, 437, 447, 448, 450, 452, 453, 454, 459, 461, 466, 467, 468, 469, 472, 473, 478, 479, 482, 489, 492, 497, 498, 500, 503, 507, 508, 513, 514, 517, 

In [14]:
'''
  Render one GIF from the dataset inline in the notebook
'''
from IPython.display import IFrame

# URL of the GIF to embed; point this at a different record as needed
iframe_url = data[0]['url']

# embed it in a fixed 600x550 frame
IFrame(iframe_url, 600, 550)

'\n  Code fragment for displaying GIF\n'