### SUMMARY
those rare vocab has the following properties:
1. person name
2. start with #


In [20]:
import os
import json
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymagnitude import *
import spacy

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
DATA_DIR = './meta_wstyle/data_mid_clean'
FILES = [('TEST_CAPTIONS_flickr8k_1_cap_per_img_5_min_word_freq.json', 'TEST_ID_flickr8k_1_cap_per_img_5_min_word_freq.json'),
         ('VAL_CAPTIONS_flickr8k_1_cap_per_img_5_min_word_freq.json', 'VAL_ID_flickr8k_1_cap_per_img_5_min_word_freq.json'),
         ('TRAIN_CAPTIONS_flickr8k_1_cap_per_img_5_min_word_freq.json', 'TRAIN_ID_flickr8k_1_cap_per_img_5_min_word_freq.json')]
WORD_MAP = 'WORDMAP_flickr8k_1_cap_per_img_5_min_word_freq.json'
ORIG_INPUT = './ig_json/mid_clean.json'

In [3]:
def read_json(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data

In [4]:
word_map = read_json(f'{DATA_DIR}/{WORD_MAP}')
rev_word_map = {v: k for k, v in word_map.items()}
orig_data = read_json(ORIG_INPUT)

In [57]:
len(word_map)

8552

In [5]:
orig_data_map = dict()

for id_dict in orig_data['images']:
    _id = id_dict['filename']
    before_tokens = id_dict['sentences'][0]['tokens']
    orig_data_map[_id] = before_tokens
    
print(len(orig_data_map))

122200


### 1. Combine Tokens before and after Process

In [6]:
all_data = dict()

remove_tokens = ['<start>', '<end>', '<pad>']
for captions_fn, id_fn in FILES:
    captions_path = f'{DATA_DIR}/{captions_fn}'
    id_path = f'{DATA_DIR}/{id_fn}'
    captions_data = read_json(captions_path)
    id_data = read_json(id_path)
    
    for seq, _id in zip(captions_data, id_data):
        after_tokens = [rev_word_map[s] for s in seq]
        after_tokens = [i for i in after_tokens if i not in remove_tokens]
        all_data[_id] = {'after_tokens': after_tokens}

In [7]:
for _id, id_dict in all_data.items():
    before_tokens = orig_data_map[_id]
    all_data[_id]['before_tokens'] = before_tokens

In [8]:
all_data['599527709_@_975428366157634456_599527709']

{'after_tokens': ['relaxing', 'in', 'the', 'sun', 'at', "grandma's"],
 'before_tokens': ['relaxing', 'in', 'the', 'sun', 'at', "grandma's"]}

In [9]:
len(all_data)

122200

### 2. Find out no. of Captions with UNK

In [10]:
all_data['187621810_@_958533411454864566_187621810']

{'after_tokens': ['<unk>', 'hits', 'the', '<unk>', 'show'],
 'before_tokens': ['eli', 'hits', 'the', 'gadget', 'show']}

In [13]:
unk_counter = Counter()
unk_captions = 0

for _id, id_dict in all_data.items():
    before_tokens = id_dict['before_tokens']
    after_tokens = id_dict['after_tokens']
    unk_words = [b_token for b_token, a_token in zip(before_tokens, after_tokens) if a_token == '<unk>']
    if len(unk_words) != 0:
        unk_captions += 1
        unk_counter.update(unk_words)
    
print(f'captions with unk: {unk_captions} / {len(all_data)} ({unk_captions / len(all_data) * 100})')

captions with unk: 33527 / 122200 (27.43617021276596)


In [15]:
len(unk_counter)

20950

In [23]:
unk_counter

Counter({'stables': 2,
         '206': 4,
         '#wishes': 1,
         'destroys': 4,
         'grail': 1,
         '#fatkidproblems': 1,
         'gouda': 2,
         'juliet': 5,
         "andy's": 1,
         'cages': 5,
         'eli': 2,
         'gadget': 2,
         'simplest': 5,
         'bleh': 2,
         '#loveofmylife': 3,
         'tulle': 4,
         'uniforms': 3,
         'possession': 4,
         '#numovement': 1,
         'attraction': 3,
         '#happyholidays': 2,
         'managers': 2,
         'woes': 4,
         'day-': 1,
         'artistry': 5,
         'passionate': 4,
         'cobblestone': 3,
         'entryway': 2,
         '#zucchini': 2,
         'abbey': 3,
         'former': 5,
         'promotion': 5,
         'chappy': 2,
         'scratches': 5,
         'dukes': 4,
         'backlog': 3,
         'glutes': 2,
         'loaves': 4,
         'nikes': 2,
         '-tap': 2,
         'details-': 2,
         '#grimmhouse': 1,
         '204': 3,
 

In [22]:
for i, (rare_word, freq) in enumerate(unk_counter.items()):
    if i == 5: break
    print(f'{rare_word} ({freq})')

stables (2)
206 (4)
#wishes (1)
destroys (4)
grail (1)


In [17]:
cnt_counter = Counter()
for _, cnt in unk_counter.items():
    cnt_counter.update([cnt])

In [18]:
cnt_counter

Counter({2: 5423, 4: 1733, 1: 9850, 5: 1193, 3: 2751})

### 3. A Deeper Look into Those Rare Word

### 3a. How many of them are person names?

In [26]:
'#alex'.replace('#', '')

'alex'

In [24]:
nlp = spacy.load('en_core_web_sm')

In [None]:
entities = dict()

for rare_word, freq in unk_counter.items():
    if rare_word.startswith('#'):
        rare_word = rare_word.replace('#', '')
    spacy_word = nlp(rare_word)
    
    ents = spacy_word.ents
    if len(ents) == 1 and ents[0]

### 4. Map to Closest Closed Vocab

In [32]:
vector = Magnitude('/home/alex/.magnitude/glove.twitter.27B.200d.magnitude')
vocab_ls = list(word_map.keys())
#vector.most_similar_to_given(word, word_ls)

In [47]:
def find_closest_word(word, vocab_ls, vector):
    sim = vector.similarity(word, vocab_ls)
    arg_idxs = np.array(sim).argsort()[-1:][::-1]
    
    ans_word, score = vocab_ls[arg_idxs[0]], sim[arg_idxs[0]]
    if score < 0.55:
        return None
    else:
        return ans_word

In [50]:
solve_cnt = 0
for i, (rare_word, freq) in enumerate(unk_counter.items()):
    close_word = find_closest_word(rare_word, vocab_ls, vector)
    
    if close_word is not None:
        solve_cnt += freq

In [51]:
solve_cnt / sum(unk_counter.values())

0.5091287100320222

### 5. How many vocab start with #?

In [54]:
tgt_words = [(word, freq) for word, freq in word_map.items() if word.startswith('#')]

In [55]:
len(tgt_words)

610

In [56]:
sum([freq for word, freq in tgt_words])

3099529