# Simple Template-Based Approach

In [1]:
import locale
# Switch every locale category to the user's default locale settings;
# the call returns the name of the locale that was set.
locale.setlocale(locale.LC_ALL, '')

'en_US.UTF-8'

In [2]:
import random
import pandas as pd
import numpy as np
import csv
import spacy
sp = spacy.load('en_core_web_sm')
import nltk
from nltk.tokenize import word_tokenize
import re
nltk.download('tagsets')

from nltk.data import load
from nltk.corpus import conll2000
from nltk.chunk import regexp
import pickle
import math

[nltk_data] Downloading package tagsets to /Users/renny/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


# Get similar tags

In [None]:
def count_log_likelihood(a,b,c,d):
    '''
    Log-likelihood score of a term for a given tag, computed as in the UCREL
    log-likelihood wizard (http://ucrel.lancs.ac.uk/llwizard.html).

    :param a = number of reviews retrieved for tag that contain term
    :param b = number of documents that contain term
    :param c = number of reviews retrieved for tag
    :param d = number of documents in the index
    :return 2 * (a*ln(a/E1) + b*ln(b/E2)) with the expected counts
            E1 = c*(a+b)/(c+d) and E2 = d*(a+b)/(c+d).
            An observed count of 0 contributes 0 to the sum.
    :raises ZeroDivisionError if c + d == 0, or if an observed count is
            positive while its expected count is 0
    '''
    observed_total = a + b
    corpus_total = c + d
    E1 = c * observed_total / corpus_total
    E2 = d * observed_total / corpus_total

    def _contribution(observed, expected):
        # x*ln(x/E) tends to 0 as x -> 0, so a zero count adds nothing;
        # evaluating math.log(0) directly would raise ValueError.
        if not observed:
            return 0.0
        return observed * math.log(observed / expected)

    ll = 2 * (_contribution(a, E1) + _contribution(b, E2))

    return ll

def dump_pickle(filename,obj):
    '''
    Serialise an object to disk with pickle.
    :param filename = path of the file to create or overwrite (opened in binary write mode)
    :param obj = object handed to pickle.dump
    '''
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)



In [None]:
def get_sorted_candidate(dict_candidates,d):
    '''
    Score every candidate with count_log_likelihood and rank the results.
    :param dict_candidates = mapping candidate -> dict holding the counts under the keys 'a', 'b' and 'c'
    :param d = forwarded unchanged as the `d` argument of count_log_likelihood
    :return dict candidate -> score, ordered from the highest score to the lowest
            (candidates with equal scores keep their input order)
    '''
    scored = [
        (candidate, count_log_likelihood(counts['a'], counts['b'], counts['c'], d))
        for candidate, counts in dict_candidates.items()
    ]
    # list.sort is stable, also with reverse=True, so ties stay in insertion order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return dict(scored)

In [43]:
# Load only the train split of tags, used for generating synthetic training data.
# To generate synonyms for all tags instead, load the full tag list here.
# The pickle is read directly with the standard library so this cell does not
# depend on a helper that is not defined in this notebook.
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
with open('data/dataset/splitted_tags/train_tags.pkl', 'rb') as f:
    train_list_of_tags = pickle.load(f)

In [87]:
# From the list of unique (tag, movieId) pairs, keep only the pairs whose tag is
# in the train split and index them in both directions:
#   dict_tag_movieId : tag     -> list of movieIds carrying that tag
#   dict_movieId_tag : movieId -> list of train tags attached to that movie
# NOTE(review): `movieId_tag` is not defined in the cells above; presumably a table of
# unique (tag, movieId) pairs with 'tag' and 'movieId' columns -- confirm where it is loaded.

# Constant-time membership test instead of scanning the tag list for every pair.
train_tag_set = set(train_list_of_tags)

dict_tag_movieId = {}
dict_movieId_tag = {}
for tag, movieId in zip(movieId_tag['tag'], movieId_tag['movieId']):
    if tag in train_tag_set:  # filter to only tags in train split
        dict_tag_movieId.setdefault(tag, []).append(movieId)
        dict_movieId_tag.setdefault(movieId, []).append(tag)
                

In [88]:
# For every train tag, collect the tags that co-occur with it on at least one movie
# (the candidates) together with the counts later passed to count_log_likelihood:
#   'a' = number of movies tagged with the tag that are also tagged with the candidate
#   'b' = number of movies tagged with the candidate
#   'c' = number of movies tagged with the tag
parameters_all_tags = {}

for tag in train_list_of_tags:
    movielens_id_hits = dict_tag_movieId[tag]
    c = len(movielens_id_hits)
    candidate_tags = {}  # candidate tag -> {'a', 'b', 'c'} counts

    for movieId in movielens_id_hits:
        # Relies on (tag, movieId) pairs being unique, so a candidate is counted
        # at most once per movie.
        for cand_tag in dict_movieId_tag[movieId]:
            if cand_tag == tag:  # the tag is not a candidate for itself
                continue
            counts = candidate_tags.get(cand_tag)
            if counts is None:
                candidate_tags[cand_tag] = {'a': 1, 'b': len(dict_tag_movieId[cand_tag]), 'c': c}
            else:
                counts['a'] += 1

    parameters_all_tags[tag] = candidate_tags
            

In [91]:
## Save the top 10 synonyms of every tag, as TSV and as a pickle

sorted_synonyms={}
d = len(dict_movieId_tag)  # number of distinct movies that carry at least one train tag

# The `with` block guarantees the TSV is flushed and closed even if scoring fails;
# newline='' is the mode the csv module asks for so it controls line endings itself.
with open("data/tags_synonym/top_10_synonym_train.tsv", 'wt', newline='') as out_file1:
    tsv_writer1 = csv.writer(out_file1, delimiter='\t')
    tsv_writer1.writerow(['Tag'] + ['synonym'+str(i) for i in range(10)])
    for tag,value in parameters_all_tags.items():
        # Highest log-likelihood first; keep at most ten and round the scores.
        top_10 = list(get_sorted_candidate(value,d).items())[0:10]
        top_10 = [(x[0],(round(x[1], 2))) for x in top_10]

        # Each synonym cell receives the str() of a (synonym, score) tuple.
        tsv_writer1.writerow([tag] + top_10)
        sorted_synonyms[tag] = dict(top_10)

dump_pickle('data/tags_synonym/top_10_synonym_train.pkl',sorted_synonyms)

In [106]:
# Example: the (up to ten) highest-scoring co-occurring tags stored for the tag
# 'monologue', mapped to their log-likelihood scores rounded to 2 decimals

sorted_synonyms['monologue']

{'neo-noir': 25.02,
 'film noir': 21.09,
 'cruel': 20.97,
 'multiple storylines': 20.78,
 'frank miller': 20.55,
 'amazing': 19.5,
 'quentin tarantino': 18.67,
 'robert rodriguez': 18.42,
 'gratuitous violence': 18.42,
 'jessica alba': 18.2}