# Look for synonyms

In [None]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers, exceptions
from elasticsearch.client import IndicesClient
import math
# Elasticsearch client, constructed with no arguments; shared by every query in this notebook
es = Elasticsearch()
# Name of the index that all searches and term-vector requests below are run against
INDEX_NAME = "movie_reviews_4_10_5_no_stemming_shingle_max4"
# Field names of the index. NOTE(review): not referenced by the cells visible here - confirm it is still needed
FIELDS = ['movielens_ids','review','tags']
import csv
import pickle

## Functions

In [None]:
def get_reviews_tv_from_tag(tag):
    '''Collect the 'review' term vectors of every document whose tags match a tag.
    :param tag : tag text; it is lowercased and matched as a phrase against the 'tags' field
    :return : tuple (term_vectors, multi_words)
              term_vectors maps the _id of each hit (at most 10000 hits are requested)
              to the 'terms' entry of its 'review' term vector, or {} when that entry is missing
              multi_words is True when the tag has more than one word once '-' is treated as a space
    '''
    # A hyphenated tag such as 'sci-fi' counts as two words
    multi_words = len(tag.replace('-', ' ').split()) > 1

    query = {'query': {'match_phrase': {'tags': tag.lower()}}}
    hits = es.search(index=INDEX_NAME, body=query, _source=False, size=10000)['hits']['hits']

    term_vectors = {}
    for hit in hits:
        doc_id = hit['_id']
        response = es.termvectors(index=INDEX_NAME, id=doc_id, fields='review', term_statistics=True)
        # Any missing level of the response falls back to an empty dict
        term_vectors[doc_id] = response.get('term_vectors', {}).get('review', {}).get('terms', {})

    # Keyed by document _id, so each matching document appears exactly once
    return term_vectors, multi_words

def get_field_statistics(fields_list):
    '''
    Function to get field statistics, from non empty field

    For each field, one document in which the field exists is looked up, and the
    'field_statistics' entry of that document's term vector for the field is stored.

    :param fields_list : iterable of field names
    :return : dict mapping each field name to its 'field_statistics' dict;
              the value is {} when no document with the field is found or when
              the term-vector response has no statistics for the field
    '''
    field_statistic_dict = {}

    for field in fields_list:

        # Any single document in which the field exists is enough to read the field statistics
        # NOTE(review): the must_not clause targets "test.keyword", not the current field -
        # possibly a typo for the field's keyword sub-field. Left unchanged; confirm against the index mapping.
        body1 = {
          "query": {
            "bool": {
              "filter": {
                "exists": {
                  "field": field
                }
              },
              "must_not": {
                "term": {
                  "test.keyword": ""
                }
              }
            }
          }
        }

        res = es.search(index=INDEX_NAME, body=body1, _source=False, size=1)
        hits = res.get('hits', {}).get('hits', [])

        # Without a hit there is no document id to request term vectors for
        if not hits:
            field_statistic_dict[field] = {}
            continue

        ent_id = hits[0]['_id']
        tv_response = es.termvectors(index=INDEX_NAME, id=ent_id, term_statistics=True, fields=field)
        field_statistic_dict[field] = tv_response.get('term_vectors', {}).get(field, {}).get('field_statistics', {})

    return field_statistic_dict


## log likelihood calculation

Log-likelihood calculation (http://ucrel.lancs.ac.uk/llwizard.html): computed for a given tag and for each term that appears in the reviews retrieved for that tag. For a given (tag, term) pair:
- Corpus 1: reviews retrieved for tag
  - Frequency of word: number of reviews retrieved for tag that contain term
  - Corpus size: number of reviews retrieved for tag
- Corpus 2: all reviews
  - Frequency of word: number of documents that contain term
  - Corpus size: number of documents in the index
- Sort the terms of each tag by LL, highest first


In [None]:
def count_log_likelihood(a,b,c,d):
    '''
    Log-likelihood calculation (http://ucrel.lancs.ac.uk/llwizard.html): for a given tag and for each term that appears in reviews retrieved for that tag
    :param a = number of reviews retrieved for tag that contain term
    :param c = number of reviews retrieved for tag
    :param b = number of documents that contain term
    :param d = number of documents in the index
    :return : 2 * (a*ln(a/E1) + b*ln(b/E2)), where E1 and E2 are the expected
              counts c*(a+b)/(c+d) and d*(a+b)/(c+d); a zero observed count
              contributes 0 to the sum
    '''
    total_observed = a + b
    total_size = c + d

    # Expected counts if the term were spread evenly over both corpora
    E1 = c * total_observed / total_size
    E2 = d * total_observed / total_size

    # x*ln(x) tends to 0 as x tends to 0, so a zero count adds nothing (and log(0) is never evaluated)
    term1 = a * math.log(a / E1) if a else 0.0
    term2 = b * math.log(b / E2) if b else 0.0

    ll = 2 * (term1 + term2)

    return ll

def get_ll(a_b_parameters, c, d):
    '''
    Score every (tag,term) pair with the log likelihood and order the pairs by descending score
    :param a_b_parameters : dict mapping each pair to a dict with the counts 'a' and 'b'
    :param c : number of reviews retrieved for the tag
    :param d : number of documents in the index
    :return : dict pair -> score, with insertion order following the descending score
    '''
    scored_pairs = [
        (pair, count_log_likelihood(counts['a'], counts['b'], c, d))
        for pair, counts in a_b_parameters.items()
    ]
    # Stable sort: pairs with equal scores keep their original relative order
    scored_pairs.sort(key=lambda entry: entry[1], reverse=True)
    return dict(scored_pairs)


def get_parameters (tag,reviews_tv):
    '''
    Build the a and b counts of every (tag,term) pair
    :param tag : original tag text
    :param reviews_tv : dict document id -> {term: term statistics}; each term's
                        statistics must provide 'doc_freq'
    :return : dict (tag,term) -> {'a': number of given documents containing the term,
                                  'b': 'doc_freq' taken from the first occurrence of the term}
    '''
    pair_counts = {}
    for terms in reviews_tv.values():
        for term, stats in terms.items():
            pair = (tag, term)
            counts = pair_counts.get(pair)
            if counts is None:
                # First document seen that contains this term
                pair_counts[pair] = {'a': 1, 'b': stats['doc_freq']}
            else:
                counts['a'] += 1
    return pair_counts

def get_sorted_terms (tag):
    '''
    Wrapper function to get sorted term for tag
    :param tag : tag text
    :return : tuple (sorted_ll, multi_words)
              sorted_ll maps each (tag,term) pair to its log likelihood score, highest first;
              it is {} when fewer than 10 reviews are retrieved for the tag
              multi_words is the multi-word flag reported by get_reviews_tv_from_tag
    '''
    reviews_tv, multi_words = get_reviews_tv_from_tag(tag)

    c = len(reviews_tv) #Number of reviews hits

    # Below 10 reviews nothing is scored, so return before querying the corpus size
    if c < 10:
        return {}, multi_words

    d = get_field_statistics(['review'])['review']['doc_count']

    a_b_parameters = get_parameters(tag, reviews_tv)
    sorted_ll = get_ll(a_b_parameters, c, d)
    return sorted_ll, multi_words

## Get All Tags

In [None]:
# Load the (movieId, tag) pairs; rows with NaN are dropped (16 tags have NaN values)
tags = pd.read_csv('Data/ml-20m/tags.csv')[['movieId','tag']].dropna()
# Lowercase every tag so that tags differing only in case collapse to one
tags['tag'] = tags['tag'].map(lambda raw_tag: str(raw_tag).lower())
all_tags = list(set(tags['tag']))
len(all_tags)

# Note: some tags may be missing from the index, because not all MovieLens movies are mapped yet
# (61% so far), so some tags belong to movies that are not in the index.
# Total unique tags (lowercased): 35172.
# With reviews limited to a minimum of 3 words (index 'movie_reviews_4_10_2'): 14091 tags are in the index.
# With reviews limited to a minimum of 6 words: only 7368 tags are in the index.

In [None]:
# Not every tag is in the index: keep the tags with at least 10 hits (10 reviews)
# and record separately the tags with no hit at all
tags_not_in_index = []
tags_in_index = []
for tag in all_tags:
    tag = tag.lower()
    query = {'query': {'match_phrase': {'tags': tag}}}
    # At most 10 hits are requested, so a count of 10 means "10 or more"
    hit_count = len(es.search(index=INDEX_NAME, body=query, _source=False, size=10)['hits']['hits'])

    if hit_count == 0:
        tags_not_in_index.append(tag)
    elif hit_count == 10:
        tags_in_index.append(tag)

In [None]:
# Number of tags whose search returned the full 10 requested hits
len(tags_in_index)

## Get the sorted terms based on LL

- Some tags yield related words:
  - 'sadistic' : 'tarantino' 'django' 'quentin','jami','foxx'
  - 'cute alien'  : 'et','sai','can','what','i'
  - 'powerful' : hopkin, foster, nicholson, jodi


- Some don't:
  - 'misscariage of justice' : version, us, better, than
  - 'child sacrifice' : love, movi
  - 'dysfunctional family' : i, hoe, funni, great


# Return the top n terms based on LL for each tag in the index (for review)

In [None]:
# For every tag in the index, keep its 30 highest-scoring terms. Multi-word and
# single-word tags go to separate TSV files and separate dictionaries.
num_tag = len(tags_in_index)
sorted_terms_tag_multi_shingle_max2 = {}
sorted_terms_tag_single_shingle_max2 = {}

multi_path = "Data/sorted_terms/top_30_terms_per_tag(above_5)_multi_shingles_max2.tsv"
single_path = "Data/sorted_terms/top_30_terms_per_tag(above_5)_single_shingles_max2.tsv"

# 'with' closes both files even if a query fails mid-loop;
# newline='' is what the csv module requires of files handed to csv.writer
with open(multi_path, 'wt', newline='') as out_file1, open(single_path, 'wt', newline='') as out_file2:
    tsv_writer1 = csv.writer(out_file1, delimiter='\t')
    tsv_writer2 = csv.writer(out_file2, delimiter='\t')

    for i, tag in enumerate(tags_in_index):
        sorted_terms, multi_words = get_sorted_terms(tag)

        # An empty result means the tag had fewer than 10 reviews: nothing to write
        if sorted_terms:
            # Keys are (tag, term) pairs, already ordered by descending score; keep the top 30
            top_terms = [(key[1], score) for key, score in list(sorted_terms.items())[:30]]
            row = [tag] + ['(' + str(term) + ':' + str(score) + ')' for term, score in top_terms]

            if multi_words:
                tsv_writer1.writerow(row)
                sorted_terms_tag_multi_shingle_max2[tag] = top_terms
            else:
                tsv_writer2.writerow(row)
                sorted_terms_tag_single_shingle_max2[tag] = top_terms

        # Progress report every 100 tags
        if (i+1)%100 == 0:
            print(str(i+1),' done from ', num_tag)

In [None]:
# Save both result dictionaries as pickle files.
# 'with' guarantees each file is flushed and closed even if pickling fails.

filename = 'Data/sorted_terms/sorted_terms_tag_multi_shingle_max2_top30.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(sorted_terms_tag_multi_shingle_max2, outfile)

filename = 'Data/sorted_terms/sorted_terms_tag_single_shingle_max2_top30.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(sorted_terms_tag_single_shingle_max2, outfile)