# Simple Template-Based Approach

In [1]:
import random
import pandas as pd
import numpy as np
import csv
import spacy
sp = spacy.load('en_core_web_sm')
import nltk
from nltk.tokenize import word_tokenize
import re
nltk.download('tagsets')
from nltk.data import load
from nltk.corpus import conll2000
import pickle

[nltk_data] Downloading package tagsets to /Users/renny/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


## 1. Load Parameters 

In [9]:
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load pickles that were produced by this project's own preprocessing.

# Tag dictionary; it is turned into one DataFrame row per tag further down.
#with open('output/tagged_tags_dictionary_withsents_film.pkl','rb') as f:
with open('output/tags_available_in_index/tagged_tags_dictionary_withsents_film(available_in_index).pkl','rb') as f: ## Change to temporary file
    tagged_tags_dictionary = pickle.load(f)

# Mapping from a group name (e.g. 'adjective') to the tags in that group.
#with open('output/mapped_tags_to_group_film','rb') as f:
with open('output/tags_available_in_index/mapped_tags_to_group_film(available_in_index).pkl','rb') as f:
    mapped_tags_to_group = pickle.load(f)

# Same kind of mapping, loaded from the "nonperson" variant of the file.
with open('output/tags_available_in_index/mapped_tags_to_group_nonperson_film(available_in_index).pkl','rb') as f:
    mapped_tags_to_group_nonperson = pickle.load(f)

# Materialise the keys as a list: random.sample() needs a real sequence and
# rejects dict views (TypeError on Python 3.11+).
cleaned_list_of_tags = list(tagged_tags_dictionary.keys())

# One row per tag; the former index (the tag itself) becomes the 'Tags' column.
tags_dataframe = pd.DataFrame(tagged_tags_dictionary).T.reset_index().rename(columns={'index': 'Tags'})

# Verb phrases substituted into the sentence templates.
like_synonyms=["like","love","prefer","enjoy","are into"]
# "dislike" used to be listed twice, which doubled its chance of being picked.
dont_like_synonyms=["dislike","don't like","don't prefer","hate","are not into"]

## 2. get_summary function
Builds each statement from a basic template, replacing "like" and "don't like" with a randomly chosen synonym:
1. You (don't) like {tag1} movies especially if they are {tag2} -- for ((tag1,+), (tag2,++)) or ((tag1,-),(tag2,--))
2. You (don't) like {tag1} movies especially if they are not {tag2} -- for ((tag1,++),(tag2,+)) or ((tag1,--),(tag2,-))
3. You (don't) like {tag1} movies unless they are {tag2} -- for ((tag1,+),(tag2,-)) or ((tag1,-),(tag2,+))
4. You (don't) like {tag1} movies if they are {tag2} -- for ((tag1,N),(tag2,+)) or ((tag1,N),(tag2,-))

In [11]:


### Basic_1 templates
def get_summary_basic_1(tag1,tag2):
    """
    Build one preference sentence from two (tag, preference level) pairs.

    param tag1 : pair whose first item is the tag text and whose second item
                 is its preference level ('+', '++', '-', '--' or 'N')
    param tag2 : pair of the same form for the second tag
    return : the formatted sentence; the verb phrase is drawn at random from
             like_synonyms or dont_like_synonyms
    """
    level1 = tag1[1]

    # Pick the sentence template and decide whether the verb is positive.
    if len(level1) != 1:
        # 2nd template: (++,+) or (--,-)
        liked = level1 == '++'
        template = "You {} [{}] movies especially if they are not [{}]."
    elif len(tag2[1]) == 2:
        # 1st template: (+,++) or (-,--)
        liked = level1 == '+'
        template = "You {} [{}] movies especially if they are [{}]."
    elif level1 == 'N':
        # 4th template: (N,+) or (N,-) -- the second tag sets the polarity
        liked = tag2[1] == '+'
        template = "You {} [{}] movies if they are [{}]."
    else:
        # 3rd template: (+,-) or (-,+)
        liked = level1 == '+'
        template = "You {} [{}] movies unless they are [{}]."

    verb = random.choice(like_synonyms) if liked else random.choice(dont_like_synonyms)
    return template.format(verb, tag1[0], tag2[0])
 

## 1.3 Get N preference statements
Generates n preference statements, each built from randomly chosen tags and a randomly chosen pair of preference levels.

In [12]:
def generate_preference_statements(num_statements, tags, template_type=1):
    '''
    Generate preference statements from randomly paired tags.

    Every generated statement is also printed.

    param num_statements : number of statements to be generated
    param tags : collection of tags (list, dict keys, ...). From this collection,
                 the two tags for each statement will be randomly chosen
    param template_type : 1 uses get_summary_basic_1, 2 uses get_summary_basic_2
    return : tuple of four parallel lists -- statements, preference pairs,
             tag pairs and POS-tag pairs
    raises ValueError : if template_type is neither 1 nor 2
    '''
    # Fail early: previously an unknown template_type left pref_statement
    # unassigned inside the loop.
    if template_type not in (1, 2):
        raise ValueError("template_type must be 1 or 2, got {!r}".format(template_type))

    # random.sample() only accepts sequences; dict views and sets are rejected
    # on Python 3.11+, so take a list copy once.
    tag_pool = list(tags)

    list_preference_statements = []
    list_preference = []
    list_tags = []
    list_postags = []

    for _ in range(num_statements):
        pair_tags = random.sample(tag_pool, k=2)
        # POS tag of each sampled tag: first matching row of tags_dataframe.
        list_postags.append([
            tags_dataframe[tags_dataframe['Tags'] == tag]['POS_Tags'].values[0]
            for tag in pair_tags
        ])
        pair_preferences = random.choice(list_of_preference)

        first = (pair_tags[0], pair_preferences[0])
        second = (pair_tags[1], pair_preferences[1])
        if template_type == 1:
            pref_statement = get_summary_basic_1(first, second)
        else:
            # NOTE(review): get_summary_basic_2 is not defined in the visible
            # part of this notebook -- confirm it exists before using type 2.
            pref_statement = get_summary_basic_2(first, second)

        print (pref_statement)
        list_preference_statements.append(pref_statement)
        list_preference.append(pair_preferences)
        list_tags.append(pair_tags)

    return list_preference_statements, list_preference, list_tags, list_postags

## 1.4 Other function(s)

In [13]:
def save_to_file(title,list_preference_statements, list_preference,list_tags, list_postags):
    '''
    Save the generated statements to an Excel file for review.

    param title : file name (may include sub-folders) appended to "output/"
    param list_preference_statements : generated sentences, one per row
    param list_preference : per row, the pair of preference levels
    param list_tags : per row, the pair of tags
    param list_postags : per row, the pair of POS tags
    '''
    def column(pairs, position):
        # Pick one side of every pair with plain indexing. The former
        # np.array(...)[0:, position] slicing raised IndexError on empty
        # input and needed numpy to build a regular 2-D array first.
        return [pair[position] for pair in pairs]

    pd.DataFrame({'Preference_statement': list_preference_statements,
                  'tag1': column(list_tags, 0),
                  'pos-tag1' : column(list_postags, 0),
                  'pref1': column(list_preference, 0),
                  'tag2': column(list_tags, 1),
                  'pos-tag2' : column(list_postags, 1),
                  'pref2': column(list_preference, 1)
    }).to_excel("output/"+title)

# 3. Generate Preference statements

In [14]:
# Allowed (tag1 level, tag2 level) combinations, grouped by the sentence
# template each one selects.
list_of_preference = [
    ('+', '++'), ('-', '--'),   # template 1: "... especially if they are ..."
    ('++', '+'), ('--', '-'),   # template 2: "... especially if they are not ..."
    ('+', '-'), ('-', '+'),     # template 3: "... unless they are ..."
    ('N', '+'), ('N', '-'),     # template 4: "... if they are ..."
]

In [15]:
# Generate 200 preference statements drawn from every filtered tag and save them
(list_preference_statements,
 list_preference,
 list_tags,
 list_postags) = generate_preference_statements(
    num_statements=200,
    tags=cleaned_list_of_tags,
    template_type=1,
)
save_to_file(
    title='tags_available_in_index/preference_basic_all.xlsx',
    list_preference_statements=list_preference_statements,
    list_preference=list_preference,
    list_tags=list_tags,
    list_postags=list_postags,
)

You don't prefer [tear jerker] movies unless they are [bad script].
You don't prefer [drug use] movies if they are [deadpan].
You are not into [puzzling] movies unless they are [torture].
You like [madcap] movies especially if they are [afi 100].
You don't like [loyalty] movies especially if they are not [unlikable characters].
You prefer [serial killer] movies especially if they are [70mm].
You are into [ambition] movies unless they are [simon pegg].
You dislike [from the view of children] movies if they are [english].
You dislike [daniel auteuil] movies especially if they are [india].
You like [us history] movies unless they are [over-rated].
You are not into [camp] movies if they are [angels].
You like [watched 2006] movies especially if they are not [priest].
You like [women] movies especially if they are [gary cooper].
You are not into [school] movies especially if they are not [treasure].
You don't prefer [los angeles] movies if they are [illogical].
You dislike [based on a comic

In [16]:
# Generate 200 preference statements using only the tags in the 'adjective' group
adjectives_tags = mapped_tags_to_group['adjective']
(list_preference_statements,
 list_preference,
 list_tags,
 list_postags) = generate_preference_statements(
    num_statements=200,
    tags=adjectives_tags,
    template_type=1,
)
save_to_file(
    title='tags_available_in_index/preference_adjective_basic.xlsx',
    list_preference_statements=list_preference_statements,
    list_preference=list_preference,
    list_tags=list_tags,
    list_postags=list_postags,
)

You don't like [self-esteem] movies if they are [horrible].
You prefer [break-up] movies if they are [simplistic].
You don't like [nocturnal] movies if they are [surprisingly good].
You are not into [unoriginal] movies unless they are [fake].
You don't like [experimental] movies especially if they are not [futuristic].
You don't prefer [short] movies unless they are [futuristic].
You like [claustrophobic] movies if they are [retro-futuristic].
You like [usa] movies if they are [too violent].
You don't prefer [unintentionally funny] movies if they are [tongue-in-cheek].
You dislike [elegiac] movies unless they are [illogical].
You enjoy [pretentious] movies especially if they are [stupid].
You dislike [gruesome] movies if they are [upbeat].
You don't prefer [derivative] movies especially if they are not [foreign].
You prefer [formulaic] movies if they are [outdated].
You are not into [historical] movies especially if they are [upbeat].
You are into [silent] movies especially if they are

In [22]:
# Generate preference statements from the tags of every group except 'adjective'.
# NOTE: despite its name, non_noun_tags holds the NON-ADJECTIVE tags (the filter
# below excludes only the 'adjective' group). The name is kept because later
# cells refer to it.
non_noun_tags = [
    tag
    for group in mapped_tags_to_group
    if group != 'adjective'
    for tag in mapped_tags_to_group[group]  # flatten in one pass, no repeated list copies
]

list_preference_statements, list_preference, list_tags,list_postags = generate_preference_statements(500,non_noun_tags,template_type=1)
save_to_file('tags_available_in_index/preference_non_adjective_basic.xlsx',list_preference_statements, list_preference,list_tags,list_postags)

You dislike [neo noir] movies especially if they are not [teenagers].
You don't like [lovecraft] movies unless they are [innuendo].
You are not into [isolation] movies especially if they are not [hawaii].
You like [compassionate] movies unless they are [gerard depardieu].
You enjoy [first contact] movies if they are [cute girls].
You don't like [jessica biel] movies if they are [brainwashing].
You are into [second half was worse] movies especially if they are [tim robbins].
You love [retro] movies if they are [gary cole].
You like [aliens invasion] movies especially if they are not [great adaptations].
You are into [aids] movies especially if they are not [slow start].
You dislike [joseph gordon-levitt] movies if they are [chemistry between actors].
You are not into [subtitles] movies especially if they are not [off-beat comedy].
You prefer [jeremy irons] movies if they are [meryl streep].
You like [supernatural romance] movies especially if they are not [good acting].
You are into [aw

In [526]:
# Generate 100 preference statements from noun tags that are not person entities.
# NOTE(review): the output saved under this cell uses a "movies about '...'"
# wording that template_type=1 does not produce -- re-run the cell to refresh it.
(list_preference_statements,
 list_preference,
 list_tags,
 list_postags) = generate_preference_statements(
    num_statements=100,
    tags=mapped_tags_to_group_nonperson['noun'],
    template_type=1,
)
save_to_file(
    title='preference_statement_non_person_noun_190320_b2.xlsx',
    list_preference_statements=list_preference_statements,
    list_preference=list_preference,
    list_tags=list_tags,
    list_postags=list_postags,
)

You dislike movies about 'martial arts' if they are about 'fast food'.
You dislike movies about 'wealth' if they are about 'artist mind'.
You prefer movies about 'worth watching' especially if they are not about 'no chemistry'
You dislike movies about 'greed' unless they are about 'russian revolution'.
You are not into movies about 'writing' unless they are about 'russia'.
You don't like movies about 'mortality' if they are about 'good soundtrack'.
You enjoy movies about 'movie business' especially if they are about 'complex morality'.
You don't like movies about 'world war ii' if they are about 'knights'.
You dislike movies about 'hayley atwell' especially if they are about 'german perspective'.
You hate movies about 'metaphysics' especially if they are about 'dvd'.
You dislike movies about 'dialogue driven' unless they are about 'figure skating'.
You like movies about 'escapism' especially if they are about 'opposites attract'.
You dislike movies about 'soviet union' if they are abou

In [24]:
# Sanity check: is the tag 'star trek' present in non_noun_tags?
'star trek' in non_noun_tags

True

In [26]:
# Position of the first occurrence of 'star trek' in non_noun_tags
non_noun_tags.index('star trek')

1015

In [27]:
# 1015 is the index the previous cell reported for this data set; it is tied
# to the loaded pickle contents and will change if the tag lists change.
non_noun_tags[1015]

'star trek'