# Simple Template-Based Approach

In [2]:
import random
import pandas as pd
import numpy as np
import csv
import spacy
sp = spacy.load('en_core_web_sm')
import nltk
from nltk.tokenize import word_tokenize
import re
nltk.download('tagsets')
from nltk.data import load
from nltk.corpus import conll2000
import pickle

[nltk_data] Downloading package tagsets to /Users/renny/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [334]:
def load_pickle(filename):
    '''
    Load and return the object stored in a pickle file.

    param filename : path of the pickle file to read
    return : the unpickled object

    NOTE: unpickling can execute arbitrary code, so only load files you trust.
    '''
    # The with-block closes the file on exit (also when pickle.load raises),
    # so no explicit close() is needed afterwards.
    with open(filename,'rb') as f:
        return pickle.load(f)

def dump_pickle(filename,obj):
    '''
    Pickle an object into a file, replacing the file if it already exists.

    param filename : path of the pickle file to write
    param obj : object to serialise
    '''
    sink = open(filename, mode='wb')
    with sink:
        pickle.dump(obj, sink)


In [652]:
# Allowed (sign of tag1, sign of tag2) combinations; one is drawn at random per statement.
list_of_preference = [
    ('+', '++'),
    ('-', '--'),
    ('++', '+'),
    ('--', '-'),
    ('+', '-'),
    ('-', '+'),
    ('N', '+'),
    ('N', '-'),
]

## 1. Load Parameters 

In [650]:
#with open('output/tagged_tags_dictionary_withsents_film.pkl','rb') as f:
# Tag dictionaries for the train / test split and their "mapped" counterparts.
train_tags = load_pickle('data/dataset/splitted_tags/train_tags.pkl')
test_tags = load_pickle('data/dataset/splitted_tags/test_tags.pkl')
mapped_train_tags = load_pickle('data/dataset/splitted_tags/mapped_train_tags.pkl')
mapped_test_tags = load_pickle('data/dataset/splitted_tags/mapped_test_tags.pkl')
# Keep the tag names as lists (like cleaned_all_tags below): they are passed to
# random.sample(), which requires a sequence and rejects dict key views on Python 3.11+.
cleaned_train_tags = list(train_tags.keys())
cleaned_test_tags = list(test_tags.keys())
train_synonym_dictionary = load_pickle('data/tags_synonym/top_10_synonym_train.pkl') #list of synonym from train tags

#Load all tags and mapped all tags
all_tags = load_pickle('data/dataset/all_tags/tagged_tags_dictionary_withsents_film.pkl')
mapped_all_tags = load_pickle('data/dataset/all_tags/mapped_tags_to_group_film')
cleaned_all_tags = list(all_tags.keys())

# Wording used by the basic (Basic_1) template: a single fixed verb per sentiment.
like_synonyms_basic = ["like"]
dont_like_synonyms_basic = ["don't like"]

# Wording pool for the adjusted (Basic_2) template, positive sentiment.
like_synonyms = [
    "like",
    "love",
    "prefer",
    "enjoy",
    "are into",
    "would watch",
    "like to watch",
    "like watching",
    "love to watch",
    "love watching",
    "prefer to watch",
    "prefer watching",
    "enjoy watching",
    "are interested in",
]

# Wording pool for the adjusted (Basic_2) template, negative sentiment.
dont_like_synonyms = [
    "dislike",
    "don't like",
    "hate",
    "don't prefer",
    "are not into",
    "wouldn't watch",
    "dislike watching",
    "don't like to watch",
    "don't like watching",
    "hate to watch",
    "hate watching",
    "don't prefer watching",
    "don't prefer to watch",
    "are not interested in",
]

# Connectors placed between the first and the second sentence part.
especially_synonyms = ["especially", "particularly"]
unless_synonyms = ["unless", "except if"]

In [643]:
### Basic_1 templates
def get_summary_basic_1(tag1,tag2):
    '''
    Build a preference statement with the plain (basic) templates.

    param tag1 : (tag text, sign) pair for the first tag
    param tag2 : (tag text, sign) pair for the second tag
    return : the statement, with both tag texts wrapped in square brackets

    The template is selected from the lengths/values of the two signs:
      1. (+,++) / (-,--) : "..., especially if they are [tag2]."
      2. (++,+) / (--,-) : "..., especially if they are not [tag2]."
      3. (+,-)  / (-,+)  : "..., unless they are [tag2]."
      4. (N,+)  / (N,-)  : "... if they are [tag2]."
    '''
    sign1 = tag1[1]

    if len(sign1) != 1:
        # 2nd template (++,+) or (--,-)
        positive = sign1 == '++'
        template = "You {} [{}] movies, especially if they are not [{}]."
    elif len(tag2[1]) == 2:
        # 1st template (+,++) or (-,--)
        positive = sign1 == '+'
        template = "You {} [{}] movies, especially if they are [{}]."
    elif sign1 == 'N':
        # 4th template (N,+) or (N,-): the verb follows the sign of tag2
        positive = tag2[1] == '+'
        template = "You {} [{}] movies if they are [{}]."
    else:
        # 3rd template (+,-) or (-,+)
        positive = sign1 == '+'
        template = "You {} [{}] movies, unless they are [{}]."

    verb = random.choice(like_synonyms_basic if positive else dont_like_synonyms_basic)
    return template.format(verb, tag1[0], tag2[0])
 

### Basic template adapted from the paper : 
1. You (don't) like {tag1} movies especially if they are {tag2} -- for ((tag1,+), (tag2,++)) or ((tag1,-),(tag2,--))
2. You (don't) like {tag1} movies especially if they are not {tag2} -- for ((tag1,++),(tag2,+)) or ((tag1,--),(tag2,-))
3. You (don't) like {tag1} movies unless they are {tag2} -- for ((tag1,+),(tag2,-)) or ((tag1,-),(tag2,+))
4. You (don't) like {tag1} movies if they are {tag2} -- for ((tag1,N),(tag2,+)) or ((tag1,N),(tag2,-))


### Function get_summary_basic_2
This function generates a preference statement based on the basic templates above. It takes two tuples, tag1 and tag2, each containing the tag text and its sign (+, ++, -, --, or N), together with the tag dictionary, and returns the statement.

Inside this function, sentence 1 and sentence 2 are generated by calling get_first_sentence and get_second_sentence. The generated sentences keep the meaning of the parts they replace, but they may follow a different pattern. The tag text itself is not changed.

Example from the first basic template:

<img src="screenshot.png" width="400" height="200" >


### Function get_first_sentence
It generates the sentence 1 part by picking a random pattern; each pattern is a different way of saying the basic form.
Within each sentence pattern, it also randomly picks the words used to express "like" or "don't like".
The sentence patterns are manually picked (there are 3 patterns), and the different expressions of like/don't like are also manually picked.

### Function get_second_sentence
It generates the sentence 2 part in the same way as the get_first_sentence function.


In [337]:
### Basic_2 templates

def get_first_sentence(tag,dictionary_tags,sentiment = 1):
    '''
    Build the first part ("You like ...") of a preference statement for a tag.

    All candidate wordings for the tag's category are built first (each one
    draws its own random like/don't-like verb), then one candidate is
    returned at random.

    param tag : tag text; must be a key of dictionary_tags
    param dictionary_tags : dict mapping a tag to its metadata, read through the keys
                            'POS_Tags_Group', 'Entity', 'Person_Type', 'End_with_movie',
                            'Start_POS', 'End_POS' and 'genre'
    param sentiment : 1 for a "like" wording, any other value for a "don't like" wording
    return : the sentence part, with the tag wrapped in square brackets
    '''
    tag_info = dictionary_tags[tag]
    group = tag_info['POS_Tags_Group']
    entity = tag_info['Entity']
    person_type = tag_info['Person_Type']
    end_with_movie = tag_info['End_with_movie']
    start_pos = tag_info['Start_POS']
    end_pos = tag_info['End_POS']
    genre = tag_info['genre']

    def pick_verb():
        # One random draw from the pool that matches the sentiment.
        return random.choice(like_synonyms if sentiment == 1 else dont_like_synonyms)

    def with_verb(template):
        # Fill a "You <verb> ... [<tag>]" template with a freshly drawn verb.
        return template.format(pick_verb(), tag)

    def for_you():
        # Verb-free alternative: "[Tag] movies is (not )for you".
        return "[{}] movies is {}for you".format(tag.capitalize(), "" if sentiment==1 else "not ")

    if end_with_movie == True:
        # Tag already ends with movie/movies/film/films
        candidates = [with_verb("You {} [{}]")]

    elif entity == 'GPE':
        candidates = [with_verb("You {} movies about [{}]"),
                      with_verb("You {} movies from [{}]")]

    elif genre == True:
        candidates = [with_verb("You {} [{}] movies"),
                      "You {} movies with [{}] {}".format(pick_verb(), tag, random.choice(['genre','content'])),
                      with_verb("You {} movies full of [{}]")]

    elif (group == 'noun' or group == 'numeral') and not tag.startswith("oscar"):
        if entity == 'PERSON' and person_type == 'actor':
            candidates = [with_verb("You {} movies starred by [{}]"),
                          with_verb("You {} movies starring [{}]"),
                          with_verb("You {} movies played by [{}]"),
                          for_you()]
        elif entity == 'PERSON' and person_type == 'director':
            candidates = [with_verb("You {} movies directed by [{}]"),
                          for_you()]
        elif entity != 'PERSON' and (start_pos.startswith('JJ') or start_pos == 'DT') and end_pos == 'VBG':
            candidates = [with_verb("You {} movies with [{}]")]
        else:
            # Other persons and all remaining nouns/numerals
            candidates = [with_verb("You {} movies about [{}]")]

    else:
        # Adjectives and everything else (including nouns that start with "oscar")
        candidates = [with_verb("You {} [{}] movies"),
                      for_you()]

    return random.choice(candidates)

def get_second_sentence(tag,dictionary_tags,sentiment = 1):
    '''
    Build the second part ("they are ...") of a preference statement for a tag.

    The candidate wordings depend on the tag's metadata; one candidate is
    returned at random.

    param tag : tag text; must be a key of dictionary_tags
    param dictionary_tags : dict mapping a tag to its metadata, read through the keys
                            'POS_Tags_Group', 'Entity', 'Person_Type', 'End_with_movie',
                            'Start_POS', 'End_POS' and 'genre'
    param sentiment : 1 for an affirmative wording, any other value for a negated one
    return : the sentence part, with the tag wrapped in square brackets
    '''
    tag_info = dictionary_tags[tag]
    group = tag_info['POS_Tags_Group']
    entity = tag_info['Entity']
    person_type = tag_info['Person_Type']
    end_with_movie = tag_info['End_with_movie']
    start_pos = tag_info['Start_POS']
    end_pos = tag_info['End_POS']
    genre = tag_info['genre']

    negation = "" if sentiment==1 else "not "

    if end_with_movie == True:
        templates = ["they are {}[{}]"]

    elif entity == 'GPE':
        templates = ["they are {}about [{}]",
                     "they are {}from [{}]"]

    elif tag.startswith('oscar'):
        # Oscar is a famous award, so it gets its own rule
        templates = ["they are {}[{}] movies"]

    elif genre == True:
        # The middle wording switches verbs instead of inserting "not ", so it is filled directly.
        return random.choice(["they are {}full of [{}]".format(negation, tag),
                              "they {} [{}]".format("contain" if sentiment==1 else "don't contain", tag),
                              "they are {}[{}]".format(negation, tag)])

    elif group == 'noun' or group == 'numeral':
        if entity == 'PERSON' and person_type == 'actor':
            templates = ["they are {}starred by [{}]",
                         "they are {}starring [{}]",
                         "they are {}played by [{}]"]
        elif entity == 'PERSON' and person_type == 'director':
            templates = ["they are {}directed by [{}]",
                         "they are {}movies by [{}]"]
        elif entity == 'PERSON':
            templates = ["they are {}about [{}]",
                         "they are {}[{}] movies"]
        elif (start_pos.startswith('JJ') or start_pos == 'DT') and end_pos == 'VBG':
            templates = ["they are {}with [{}]"]
        else:
            templates = ["they are {}about [{}]"]

    else:
        # Adjectives and everything else
        templates = ["they are {}[{}]"]

    return random.choice([template.format(negation, tag) for template in templates])

def get_summary_basic_2(tag1,tag2,dictionary_tags):
    '''
    Build a preference statement with the adjusted templates.

    Same four templates as the basic version, but both sentence parts come from
    get_first_sentence / get_second_sentence and the connector word is drawn at random.

    param tag1 : (tag text, sign) pair for the first tag
    param tag2 : (tag text, sign) pair for the second tag
    param dictionary_tags : tag metadata dictionary passed on to the sentence builders
    return : the full statement, ending with a period
    '''
    sign1 = tag1[1]

    if len(sign1) != 1:
        # 2nd template (++,+) or (--,-): the second part is negated
        opening = get_first_sentence(tag1[0], dictionary_tags, 1 if sign1 == '++' else 0)
        connector = random.choice(especially_synonyms)
        closing = get_second_sentence(tag2[0], dictionary_tags, 0)
        return "{}, {} if {}.".format(opening, connector, closing)

    if len(tag2[1]) == 2:
        # 1st template (+,++) or (-,--)
        opening = get_first_sentence(tag1[0], dictionary_tags, 1 if sign1 == '+' else 0)
        connector = random.choice(especially_synonyms)
        closing = get_second_sentence(tag2[0], dictionary_tags)
        return "{}, {} if {}.".format(opening, connector, closing)

    if sign1 == 'N':
        # 4th template (N,+) or (N,-): the sentiment comes from the sign of tag2
        opening = get_first_sentence(tag1[0], dictionary_tags, 1 if tag2[1] == '+' else 0)
        closing = get_second_sentence(tag2[0], dictionary_tags)
        return "{} if {}.".format(opening, closing)

    # 3rd template (+,-) or (-,+)
    opening = get_first_sentence(tag1[0], dictionary_tags, 1 if sign1 == '+' else 0)
    connector = random.choice(unless_synonyms)
    closing = get_second_sentence(tag2[0], dictionary_tags)
    return "{}, {} {}.".format(opening, connector, closing)
    

## 1.3 Get N preference statements
Generates n preference statements, using random tags and random preference levels.

In [663]:
def generate_preference_statements(dictionary_tags,pair_tags_list=None,list_pair_preferences=None, num_sentence = 200): 
    '''
    Generate preference statements with both the basic and the adjusted template.

    param dictionary_tags : dict mapping a tag to its metadata; every tag used in a pair must be
                            a key, and each entry must provide 'POS_Tags' plus the keys read by
                            get_first_sentence / get_second_sentence
    param pair_tags_list : optional list of (tag1, tag2) pairs. When None, num_sentence pairs are
                           sampled at random from the keys of dictionary_tags and a random
                           preference pair is drawn for each of them (any list_pair_preferences
                           passed in is ignored in that case)
    param list_pair_preferences : optional list of (sign1, sign2) pairs aligned with pair_tags_list.
                                  When None, one entry of list_of_preference is drawn at random per pair
    param num_sentence : number of statements to generate when pair_tags_list is None
    return : tuple of five lists, aligned by position: basic statements, adjusted statements,
             preference pairs, tag pairs, and the [POS tags of tag1, POS tags of tag2] pairs
    '''
    list_preference_statements_basic = []
    list_preference_statements_adjusted = []
    list_preference = []
    list_tags = []
    list_postags = []

    # "is None" rather than "== None": the latter compares element-wise for array-like inputs.
    if pair_tags_list is None:
        # Sample from the dictionary that is used for the lookups below, so that every sampled
        # tag is guaranteed to be a key (sampling from another tag list could raise KeyError).
        candidate_tags = list(dictionary_tags)
        pair_tags_list = []
        list_pair_preferences = []
        for _ in range(num_sentence):
            pair_tags_list.append(random.sample(candidate_tags,k=2))
            list_pair_preferences.append(random.choice(list_of_preference))
    elif list_pair_preferences is None:
        # Pairs were supplied without preferences: draw one preference pair per tag pair
        # instead of failing later inside zip().
        pair_tags_list = list(pair_tags_list)
        list_pair_preferences = [random.choice(list_of_preference) for _ in pair_tags_list]

    for pair_tags,pair_preferences in zip(pair_tags_list,list_pair_preferences):
        first_tag = (pair_tags[0],pair_preferences[0])
        second_tag = (pair_tags[1],pair_preferences[1])

        list_postags.append([dictionary_tags[pair_tags[0]]['POS_Tags'],dictionary_tags[pair_tags[1]]['POS_Tags']])
        # Same tags and signs rendered with both templates, so the outputs stay comparable.
        list_preference_statements_basic.append(get_summary_basic_1(first_tag,second_tag))
        list_preference_statements_adjusted.append(get_summary_basic_2(first_tag,second_tag,dictionary_tags))
        list_preference.append(pair_preferences)
        list_tags.append(pair_tags)

    return list_preference_statements_basic,list_preference_statements_adjusted, list_preference,list_tags, list_postags

## 1.4 Other function(s)

In [654]:
def save_to_file(title,list_preference_statements, list_preference,list_tags, list_postags):   
    '''
    Save generated statements to an Excel file for review.

    param title : path of the Excel file to write
    param list_preference_statements : one statement per row
    param list_preference : per row, the pair of preference signs
    param list_tags : per row, the pair of tags
    param list_postags : per row, the pair of POS-tag entries
    '''
    # Convert to arrays so that the first / second element of every pair can be sliced as a column.
    tags = np.array(list_tags)
    preferences = np.array(list_preference)
    postags = np.array(list_postags)

    review_table = pd.DataFrame({
        'Preference_statement': list_preference_statements,
        'tag1': tags[:, 0],
        'pos-tag1': postags[:, 0],
        'pref1': preferences[:, 0],
        'tag2': tags[:, 1],
        'pos-tag2': postags[:, 1],
        'pref2': preferences[:, 1],
    })
    review_table.to_excel(title)

In [655]:
# Functions to save train data into several format, to suit the pre-trained models

_PAIR_SEPARATOR = '>>>>>>'        # separates the source sentence from its paraphrase(s)
_END_OF_TEXT = '<|end of text|>'  # written on its own line after every training record


def _strip_tag_brackets(sentence):
    '''Remove the '[' and ']' markers that the templates put around tags.'''
    return sentence.replace('[',"").replace(']',"")


def _write_record(out_file,*sentences):
    '''Write one training record: the bracket-free sentences joined by the separator, then the end-of-text line.'''
    out_file.write(_PAIR_SEPARATOR.join(_strip_tag_brackets(sentence) for sentence in sentences))
    out_file.write('\n')
    out_file.write(_END_OF_TEXT)
    out_file.write('\n')


def save_to_txt(title,list_sentence):
    '''
    Write the sentences to a text file, one per line, without the tag brackets.

    param title : path of the file to write
    param list_sentence : iterable of sentences
    '''
    # UTF-8 is set explicitly, as in the save_train_data* functions, so the output
    # does not depend on the platform's default encoding.
    with open(title,'w',encoding='UTF-8') as out_file:
        for sentence in list_sentence:
            out_file.write(_strip_tag_brackets(sentence))
            out_file.write('\n')

def save_train_data(title,ori,para1,para2,para3):
    '''
    Write one record per original sentence: ori>>>>>>para1>>>>>>para2>>>>>>para3.

    param title : path of the file to write
    param ori : original sentences
    param para1, para2, para3 : paraphrases aligned with ori
    '''
    with open(title,'w',encoding='UTF-8') as out_file:
        for source,first,second,third in zip(ori,para1,para2,para3):
            _write_record(out_file,source,first,second,third)

def save_train_data_1(title,ori,para1):
    '''
    Write one record per original sentence: ori>>>>>>para1.

    param title : path of the file to write
    param ori : original sentences
    param para1 : paraphrases aligned with ori
    '''
    with open(title,'w',encoding='UTF-8') as out_file:
        for source,paraphrase in zip(ori,para1):
            _write_record(out_file,source,paraphrase)

def save_train_data_2(title,ori,para1,para2,para3):
    '''
    Write three records per original sentence, one per paraphrase: ori>>>>>>paraN.

    param title : path of the file to write
    param ori : original sentences
    param para1, para2, para3 : paraphrases aligned with ori
    '''
    with open(title,'w',encoding='UTF-8') as out_file:
        for source,first,second,third in zip(ori,para1,para2,para3):
            for paraphrase in (first,second,third):
                _write_record(out_file,source,paraphrase)

def save_test_data(title,list_sentence_basic):
    '''
    Write the test prompts: each sentence without brackets, followed by the separator.

    param title : path of the file to write
    param list_sentence_basic : iterable of sentences
    '''
    with open(title,'w',encoding='UTF-8') as out_file:
        for sentence in list_sentence_basic:
            out_file.write(_strip_tag_brackets(sentence)+_PAIR_SEPARATOR)
            out_file.write('\n')

In [656]:
#Generate pair of tags:
import numpy as np
first_tags = []
second_tags = []
synonym_first_tags = []
synonym_second_tags = []
pair_tags = [] #for for ori and paraphrase 1

number_of_sentences = 100000

# random.sample() needs a sequence (dict key views are rejected on Python 3.11+),
# so build the list of candidate tags once, outside the loop.
train_tag_pool = list(cleaned_train_tags)

for i in range(number_of_sentences):
    pair= random.sample(train_tag_pool,k=2) # get data from train split
    pair_tags.append(pair)
    first_tags.append(pair[0])
    # One of the first three synonyms of the tag, drawn with probabilities 0.7 / 0.2 / 0.1.
    # NOTE(review): np.random.choice raises if a tag has fewer than three synonyms — confirm every tag has at least three.
    synonym_first_tags.append(np.random.choice(list(train_synonym_dictionary[pair[0]].keys())[0:3],p=[0.7,0.2,0.1]))
    second_tags.append(pair[1])
    synonym_second_tags.append(np.random.choice(list(train_synonym_dictionary[pair[1]].keys())[0:3],p=[0.7,0.2,0.1]))
    
# Paraphrase 2 replaces the first tag by a synonym; paraphrase 3 replaces the second tag.
pair_tags_para2 = list(zip(synonym_first_tags,second_tags))
pair_tags_para3 = list(zip(first_tags,synonym_second_tags))

In [657]:
#dump_pickle('data/dataset/train_data_2/pair_tags_ori_100k.pkl',pair_tags)
#dump_pickle('data/dataset/train_data_2/pair_tags_para1_100k.pkl',pair_tags)
#dump_pickle('data/dataset/train_data_2/pair_tags_para2_100k.pkl',pair_tags_para2)
#dump_pickle('data/dataset/train_data_2/pair_tags_para3_100k.pkl',pair_tags_para3)

## Re generate preference statements for review

In [665]:
# Draw random tag pairs and preferences from all tags and render them with both templates.
(
    list_preference_statements_basic,
    list_preference_statements_adjusted,
    list_preference,
    list_tags,
    list_postags,
) = generate_preference_statements(all_tags)

# One review workbook per template; both share the same tags, signs and POS tags.
save_to_file("basic_user_preferences.xlsx", list_preference_statements_basic, list_preference, list_tags, list_postags)
save_to_file("adjusted_user_preferences.xlsx", list_preference_statements_adjusted, list_preference, list_tags, list_postags)

# 3. Generate Preference statements

In [278]:
#Load selected tags and pair user preferences which have been selected previously to generate statements using both template, and using the same tags and pairs user preference
# Tag pairs stored for the original statements.
pair_tags = load_pickle('data/dataset/train_data_2/pair_tags_ori_50k.pkl')
# NOTE(review): this rebinds pair_tags, so the value loaded on the previous line is discarded — confirm that is intended.
pair_tags = load_pickle('data/dataset/train_data_2/pair_tags_para1_50k.pkl')
# Tag pairs stored for paraphrase 2 and paraphrase 3.
pair_tags_para2=load_pickle('data/dataset/train_data_2/pair_tags_para2_50k.pkl')
pair_tags_para3 = load_pickle('data/dataset/train_data_2/pair_tags_para3_50k.pkl')
# Preference sign pairs to combine with the tag pairs above.
list_pair_preferences = load_pickle('data/dataset/train_data_2/pair_preferences_50k.pkl')

In [282]:
# Generate preference statements with all filtered tags
# This run uses the paraphrase-3 tag pairs with the stored preference pairs; only the
# adjusted-template statements are saved (the basic-template lines are commented out).
list_preference_statements_basic,list_preference_statements_adjusted, list_preference, list_tags, list_postags = generate_preference_statements(train_tags,pair_tags_para3,list_pair_preferences)
#save_to_file('data/dataset/train_data_2/preference_statement_train2_basic_50k.xlsx',list_preference_statements_basic, list_preference,list_tags,list_postags)
save_to_file('data/dataset/train_data_2/preference_statement_train2_para3_50k.xlsx',list_preference_statements_adjusted, list_preference,list_tags,list_postags)
#dump_pickle('data/dataset/train_data_2/list_preference_statements_train2_basic_50k.pkl',list_preference_statements_basic)
dump_pickle('data/dataset/train_data_2/list_preference_statements_train2_para3_50k.pkl',list_preference_statements_adjusted)

# 4. Generate train Data

In [328]:
# Load the 100k original statements and their three paraphrase lists.
ori = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_basic_100k.pkl')
para1 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para1_100k.pkl')
para2 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para2_100k.pkl')
para3 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para3_100k.pkl')

#save_train_data('data/dataset/train_data_2/utf_8_train_data/train_2_100k_ver1.txt',ori,para1,para2,para3)
# "ver2" layout: save_train_data_2 writes one ori>>>>>>paraN record per paraphrase.
save_train_data_2('data/dataset/train_data_2/utf_8_train_data/train_2_100k_ver2.txt',ori,para1,para2,para3)

In [323]:
# Train set 1 pairs each original statement with paraphrase 1 only.
ori = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_basic_100k.pkl')
para1 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para1_100k.pkl')
#para2 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para2_10k.pkl')
#para3 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para3_10k.pkl')

save_train_data_1('data/dataset/train_data_1/New train1 dataset/train_1_100k.txt',ori,para1)

# 5. Generate test Data

In [330]:
n_test_data = 2000
pair_test_preferences = []
pair_test_tags = []

# random.sample() needs a sequence (dict key views are rejected on Python 3.11+),
# so build the list of candidate test tags once, outside the loop.
test_tag_pool = list(cleaned_test_tags)

for i in range(n_test_data):
    pair_test_tags.append(random.sample(test_tag_pool,k=2))
    pair_test_preferences.append(random.choice(list_of_preference))
# Persist the sampled pairs so the same test set can be regenerated later.
dump_pickle('data/dataset/test_data/pair_test_tags_2000.pkl',pair_test_tags)
dump_pickle('data/dataset/test_data/pair_test_preferences_2000.pkl',pair_test_preferences)

In [341]:
# Render the sampled test pairs with both templates, then store each result as a review
# workbook (.xlsx) and as a pickled list.
list_preference_statements_basic,list_preference_statements_adjusted, list_preference, list_tags, list_postags = generate_preference_statements(test_tags,pair_test_tags,pair_test_preferences)
save_to_file('data/dataset/test_data/preference_statement_test_basic_2000.xlsx',list_preference_statements_basic, list_preference,list_tags,list_postags)
save_to_file('data/dataset/test_data/preference_statement_test_adjusted_2000.xlsx',list_preference_statements_adjusted, list_preference,list_tags,list_postags)
dump_pickle('data/dataset/test_data/list_preference_statements_test_basic_2000.pkl',list_preference_statements_basic)
dump_pickle('data/dataset/test_data/list_preference_statements_test_adjusted_2000.pkl',list_preference_statements_adjusted)

In [342]:
# Write the test prompts (statement followed by the '>>>>>>' separator) for both templates.
save_test_data('data/dataset/test_data/test_2000_basic.txt',list_preference_statements_basic)
save_test_data('data/dataset/test_data/test_2000_adjusted.txt', list_preference_statements_adjusted)

# 6. Generate MSCOCO Data

In [305]:
# Read the MSCOCO source/target sentence files; the with-blocks close the
# file handles as soon as the lines have been read.
with open("data/mscoco/train_source.txt",'r') as file1:
    mscoco_source = file1.readlines()
with open("data/mscoco/train_target.txt",'r') as file2:
    mscoco_target = file2.readlines()

# Drop the newline characters that readlines() keeps.
mscoco_source = [x.replace("\n",'') for x in mscoco_source]#[264930:]
mscoco_target = [x.replace("\n",'') for x in mscoco_target]#[264930:]

In [306]:
#save_train_data_1("data/mscoco/mscoco_val.txt",mscoco_source,mscoco_target)

In [108]:
def save_mixed_train_data_2(title,ori,para1,para2,para3,mscoco_source,mscoco_target):
    '''
    Write a mixed training file: per original statement, three ori>>>>>>paraN records
    followed by two MSCOCO source>>>>>>target records.

    param title : path of the file to write (UTF-8)
    param ori : original statements
    param para1, para2, para3 : paraphrases aligned with ori
    param mscoco_source : per row, an indexable pair of MSCOCO source sentences
    param mscoco_target : per row, an indexable pair of MSCOCO target sentences

    Tag brackets are stripped from the statements and paraphrases only; the MSCOCO
    text is written unchanged. Every record is followed by a '<|end of text|>' line.
    '''
    def strip_brackets(sentence):
        return sentence.replace('[',"").replace(']',"")

    # The with-block closes the file even if a write fails part-way through.
    with open(title,'w',encoding='UTF-8') as out_file:
        for statement,first,second,third,mscoco_src,mscoco_tgt in zip(ori,para1,para2,para3,mscoco_source,mscoco_target):
            records = [strip_brackets(statement)+'>>>>>>'+strip_brackets(paraphrase)
                       for paraphrase in (first,second,third)]
            records += [mscoco_src[position]+'>>>>>>'+mscoco_tgt[position] for position in (0,1)]
            for record in records:
                out_file.write(record)
                out_file.write('\n')
                out_file.write('<|end of text|>')
                out_file.write('\n')

In [140]:
def save_mixed_train_data_1(title,ori,para1,mscoco_source,mscoco_target):
    '''
    Write a mixed training file: per original statement, one ori>>>>>>para1 record
    followed by one MSCOCO source>>>>>>target record.

    param title : path of the file to write (UTF-8)
    param ori : original statements
    param para1 : paraphrases aligned with ori
    param mscoco_source : MSCOCO source sentences
    param mscoco_target : MSCOCO target sentences aligned with mscoco_source

    Tag brackets are stripped from the statements and paraphrases only; the MSCOCO
    text is written unchanged. Every record is followed by a '<|end of text|>' line.
    '''
    # The with-block closes the file even if a write fails part-way through.
    with open(title,'w',encoding='UTF-8') as out_file:
        for statement,paraphrase,mscoco_src,mscoco_tgt in zip(ori,para1,mscoco_source,mscoco_target):
            preference_record = statement.replace('[',"").replace(']',"")+'>>>>>>'+paraphrase.replace('[',"").replace(']',"")
            mscoco_record = mscoco_src+'>>>>>>'+mscoco_tgt
            for record in (preference_record,mscoco_record):
                out_file.write(record)
                out_file.write('\n')
                out_file.write('<|end of text|>')
                out_file.write('\n')

# 7. Generate Mixed Data

In [110]:
# Load the 50k original statements and their three paraphrase lists.
ori = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_basic_50k.pkl')
para1 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para1_50k.pkl')
para2 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para2_50k.pkl')
para3 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para3_50k.pkl')
# Reshape the first 100000 MSCOCO lines into 50000 rows of 2 sentences each.
# NOTE(review): despite the "_10k" suffix these arrays hold 50000 rows; the reshape fails if fewer than 100000 lines were read.
mscoco_source_10k = np.array(mscoco_source[0:100000]).reshape(50000,2)
mscoco_target_10k = np.array(mscoco_target[0:100000]).reshape(50000,2)
#save_train_data('data/dataset/train_data_2/utf_8_train_data/train_2_100k_ver1.txt',ori,para1,para2,para3)
save_mixed_train_data_2('data/dataset/train_data_2/utf_8_train_data/mixed_train_2_50k_ver2.txt',ori,para1,para2,para3,mscoco_source_10k,mscoco_target_10k)

In [142]:
# Mixed train set 1: the 50k original statements with paraphrase 1, combined with the
# first 50000 MSCOCO source/target lines.
ori = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_basic_50k.pkl')
para1 = load_pickle('data/dataset/train_data_2/list_preference_statements_train2_para1_50k.pkl')
mscoco_source_50k = mscoco_source[0:50000]
mscoco_target_50k = mscoco_target[0:50000]
#save_train_data('data/dataset/train_data_2/utf_8_train_data/train_2_100k_ver1.txt',ori,para1,para2,para3)
save_mixed_train_data_1('data/dataset/train_data_1/New train1 dataset/mixed_train_1_50k.txt',ori,para1,mscoco_source_50k,mscoco_target_50k)

# Generate statements from the template-based approach for comparison with the neural paraphrase approach (for the questionnaire)
The statements are generated from test_20_final.txt. Since this is not for training the neural model, the synonyms can be taken from all tags; taking them only from the test tags would not result in good synonyms.

In [361]:
# Load the tagged-tags dictionary snapshot stored under data/output.
all_tagged_tags = load_pickle("data/output/tagged_tags_dictionary_withsents_film_17_05_20.pkl")

In [365]:
# Synonym dictionary loaded from the "all" synonym file (the train-only file is loaded separately as train_synonym_dictionary).
all_synonyms = load_pickle("data/tags_synonym/top_10_synonym_all.pkl")

In [409]:
# For every test tag pair, replace both tags by one of their first three synonyms,
# drawn with probabilities 0.7 / 0.2 / 0.1.
# NOTE(review): np.random.choice raises if a tag has fewer than three synonyms — confirm every tag has at least three.
pair_test_data_synonym=[]
for pair in pair_test_tags:
    synonym_0 = np.random.choice(list(all_synonyms[pair[0]].keys())[0:3],p=[0.7,0.2,0.1])
    synonym_1 = np.random.choice(list(all_synonyms[pair[1]].keys())[0:3],p=[0.7,0.2,0.1])
    pair_test_data_synonym.append((synonym_0,synonym_1))
    

In [633]:
# Ad-hoc lookup: print every key of the train synonym dictionary that contains "scandal".
for i in train_synonym_dictionary:
    if "scandal" not in i:
        continue
    print (i)