## Prepare Referece files using TFIDF for retrieving attributes


In [0]:
from google.colab import drive
#drive.mount('/content/gdrive')

drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [0]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

In [0]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [0]:
#I think this needs to happen before I can run the proceeding cells.
% cd '/content/gdrive/My Drive/humor_style_transfer/reddit_jokes/joke-dataset/style_trans_preprocessing'
#%ls

[0m[01;34mbert_classifier_training[0m/  jokes.test.1
joke_captions              jokes.train.0
jokes.dev.0                jokes.train.1
jokes.dev.1                [01;34mprocessed_files_with_bert_with_best_head[0m/
jokes.test.0


In [0]:
### [customize] change file paths to proper dataset.

train0_org = read_file("jokes.train.0") # Training data of negative sentiment
train1_org = read_file("jokes.train.1") # Training data of positive sentiment
## don't have ref files... replacing with "test"...
ref0_processed = read_file("./processed_files_with_bert_with_best_head/jokes.test.0") # Reference data for delete_generate model
ref1_processed = read_file("./processed_files_with_bert_with_best_head/jokes.test.1") # Reference data for delete_generate model
ref0_org = read_file("./jokes.test.0") # Original Refrence_0 data
ref1_org = read_file("./jokes.test.1") # Original Refrence_1 data
train0_processed = read_file("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/jokes_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/jokes_train_1_all_attrs.txt") # training data with content and attributes seperation

In [0]:
# Get the Original Reference Sentence
ref0_org = [x.split("\t")[0] for x in ref0_org]
ref1_org = [x.split("\t")[0] for x in ref1_org]

In [0]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
ref1_con = [clean_text(x) for x in ref1_processed]

In [0]:
ref0_org[:4], ref0_con[:4]

(["ever since joes has changed hands it 's just gotten worse and worse .",
  'there is definitely not enough room in that part of the venue .',
  'so basically tasted watered down .',
  "she said she 'd be back and disappeared for a few minutes ."],
 ["ever since joes has changed hands it ' s just gotten and worse . ever since joes has changed hands it ' s gotten and .",
  'there is not enough in that part of the venue . there is so much in that part of the venue',
  "so basically watered down . it didn ' t down at all .",
  "she she ' d be back and for a few . she said she ' d be back , and didn ' t disappear at all ."])

In [0]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [0]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [0]:
get_train_attrs(train0_processed[0])

['sadly']

In [0]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> sadly <CON_START> i was mistaken . <START> i was sadly mistaken . <END>',
  '<ATTR_WORDS> so to , <CON_START> on the hoagies the italian is general run of the mill . <START> so on to the hoagies , the italian is general run of the mill . <END>',
  '<ATTR_WORDS> minimal meat <CON_START> and a ton of shredded lettuce . <START> minimal meat and a ton of shredded lettuce . <END>',
  '<ATTR_WORDS> nothing really & _num_ <CON_START> special not worthy of the $ _ num _ price tag . <START> nothing really special & not worthy of the $ _num_ price tag . <END>'],
 ['<ATTR_WORDS> excellent <CON_START> food . <START> excellent food . <END>',
  '<ATTR_WORDS> superb <CON_START> customer service . <START> superb customer service . <END>',
  '<ATTR_WORDS> also specials good <CON_START> they have daily and ice cream which is really . <START> they also have daily specials and ice cream which is really good . <END>',
  "<ATTR_WORDS> 's good <CON_START> it ' a toasted hoagie . <START> it 's

In [0]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [0]:
train0_con[:4], train1_con[:4]

(['i was mistaken .',
  'on the hoagies the italian is general run of the mill .',
  'and a ton of shredded lettuce .',
  'special not worthy of the $ _ num _ price tag .'],
 ['food .',
  'customer service .',
  'they have daily and ice cream which is really .',
  "it ' a toasted hoagie ."])

In [0]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [0]:
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

Sid's notes: Above, we retrieve the positive and negative generated training data. Then we prepare to get tf-idf scores for all words. 
Below, I get lost. What is "Annoyance" exactly?

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [0]:
!pip install annoy



In [0]:
from annoy import AnnoyIndex

In [0]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

  """Entry point for launching an IPython kernel.
  


In [0]:
# We have randomly selected training samples to control the memory usage
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

In [0]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [01:16<00:00, 649.72it/s]


In [0]:
train0_tree.build(50)
train0_tree.save('tfidf_train0.ann')

True

In [0]:
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

(["it ' s small yet they you right at home . it ' s small yet they make you like a stranger .",
  "i will going back and enjoying this great place ! i ' be going back and suffering at this terrible place !",
  'the were and a pour . the drinks were expensive and half full .'],
 'service')

In [0]:
%ls "./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf"

reference_0.txt  reference_1.txt


In [0]:
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> did not anything <CON_START> it ' s small yet they you right at home . it ' s small yet they make you like a stranger . <START>

<ATTR_WORDS> 've here <CON_START> i will going back and enjoying this great place ! i ' be going back and suffering at this terrible place ! <START>

<ATTR_WORDS> but _num_ star for <CON_START> the were and a pour . the drinks were expensive and half full . <START>

<ATTR_WORDS> also , does n't <CON_START> my husband got a ruben , he it . my husband got a reuben sandwich , he it . <START>

<ATTR_WORDS> audi , <CON_START> i up for their email and got a coupon . i signed up for their email and . <START>

<ATTR_WORDS> from nothing previous <CON_START> i ' d definitely recommend giving them a . i ' d definitely recommend giving them a . <START>

<ATTR_WORDS> 's enough <CON_START> i highly e & m . i highly avoiding e & m painting . <START>

<ATTR_WORDS> staff , <CON_START> a great and we will go again . otherwise a terrible and we will not go again . 

In [0]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [01:24<00:00, 592.98it/s]


In [0]:
train1_tree.build(50)
train1_tree.save('tfidf_train1.ann')

True

In [0]:
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> service been <CON_START> ever since joes has changed hands it ' s just gotten and worse . ever since joes has changed hands it ' s gotten and . <START>

1 <ATTR_WORDS> place nice stores <CON_START> there is not enough in that part of the venue . there is so much in that part of the venue <START>

2 <ATTR_WORDS> also other 's <CON_START> so basically watered down . it didn ' t down at all . <START>

3 <ATTR_WORDS> definitely more <CON_START> she she ' d be back and for a few . she said she ' d be back , and didn ' t disappear at all . <START>

4 <ATTR_WORDS> rooms amazing set-up <CON_START> i ca n ' t believe how this pharmacy is . this is really . <START>

5 <ATTR_WORDS> great <CON_START> and took it off the bill . left and put it on the bill . <START>

6 <ATTR_WORDS> 's also <CON_START> it is n ' t terrible , but it is n ' t very either . it is n ' t , but it is very . <START>

7 <ATTR_WORDS> always excellent if 's <CON_START> definitely that i could not use my birthday