## Prepare Referece files using TFIDF for retrieving attributes


In [1]:
from google.colab import drive
#drive.mount('/content/gdrive')

drive.mount("/content/gdrive", force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
#switch to gpu
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [0]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def read_file(path):
    with open(path) as fp:
        lines = fp.read().splitlines()
    return lines

In [0]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [7]:
#I think this needs to happen before I can run the proceeding cells.
% cd '/content/gdrive/My Drive/humor_style_transfer/reddit_jokes/joke-dataset/style_trans_preprocessing'
#%ls

/content/gdrive/My Drive/humor_style_transfer/reddit_jokes/joke-dataset/style_trans_preprocessing


In [0]:
'''
Note on Reference files:
  - the reference file output from this file is used in inference
  -> ex: if we want to generate jokes from captions:
    * our input "ref0_processed" must be the captions (or captions which have undergone deletion), marked up w/ content & style tags.
    * our input "ref0_org" must be the captions, without any markup.
    * our output "reference1.txt" must be generated by combining each caption w/ closest elt in "annoyance" tree. 
'''

train0_org = read_file("jokes.train.0") # Training data of negative sentiment
train1_org = read_file("jokes.train.1") # Training data of positive sentiment

## run this file on the captions.
#ref0_processed = read_file("./processed_files_with_bert_with_best_head/jokes.test.0") # Reference data for delete_generate model
ref0_processed = read_file('/content/gdrive/My Drive/humor_style_transfer/flickr8k/flickr_drg_input_v1.txt')
#ref0_org = read_file("./jokes.test.0") # Original Refrence_0 data
ref0_org = read_file("/content/gdrive/My Drive/humor_style_transfer/flickr8k/flickr_drg_input.txt")

train0_processed = read_file("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/jokes_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/jokes_train_1_all_attrs.txt") # training data with content and attributes seperation

In [9]:
#find the caption file we want
#%ls '/content/gdrive/My Drive/humor_style_transfer/flickr8k/'
!head -10 /content/gdrive/My\ Drive/humor_style_transfer/flickr8k/flickr_drg_input_v1.txt
#!head -10 ./jokes.test.0

<POS> <CON_START> child in a pink dress is climbing up a set of stairs in an entry way . <START>
<POS> <CON_START> girl going into a wooden building . <START>
<POS> <CON_START> little girl climbing into a wooden playhouse . <START>
<POS> <CON_START> little girl climbing the stairs to her playhouse . <START>
<POS> <CON_START> little girl in a pink dress going into a wooden cabin . <START>
<POS> <CON_START> black dog and a spotted dog are . <START>
<POS> <CON_START> black dog and a tri-colored dog playing with each other on the road . <START>
<POS> <CON_START> black dog and a white dog with brown spots are staring at each other in the street . <START>
<POS> <CON_START> dogs of different breeds looking at each other on the road . <START>
<POS> <CON_START> dogs on pavement moving toward each other . <START>


In [0]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
#ref1_con = [clean_text(x) for x in ref1_processed]

In [11]:
ref0_org[:4], ref0_con[:4]

(['child in a pink dress is climbing up a set of stairs in an entry way .',
  'girl going into a wooden building .',
  'little girl climbing into a wooden playhouse .',
  'little girl climbing the stairs to her playhouse .'],
 ['child in a pink dress is climbing up a set of stairs in an entry way .',
  'girl going into a wooden building .',
  'little girl climbing into a wooden playhouse .',
  'little girl climbing the stairs to her playhouse .'])

In [0]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [0]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [14]:
get_train_attrs(train0_processed[0])

['Heard', 'president?', 'He', 'got', 'impeached.']

In [25]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> Heard president? He got impeached. <CON_START> heard about the fruit that became a president ? he . <START> Heard about the fruit that became a president? He got impeached. <END>',
  '<ATTR_WORDS> Dad joke[nsfw]. "Hey joke" Dad:"Pussy" Son:"I don\'t it" Dad:"I know hahahaha" <CON_START> dad tell me a joke [ nsfw ] . " hey dad tell me a joke " dad : " " son : " i don \' t get it " <START> Dad tell me a joke[nsfw]. "Hey dad tell me a joke"  Dad:"Pussy"  Son:"I don\'t get it"  Dad:"I know hahahaha" <END>',
  '<ATTR_WORDS> NSFW My today. She charged peddling pussy. <CON_START> nsfw my girlfriend was arrested for riding her bike today . she was with . <START> NSFW My girlfriend was arrested for riding her bike today. She was charged with peddling pussy. <END>',
  "<ATTR_WORDS> How women's March? That hilarious. You feminists dick hard. <CON_START> how was there no jackass in a giant penis costume at the women ' s march ? that would of been hilarious . you would never see bea

In [0]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [16]:
train0_con[:4], train1_con[:4]

(['heard about the fruit that became a president ? he .',
  'dad tell me a joke [ nsfw ] . " hey dad tell me a joke " dad : " " son : " i don \' t get it "',
  'nsfw my girlfriend was arrested for riding her bike today . she was with .',
  "how was there no jackass in a giant penis costume at the women ' s march ? that would of been hilarious . you would never see beat a so ."],
 ['why do batteries feel lonely ? because they are in .',
  'why do they call " roach - clips " , " roach - clips " ? ? ? because was already . i know it \' s like a diagram of',
  "what ' s the difference between a chickpea and a green pea ? wouldn ' t pay $ 1000 to have a on him .",
  "i posted a question about the brightest star in the night sky , but all i got were joke replies . should ' ve added the ] ."])

In [0]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [0]:
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
#conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_pos_ref_vecs = tfidf.transform(ref0_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)

Sid's notes: Above, we retrieve the positive and negative generated training data. Then we prepare to get tf-idf scores for all words. 
Below, I get lost. What is "Annoyance" exactly?

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [19]:
!pip install annoy

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/00/15/5a9db225ebda93a235aebd5e42bbf83ab7035e7e4783c6cb528c635c9afb/annoy-1.16.3.tar.gz (644kB)
[K     |▌                               | 10kB 21.7MB/s eta 0:00:01[K     |█                               | 20kB 4.1MB/s eta 0:00:01[K     |█▌                              | 30kB 5.3MB/s eta 0:00:01[K     |██                              | 40kB 5.6MB/s eta 0:00:01[K     |██▌                             | 51kB 4.6MB/s eta 0:00:01[K     |███                             | 61kB 5.1MB/s eta 0:00:01[K     |███▋                            | 71kB 5.6MB/s eta 0:00:01[K     |████                            | 81kB 6.0MB/s eta 0:00:01[K     |████▋                           | 92kB 6.4MB/s eta 0:00:01[K     |█████                           | 102kB 6.3MB/s eta 0:00:01[K     |█████▋                          | 112kB 6.3MB/s eta 0:00:01[K     |██████                          | 122kB 6.3MB/s eta 0:00:01[K    

In [0]:
from annoy import AnnoyIndex

In [21]:
#in case you want to reconstruct the tree...
print(conts_neg_vecs.shape)
print(conts_pos_vecs.shape)

(78416, 30849)
(55970, 30849)


In [0]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

  """Entry point for launching an IPython kernel.
  


In [0]:
# We have randomly selected training samples to control the memory usage
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

In [0]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [03:24<00:00, 245.05it/s]


In [0]:
train0_tree.build(50)
train0_tree.save('tfidf_train0.ann')

True

In [23]:
#[alternative to above] (don't run if you just saved the tree)
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train0_tree.load('tfidf_train0.ann')

  


True

In [24]:
ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])

NameError: ignored

In [35]:
# [CUSTOMIZE]: replace output file. (Remember, reference_1 --> bad jokes; reference_0 --> good jokes.)
maxlines = 1000
ctr = 0
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/flickr_reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        #potentially limit # lines.
        ctr += 1
        if ctr >= maxlines:
            break
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i] #ref1 --> ref0
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> What car? :P <CON_START> child in a pink dress is climbing up a set of stairs in an entry way . <START>

<ATTR_WORDS> How German? Mein Karmph <CON_START> girl going into a wooden building . <START>

<ATTR_WORDS> How German? Mein Karmph <CON_START> little girl climbing into a wooden playhouse . <START>

<ATTR_WORDS> My Life. I betting..What <CON_START> little girl climbing the stairs to her playhouse . <START>

<ATTR_WORDS> Which Area 51? The C.I.ayy <CON_START> little girl in a pink dress going into a wooden cabin . <START>

<ATTR_WORDS> A I up. What neverland? She peed pants (read loud) <CON_START> black dog and a spotted dog are . <START>

<ATTR_WORDS> What's cybercrime? At moment joke. <CON_START> black dog and a tri-colored dog playing with each other on the road . <START>

<ATTR_WORDS> What's cybercrime? At moment joke. <CON_START> black dog and a white dog with brown spots are staring at each other in the street . <START>

<ATTR_WORDS> What's sticky? A stick. *I'm I 

In [27]:
%ls ./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf

flickr_reference_0.txt  reference_0.txt
flickr_reference_1.txt  reference_1.txt


In [0]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [03:29<00:00, 238.48it/s]


In [0]:
train1_tree.build(50)
train1_tree.save('tfidf_train1.ann')

True

In [25]:
#[alternative to above] (don't run if you just saved the tree)
train1_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree.load('tfidf_train1.ann')

  


True

In [26]:
with open("./processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/flickr_reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> I've fat, alcoholic, transvestite. All eat, drink be Mary!!. <CON_START> child in a pink dress is climbing up a set of stairs in an entry way . <START>

1 <ATTR_WORDS> How idiot suspense? <CON_START> girl going into a wooden building . <START>

2 <ATTR_WORDS> I today... And I still time golf. <CON_START> little girl climbing into a wooden playhouse . <START>

3 <ATTR_WORDS> I today... And I still time golf. <CON_START> little girl climbing the stairs to her playhouse . <START>

4 <ATTR_WORDS> I've fat, alcoholic, transvestite. All eat, drink be Mary!!. <CON_START> little girl in a pink dress going into a wooden cabin . <START>

5 <ATTR_WORDS> I I girlfriend. Not fucking STACY YOU WHORE! <CON_START> black dog and a spotted dog are . <START>

6 <ATTR_WORDS> I I girlfriend. Not fucking STACY YOU WHORE! <CON_START> black dog and a tri-colored dog playing with each other on the road . <START>

7 <ATTR_WORDS> What live world? Molar opposites. <CON_START> black dog and a white 

KeyboardInterrupt: ignored