In [None]:
# Mount Google Drive so the datasets under /content/gdrive are readable from this runtime.

from google.colab import drive

# Mounts at /content/gdrive; the dataset paths used in the cells below are rooted here.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# List the dataset directory to confirm the Drive mount exposes the expected files.
! ls '/content/gdrive/MyDrive/humor_generation/datasets/'

glove.840B.300d.txt	    News_Category_Dataset_v2.json
glove.840B.300d.zip.1	    news-headlines-humor.csv
humorous_words_dataset.csv  news-headlines-humor.gsheet


### Pipeline Structure:

1. **Preprocessing**: Identify the "object" (`obj`) and "oblique nominal" (`obl`) nouns in each headline, as well as the verbs they are attached to.
2. **Word Context Replacement**: Train a CBOW model to identify alternatives to the target object.
3. **Silly Synonyms**: Find "sillier" words to use in place of the object and its associated verb.
4. **Scoring**: After the word replacements, filter out the outputs that score low on grammaticality, and prefer those that have high "surprisal ratios".

In [None]:
# install stanza

!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |█▍                              | 10kB 227kB/s eta 0:00:01[K     |██▉                             | 20kB 451kB/s eta 0:00:01[K     |████▎                           | 30kB 655kB/s eta 0:00:01[K     |█████▊                          | 40kB 823kB/s eta 0:00:01[K     |███████▏                        | 51kB 1.0MB/s eta 0:00:01[K     |████████▋                       | 61kB 1.2MB/s eta 0:00:01[K     |██████████                      | 71kB 1.3MB/s eta 0:00:01[K     |███████████▌                    | 81kB 1.5MB/s eta 0:00:01[K     |█████████████                   | 92kB 1.7MB/s eta 0:00:01[K     |██████████████▍                 | 102kB 1.8MB/s eta 0:00:01[K     |███████████████▉                | 112kB 1.8MB/s eta 0:00:01[K     |█████████████████▎              | 122kB 1.8MB/s eta 0:00:

In [None]:
import stanza
# Fetch the default English model package (the log below shows it saved under /root/stanza_resources).
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 19.5MB/s]                    
2020-12-06 23:39:38 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/default.zip: 100%|██████████| 428M/428M [06:32<00:00, 1.09MB/s]
2020-12-06 23:46:19 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
import stanza

# Build the default English pipeline; per the load log it includes
# tokenize, pos, lemma, depparse, sentiment and ner.
nlp = stanza.Pipeline('en')

2020-12-07 00:31:51 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-12-07 00:31:51 INFO: Use device: cpu
2020-12-07 00:31:51 INFO: Loading: tokenize
2020-12-07 00:31:51 INFO: Loading: pos
2020-12-07 00:31:53 INFO: Loading: lemma
2020-12-07 00:31:53 INFO: Loading: depparse
2020-12-07 00:31:55 INFO: Loading: sentiment
2020-12-07 00:31:57 INFO: Loading: ner
2020-12-07 00:31:59 INFO: Done loading processors!


In [None]:
# Parse each headline with stanza and record its obj/obl targets and their governing verbs.
import stanza
import csv

# NOTE(review): this rebuilds the same default pipeline created in an earlier cell;
# it keeps this cell runnable on its own but reloads every model.
nlp = stanza.Pipeline('en')

def _char_span(misc):
  """Parse a stanza ``misc`` string such as 'start_char=3|end_char=13'.

  Fields are looked up by name, so extra fields or a different field order
  are tolerated.

  Returns:
      (start, end) as ints, or None when the offsets are absent or malformed.
  """
  fields = {}
  for part in (misc or '').split('|'):
    key, sep, value = part.partition('=')
    if sep:
      fields[key] = value
  try:
    return int(fields['start_char']), int(fields['end_char'])
  except (KeyError, ValueError):
    return None


def parse_tgt_pos(wd, sent, hdl_orig, id):
  """Build one output row for a target word found in a parsed headline.

  Args:
      wd: the target word (tokenized & parsed); reads .text, .misc, .head, .id.
      sent: the parsed sentence containing `wd`; reads .words, where each word
          exposes .text, .misc, .upos and .deprel.
      hdl_orig: the headline text that was parsed; the character offsets in
          `misc` index into this string.
      id: identifier of the headline, copied into the row unchanged.
          (Shadows the builtin; the name is kept so existing callers still work.)

  Returns:
      [id, hdl_final, tgt_text, tgt_loc, vb_text, vb_loc] where
      - hdl_final is `hdl_orig` with any 'flat' modifiers that directly follow
        the target removed (together with the whitespace in front of them);
      - tgt_loc / vb_loc are 'start_char=N|end_char=M' strings that index into
        hdl_final;
      - vb_text / vb_loc are '' when the target's head is not a VERB or the
        target is the sentence root.

  Raises:
      ValueError: if a 'flat' modifier carries no usable character offsets.
  """
  tgt_text = wd.text
  tgt_loc = wd.misc

  # id the 'action' verb, if there is one.
  vb_text = ''
  vb_loc = ''
  head_ind = int(wd.head) - 1
  # head == 0 marks the root, which has no governor; without this guard the
  # index -1 would silently select the LAST word of the sentence.
  if head_ind >= 0:
    head = sent.words[head_ind]
    if head.upos == 'VERB':
      vb_text = head.text
      vb_loc = head.misc

  # find the run of 'flat' noun modifiers that immediately follows the target.
  # wd.id is 1-based, so slicing from int(wd.id) starts at the next word.
  del_start = None
  del_end = None
  for wi in sent.words[int(wd.id):]:
    if wi.deprel != 'flat':
      break
    print('[debug] deleting "flat" modifier', wi.text)
    span = _char_span(wi.misc)
    if span is None:
      raise ValueError('flat modifier %r has no character offsets: %r' % (wi.text, wi.misc))
    sind, eind = span
    del_start = sind if del_start is None else min(del_start, sind)
    del_end = eind if del_end is None else max(del_end, eind)

  hdl_final = hdl_orig
  if del_end is not None:
    # end_char is exclusive, so the cut stops exactly at del_end; cutting one
    # further would swallow whatever follows the modifier (e.g. a comma).
    # The separating whitespace is removed from the front instead.
    cut_start = min(del_start, len(hdl_orig))
    while cut_start > 0 and hdl_orig[cut_start - 1].isspace():
      cut_start -= 1
    hdl_final = hdl_orig[:cut_start] + hdl_orig[del_end:]
    print("[debug] modified sentence: ", hdl_final)

    # a verb located after the removed span has moved left; shift its offsets
    # so vb_loc keeps pointing at the verb inside hdl_final. The target always
    # precedes its flat modifiers, so tgt_loc needs no adjustment.
    removed = len(hdl_orig) - len(hdl_final)
    vb_span = _char_span(vb_loc)
    if vb_span is not None and vb_span[0] >= del_end:
      vb_loc = 'start_char=%d|end_char=%d' % (vb_span[0] - removed, vb_span[1] - removed)

  final_row = [id, hdl_final, tgt_text, tgt_loc, vb_text, vb_loc]
  return final_row


def parse_id_vb_tgt(input_path, output_path):
  """Dependency-parse every headline in a CSV and write its obj/obl targets.

  Relies on the module-level stanza pipeline `nlp` and on `parse_tgt_pos`.

  Args:
      input_path: CSV with a header row followed by 3-column rows; the first
          column is the headline id and the second the headline text (the
          third column is ignored). '<' and '/>' markers in the headline are
          stripped before parsing.
      output_path: CSV to (over)write. It gets a header row and then one row
          per target word: [id, sentence, target text, target loc, verb text,
          verb loc]. A headline with several targets yields several rows.

  Raises:
      ValueError: if a non-empty input row does not have exactly 3 columns.
  """
  # obj = object of verb, obl = oblique nominal (verb modifier).
  #   [note] 'nmod' (noun modifier) is left out on purpose: it is very common
  #   and hit or miss as a replacement target.
  target_deprels = frozenset(['obj', 'obl'])
  # rows for the first few input lines are echoed as a sanity check.
  debug_rows = 9

  # The input is opened first so a missing input file does not truncate an
  # existing output. newline='' is what the csv module asks for: it keeps
  # newlines inside quoted fields intact on read and avoids doubled line
  # endings on write.
  with open(input_path, newline='') as fin, open(output_path, 'w', newline='') as fout:
    out_writer = csv.writer(fout)
    out_writer.writerow(['id', 'sentence', 'target text', 'target loc', 'verb text', 'verb loc'])  # header: [id, hdl_final, tgt_text, tgt_loc, vb_text, vb_loc]

    news_reader = csv.reader(fin)
    next(news_reader)  # skip the header

    for ctr, line in enumerate(news_reader, start=1):
      if not line:
        # csv.reader yields [] for a blank line; nothing to parse.
        continue
      row_id, hdl_orig, _ = line
      # first, disregard the special characters; do dep. parsing
      hdl_orig = hdl_orig.replace('<', '').replace('/>', '')
      doc = nlp(hdl_orig)

      # next, identify the "object", identify "verb", delete "flat" wds
      for sent in doc.sentences:
        for wd in sent.words:
          if wd.deprel not in target_deprels:
            continue
          output_row = parse_tgt_pos(wd, sent, hdl_orig, row_id)
          if ctr <= debug_rows:
            print(output_row)
            print('\n')
          # write output to file
          out_writer.writerow(output_row)


#--------------------------#
# Run the extraction: read the raw headlines CSV and write one parsed row
# per obj/obl target to a sibling file in the same Drive folder.
input_path = '/content/gdrive/MyDrive/humor_generation/datasets/news-headlines-humor.csv'
output_path = '/content/gdrive/MyDrive/humor_generation/datasets/news-headlines-humor-parsed.csv'

# NOTE: output_path is opened in write mode, so any previous parse is overwritten.
parse_id_vb_tgt(input_path, output_path)
          


2020-12-07 02:43:30 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-12-07 02:43:30 INFO: Use device: cpu
2020-12-07 02:43:30 INFO: Loading: tokenize
2020-12-07 02:43:30 INFO: Loading: pos
2020-12-07 02:43:31 INFO: Loading: lemma
2020-12-07 02:43:31 INFO: Loading: depparse
2020-12-07 02:43:33 INFO: Loading: sentiment
2020-12-07 02:43:34 INFO: Loading: ner
2020-12-07 02:43:35 INFO: Done loading processors!


['1723', 'Thousands of gay and bisexual men convicted of long-abolished sexual offences are posthumously pardoned', 'offences', 'start_char=69|end_char=77', 'convicted', 'start_char=34|end_char=43']


[debug] deleting "flat" modifier Russia
[debug] modified sentence:  Special prosecutor appointed to Trump 
['12736', 'Special prosecutor appointed to Trump ', 'Trump', 'start_char=32|end_char=37', 'appointed', 'start_char=19|end_char=28']


['12274', 'Spanish police detain man and search Ripoll addresses in hunt for terror suspects ', 'man', 'start_char=22|end_char=25', 'detain', 'start_char=15|end_char=21']


['12274', 'Spanish police detain man and search Ripoll addresses in hunt for terror suspects ', 'hunt', 'start_char=57|end_char=61', 'addresses', 'start_char=44|end_char=53']


['12274', 'Spanish police detain man and search Ripoll addresses in hunt for terror suspects ', 'suspects', 'start_char=73|end_char=81', 'addresses', 'start_char=44|end_char=53']


['8823', "N.Y. Times reprim