In [None]:
'''
  Here, we will try to train a CBOW model on news articles and Tweets.
  It should be able to find words that fit into a particular news headline.


  A couple of different options for datasets:
    1. https://www.kaggle.com/rmisra/news-category-dataset
      * This dataset contains a bunch of politics (and other subject) - related news headlines.
      ~ Let's maybe start with this one.

    2. https://www.kaggle.com/snapcrack/all-the-news
      * This dataset contains full news articles; you can separate them by publication.
      * In particular, I think it might be a good idea to train on Breitbart and have the model spew a bunch of ridiculous garbage.
      * Though remember, we are only filling in one word...
  


'''


# NOTE: this unauthenticated request is redirected (302) to the Kaggle login
# page, so the file saved as `download` is an HTML page, not the dataset (see
# the output below). The cells further down read the dataset from Google Drive.
!wget https://www.kaggle.com/rmisra/news-category-dataset/download


--2020-12-02 10:24:50--  https://www.kaggle.com/rmisra/news-category-dataset/download
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /account/login?titleType=dataset-downloads&showDatasetDownloadSkip=False&messageId=16&returnUrl=%2Frmisra%2Fnews-category-dataset%3Fresource%3Ddownload [following]
--2020-12-02 10:24:51--  https://www.kaggle.com/account/login?titleType=dataset-downloads&showDatasetDownloadSkip=False&messageId=16&returnUrl=%2Frmisra%2Fnews-category-dataset%3Fresource%3Ddownload
Reusing existing connection to www.kaggle.com:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘download’

download                [ <=>                ]   7.61K  --.-KB/s    in 0.04s   

2020-12-02 10:24:51 (211 KB/s) - ‘download’ saved [7793]



In [None]:
# Mount Google Drive at /content/gdrive. The helper module, datasets and
# checkpoints used by the cells below are all read from
# /content/gdrive/MyDrive/humor_generation, so this cell has to run first.

from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!ls '/content/gdrive/MyDrive/humor_generation'
import sys
sys.path.append('/content/gdrive/MyDrive/humor_generation')

from cbow_helper import *

cbow_helper.py	cbow_news  pungen-master	 skipgram_surprise.ipynb
CBOW.ipynb	datasets   silly_synonyms.ipynb


# Preprocessing 1:

Let's read in the headline data & organize it.


In [None]:
# Read the news-category dataset (one JSON object per line), echo the first few
# records as a sanity check, then keep only the politics headline texts.
import json

pth = 'gdrive/MyDrive/humor_generation/datasets/News_Category_Dataset_v2.json'

N_PREVIEW = 10  # number of raw records to print while loading

hl_list = []
with open(pth) as f:
  for line_no, raw_line in enumerate(f):
    record = json.loads(raw_line)
    if line_no < N_PREVIEW:
      print(record['category'])
      print(record)
    hl_list.append(record)


# single out the headline texts from political headlines.
print("\n\nIsolating Politics Headline text...")
politics_headlines = []
for record in hl_list:
  if record['category'] == 'POLITICS':
    politics_headlines.append(record['headline'])
print(politics_headlines[:5])
print(len(politics_headlines))

# Preprocessing 2:

Tokenize and prepare the data to start training CBOW

In [None]:
# STEP 1
# tokenize the training data, and create an index of word ids-> words, as well as words -> word id's.
# inputs: "trdata": needs to be organized as a list of sentences included in the corpus
#         "trdata_dir": will contain indices & other stuff for the training data.

from keras.preprocessing import text
import pickle as pkl
import numpy as np

import sys
pth = '/content/gdrive/MyDrive/humor_generation'
if not (pth in sys.path):
  sys.path.append(pth)
from cbow_helper import cbow

def tokenize_trdata(trdata, trdata_dir):
  """Fit a Keras tokenizer on `trdata`, save the vocabulary, and encode the corpus.

  Args:
    trdata: list of sentences (strings) that make up the corpus.
    trdata_dir: existing directory that receives `word2id.pkl` and `id2word.pkl`.

  Returns:
    (wids, word2id, id2word)
      wids    -- one list of word ids per sentence of `trdata`, in the same order.
      word2id -- dict word -> id: the tokenizer's index plus the entry 'PAD' -> 0.
      id2word -- the inverse mapping, id -> word.
  """
  tokenizer = text.Tokenizer()
  tokenizer.fit_on_texts(trdata)  # this just needs a list of strings

  # Work on a copy so the padding entry does not leak into the tokenizer's own index.
  word2id = dict(tokenizer.word_index)
  # add padding...
  word2id['PAD'] = 0

  vocab_size = len(word2id)  # includes the PAD entry
  id2word = {v: k for k, v in word2id.items()}

  # save word2id and id2word as pickles; the inference cells load them later.
  with open('{}/word2id.pkl'.format(trdata_dir), 'wb') as fout:
    pkl.dump(word2id, fout)
  with open('{}/id2word.pkl'.format(trdata_dir), 'wb') as fout:
    pkl.dump(id2word, fout)

  # convert the corpus to lists of ids. Encode the corpus that was passed in,
  # not whatever notebook-level variable happens to hold the headlines.
  wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in trdata]

  print("vocab size:", vocab_size)
  print("vocab items:", list(word2id.items())[:10] )

  return wids, word2id, id2word



# STEP 2
# convert tokenized dataset into tokenized positive examples, to train cbow.
# come up with a list of pairs (center_wd, [context_wds]). Predict the center word from its context.

# inputs: 
#     trdata_tokenized: all the sentences need to be converted to lists of numerical tokens.
#     trdata & trdata_dir: same as step 1
# outputs:
#     outfile: will extract and store all the examples like: np.array([iword, *owords]), where iword is tokenized center word, and oword is the list of tokenized context wds.

def tokens_to_nn_input(trdata_dir, trdata, trdata_tokenized, outfile = 'train.bin', window_size = None):
  """Write one CBOW training example per token of the corpus to a binary file.

  Every example is the array [center_word_id, *context_word_ids] stored as
  uint16, so all word ids must be below 65536.

  Args:
    trdata_dir: directory that receives the output file.
    trdata: the original sentences; only used to echo the first few for debugging.
    trdata_tokenized: the same sentences as lists of word ids.
    outfile: name of the binary file created inside `trdata_dir`.
    window_size: number of context words taken on each side of the center word.
      When None, the module-level `window_size` is used if one is defined
      (this is what earlier callers relied on), otherwise 3.
  """
  if window_size is None:
    window_size = globals().get('window_size', 3)

  print("converting corpus ...")
  # `with` guarantees the file is flushed and closed even if cbow() raises.
  with open('{}/{}'.format(trdata_dir, outfile), 'wb') as fout:
    for step, sent in enumerate(trdata_tokenized):
      if not step % 1000:
        print("working on {}kth line".format(step // 1000), end='\r')

      #[DEBUG] echo the first few sentences so the id <-> text alignment can be checked.
      debug = step < 10
      if debug:
        print("word ids: ", sent)
        print("orig. sentence: ", trdata[step])

      for i in range(len(sent)):
        # cbow() comes from cbow_helper; presumably it pads the context so every
        # example has 2*window_size + 1 values (training reshapes on that
        # assumption) -- TODO confirm.
        iword, owords = cbow(sent, i, window_size)
        a = np.array([iword] + owords, dtype=np.uint16)
        if debug:
          print(a)
        fout.write(a.tobytes())

  print("conversion done")



vocab size: 23187
vocab items: [('to', 1), ('the', 2), ('trump', 3), ('of', 4), ('in', 5), ('for', 6), ('a', 7), ('on', 8), ('is', 9), ('donald', 10)]


In [None]:
## preprocessing step 1: build the vocabulary and encode the corpus.

# corpus and output folder; both names are reused by preprocessing step 2.
trdata_dir = 'gdrive/MyDrive/humor_generation/cbow_news'
trdata = politics_headlines
wids, word2id, id2word = tokenize_trdata(trdata=trdata, trdata_dir=trdata_dir)

In [None]:
## actually run "preprocessing step 2"

window_size = 3  # try making this bigger?
# The 4th positional parameter of tokens_to_nn_input is `outfile`, so passing
# window_size there names the output file "3" and training never finds
# 'train.bin'. Name the file explicitly; the function picks up the
# module-level `window_size` set above.
tokens_to_nn_input(trdata_dir, trdata, wids, outfile='train.bin')


# Start Training CBOW

Edit: Added GPU support and batched gradient descent. (Concern: the learning rate may need tuning.)


Train on the output of preprocessing steps

In [None]:
# NOTE: before starting on this, you can either preprocess using the above functions, or load results from pickle

In [None]:
## specify hardware.
import torch

# Pick CUDA when PyTorch can see a GPU, otherwise run on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# goal of this cell: design training code that can run on gpu
# TODO: Verify training accuracy with some kind of "validation loss"

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import sys
pth = '/content/gdrive/MyDrive/humor_generation'
if not (pth in sys.path):
  sys.path.append(pth)
from cbow_helper import CBOWLanguageModeler

torch.manual_seed(1)

# Wraps a 2-d numpy array so a DataLoader can draw one row at a time.
class CBOWDataset(torch.utils.data.Dataset):
  """Dataset view over a 2-d array: item `i` is row `i`, length is the row count."""

  def __init__(self, np_2d_array):
    # zero-argument super() binds to this instance; the previous one-argument
    # form `super(CBOWDataset)` was unbound and never ran the parent initializer.
    super().__init__()
    self.data = np_2d_array

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return self.data.shape[0]


# function to save the checkpoint + other metadata.
def save_ckp(ckp_path, epoch, losses, model, optimizer,
             VOCAB_SIZE, CONTEXT_SIZE, batch_size, EMBEDDING_DIM):
  """Write a training checkpoint to `ckp_path` with torch.save.

  The checkpoint holds the training progress (epoch, losses, model and
  optimizer state) together with the arguments used to construct the model.
  """
  # training progress
  snapshot = dict(
    epoch=epoch,
    losses=losses,
    state_dict=model.state_dict(),
    optimizer=optimizer.state_dict(),
  )
  # these are params for model initialization
  snapshot.update(
    vocab_size=VOCAB_SIZE,
    context_size=CONTEXT_SIZE,
    batch_size=batch_size,
    embedding_dim=EMBEDDING_DIM,
  )
  torch.save(snapshot, ckp_path)


# actually start up the training process
def train_cbow(trdata_dir, VOCAB_SIZE, trdata_filename = 'train.bin', window_size = 3,
               EMBEDDING_DIM = 100, batch_size=10, lr = 0.001, ckpt_start_path = None,
               n_epochs = 11):
  """Train the CBOW model on the binary examples written by preprocessing.

  Args:
    trdata_dir: directory that holds the training file.
    VOCAB_SIZE: vocabulary size handed to the model constructor.
    trdata_filename: file of uint16 values; every (2*window_size + 1) values form
      one example: the center word id followed by its context word ids.
    window_size: number of context words on each side of the center word.
    EMBEDDING_DIM: embedding size handed to the model constructor.
    batch_size: number of examples per optimisation step.
    lr: learning rate for SGD.
    ckpt_start_path: optional checkpoint written by save_ckp to resume from;
      training restarts at the beginning of the epoch stored in it.
    n_epochs: epochs are numbered 0 .. n_epochs-1. The default of 11 keeps the
      behaviour of the earlier hard-coded loop, which ran epochs 0 through 10.

  Returns:
    (model, losses): the trained model and `losses`, which receives the running
    loss total of the current epoch after every step.

  Relies on the notebook-level `device` chosen in the hardware cell.
  """
  import os  # local import: only needed to build the training-file path

  # first, pull up the training data.
  row_width = 2*window_size + 1  # window_size context words on each side, plus the center word
  data_path = os.path.join(trdata_dir, trdata_filename)  # works with or without a trailing '/'
  data = np.fromfile(data_path, dtype=np.uint16)  # note: this produces a 1-d array.
  # int64 is what the embedding lookup needs, and unlike int16 it cannot wrap ids above 32767.
  data = data.astype(np.int64).reshape((-1, row_width))

  # then, create a batch data loader.
  batched_data = DataLoader(CBOWDataset(data), batch_size)
  n_steps = len(batched_data)

  # finally, it's time to perform the training... Initialize the model.
  losses = []
  loss_function = nn.NLLLoss()
  CONTEXT_SIZE = 2*window_size
  model = CBOWLanguageModeler(VOCAB_SIZE, CONTEXT_SIZE, batch_size, EMBEDDING_DIM)
  model.to(device)  # put the model on the gpu (if any) before building the optimizer
  optimizer = optim.SGD(model.parameters(), lr=lr)  # TODO: update this!! [don't use sgd]

  # if there is a saved model & optimizer state, load those up.
  start_epoch = 0
  if ckpt_start_path is not None:
    # map_location lets a checkpoint saved on a GPU be resumed on any device.
    checkpoint = torch.load(ckpt_start_path, map_location=device)
    start_epoch = checkpoint['epoch']
    losses = checkpoint['losses']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

  # now we have to cycle through the training data, one batch per step.
  print("starting training process...")
  print("batch size:", batch_size)
  report_every = max(1, 1000 // batch_size)  # never 0, so `step % report_every` is always valid
  ckpt_step = int(50000/batch_size)          # save after 50k examples of every epoch

  for epoch in range(start_epoch, n_epochs):
    print("\n#--------------------------------------------------------------#")
    print("\tstarting epoch #", epoch)

    total_loss = 0

    for step, context_row in enumerate(batched_data):
      if not step % report_every:
        print(f"working on step #{step} of {n_steps}")

      # save the model. [save after 50k examples.]
      if step == ckpt_step:
        print("saving model...")
        save_ckp(f'gdrive/MyDrive/humor_generation/cbow_news/checkpoint_{epoch}_{step}.pt',
         epoch, losses, model, optimizer,
         VOCAB_SIZE, CONTEXT_SIZE, batch_size, EMBEDDING_DIM) #these are the arguments to initialize the model.

      # Step 1. Split the batch into targets (column 0) and context ids (the
      # rest) and move both to the training device. The DataLoader already
      # yields tensors, so convert with .to() rather than re-wrapping them.
      center_wd = context_row[:, 0].to(device=device, dtype=torch.long)
      context_idxs = context_row[:, 1:].to(device=device, dtype=torch.long)

      # Step 2. reset the gradients.
      model.zero_grad()

      # Step 3. Run the forward pass
      log_probs = model(context_idxs)

      # Step 4. Compute the loss on the training device (no per-step copy of
      # the full output back to the CPU).
      loss = loss_function(log_probs, center_wd)

      # Step 5. Do the backward pass and update the gradient
      loss.backward()
      optimizer.step()

      # Get the Python number from a 1-element Tensor by calling tensor.item()
      total_loss += loss.item()
      losses.append(total_loss)

    # Report at the end of the epoch: the last epoch is reported too, and a
    # resumed run no longer reads `total_loss` before it has been assigned.
    print(f"total loss for epoch #{epoch}: {total_loss}")

  return model, losses


In [None]:

#-----------------------------------------------------------------------------#
### actually run the function
#-----------------------------------------------------------------------------#

trdata_dir = 'gdrive/MyDrive/humor_generation/cbow_news/'
trdata_filename = 'train.bin'
window_size = 3
EMBEDDING_DIM = 100
VOCAB_SIZE = 23187  # 23187 # len(word2id)
lr = 0.001

# Hand `lr` over explicitly: it was configured above but never passed on, so
# changing it here had no effect on training.
model, losses = train_cbow(trdata_dir, VOCAB_SIZE, trdata_filename, window_size, EMBEDDING_DIM, lr=lr)

Using these two articles to learn:
1. https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html

2. https://www.guru99.com/word-embedding-word2vec.html

# CBOW Inference

The model trained in the previous cells needs to be applied as part of the augmentation pipeline (using an inference script).
* Run it on training data and News Headlines data.
* In the future, create a "validation set" from training data.

In [None]:
# open up the incomplete news headlines, just for testing purposes.
import pandas as pd
import pickle as pkl

# load the training index pickles written by preprocessing step 1.
# `with` closes each file after reading. (pickle executes code on load, so only
# load files you created yourself.)
with open('gdrive/MyDrive/humor_generation/cbow_news/id2word.pkl', 'rb') as f_in:
  id2word = pkl.load(f_in)
with open('gdrive/MyDrive/humor_generation/cbow_news/word2id.pkl', 'rb') as f_in:
  word2id = pkl.load(f_in)

# load the training / test data
pth = 'gdrive/MyDrive/humor_generation/datasets/news-headlines-humor-parsed.csv'
df = pd.read_csv(pth)

print(df.head())

      id  ...                   verb loc
0   1723  ...  start_char=34|end_char=43
1  12736  ...  start_char=19|end_char=28
2  12274  ...  start_char=15|end_char=21
3  12274  ...  start_char=44|end_char=53
4  12274  ...  start_char=44|end_char=53

[5 rows x 6 columns]


In [None]:
# Preview the target words instead of dumping the whole column into the output.
df['target text'].head(10).tolist()

In [None]:
# this function actually uses the model to make word predictions based on the context.
from keras.preprocessing import text
import torch
import numpy as np

import sys
pth = '/content/gdrive/MyDrive/humor_generation'
if not (pth in sys.path):
  sys.path.append(pth)
from cbow_helper import *


# load the trained CBOW model (defined in helper file) from model checkpoint file.
def load_model(model_path):
  """Rebuild the trained CBOW model from a checkpoint file for CPU inference.

  Args:
    model_path: path of a checkpoint written by save_ckp. Checkpoints without
      stored hyper-parameters (no 'context_size' entry) are also accepted and
      fall back to the values the first models were trained with.

  Returns:
    (model, window_size): the model with its weights loaded and switched to
    eval mode, and the number of context words on each side (an int).
  """
  # Load onto the CPU: the inference code feeds CPU tensors and converts the
  # output to numpy, and this also works on machines without a GPU.
  checkpoint = torch.load(model_path, map_location='cpu')
  epoch = checkpoint['epoch']-1
  losses = checkpoint['losses']
  model_sd = checkpoint['state_dict']

  if checkpoint.get('context_size') is None:
    # legacy checkpoint: use the original training configuration.
    CONTEXT_SIZE = 2*3
    EMBEDDING_DIM = 100
    VOCAB_SIZE = 23187  # 23187 # len(word2id)
  else:
    CONTEXT_SIZE = checkpoint['context_size']
    # save_ckp stores these under lower-case keys; the upper-case spelling is
    # still accepted in case a checkpoint was written that way.
    EMBEDDING_DIM = (checkpoint['embedding_dim'] if 'embedding_dim' in checkpoint
                     else checkpoint['EMBEDDING_DIM'])
    VOCAB_SIZE = (checkpoint['vocab_size'] if 'vocab_size' in checkpoint
                  else checkpoint['VOCAB_SIZE'])

  # integer division: window_size is used as a count of context words.
  window_size = CONTEXT_SIZE // 2
  batch_size = 1  # we are going to feed a single example to the model at a time.

  model = CBOWLanguageModeler(VOCAB_SIZE, CONTEXT_SIZE, batch_size, EMBEDDING_DIM)
  model.load_state_dict(model_sd)
  model.eval()  # inference only

  print(f"Extracted model........ epoch {epoch}")
  if losses:
    print(f"Current training loss: {losses[-1] / VOCAB_SIZE}")
  else:
    print("Current training loss: n/a (checkpoint holds no recorded losses)")
  return model, window_size


# helper function: for a given sentence, use the context to find cbow suggestions.
def context_alternate_preds(sentence, tgt_idx, window_size, model, word2id, id2word, thresh=10):
  """Suggest alternative words for one position of a tokenized sentence.

  Args:
    sentence: the input sentence as a list of word tokens (strings).
    tgt_idx: index in `sentence` of the word to replace.
    window_size: number of context words used on either side.
    model: the trained CBOW model (needs to be loaded first).
    word2id, id2word: vocabulary indexes of the training dataset.
    thresh: maximum number of alternate words to return.

  Returns:
    Up to `thresh` words, best first, taken from the model's top `thresh*5`
    predictions after dropping stopwords (the `sw` collection from
    cbow_helper) and words that already occur in the sentence.
  """
  # keep the original tokens: they are needed later to filter the suggestions.
  tokens = list(sentence)

  # words missing from the index (and the padding id 0) get a random word id.
  sent_ids = [word2id.get(w) for w in tokens]
  sent_ids = [wid if wid else np.random.randint(len(word2id)) for wid in sent_ids]
  _, cwords = cbow(sent_ids, tgt_idx, window_size)
  cwords = torch.tensor(cwords)

  # send the input to the model; no gradients are needed for inference.
  with torch.no_grad():
    log_probs = model(cwords)

  # pull out the highest-scoring ids and map them back to words.
  lp = log_probs.detach().numpy()
  winds = np.argsort(lp*-1)[0][:thresh*5]
  candidates = [id2word[int(i)] for i in winds]

  # filter out the stopwords & words in the sentence. The comparison has to be
  # against the word tokens, not the id list, otherwise nothing is filtered.
  present = set(tokens)
  candidates = [w for w in candidates if w not in sw and w not in present]

  return candidates[:thresh]




In [None]:
# In this cell, write a formal script to parse headlines from a file and generate alternate words.
import torch
import numpy as np
import json
import pandas as pd
import csv

import sys
pth = '/content/gdrive/MyDrive/humor_generation'
if not (pth in sys.path):
  sys.path.append(pth)
from cbow_helper import CBOWLanguageModeler


def augment_sentences(in_path, out_path, model, window_size):
  """Generate alternate headlines by swapping the target word of each input headline.

  Reads the CSV at `in_path` (columns used: 'id', 'sentence', 'target text',
  'verb text'), asks the model for replacement words for every target word and
  writes one row per generated headline to the CSV at `out_path`.

  Args:
    in_path: input CSV with the parsed headlines.
    out_path: output CSV; overwritten if it exists.
    model: trained CBOW model, as returned by load_model.
    window_size: number of context words on either side of the target word.

  Uses the notebook-level `word2id` / `id2word` indexes. Headlines whose target
  word is not found among their tokens are reported and skipped.
  """
  #step 1: open file, extract headlines
  df_news = pd.read_csv(in_path)
  ids = df_news['id'].tolist()
  sent_list = df_news['sentence'].tolist()
  tgt_wds = df_news['target text'].tolist()
  vbs = df_news['verb text'].tolist()

  #step 2: pass headlines to the generator and save the suggestions.
  # newline='' is what the csv module asks for, so no blank rows appear on Windows.
  with open(out_path, 'w', newline='') as fout:
    hdl_writer = csv.writer(fout)
    # six columns, matching the six values of every data row below.
    hdl_writer.writerow(['id', 'sentence', 'new_word', 'orig_word', 'target_word_idx', 'target_verb'])
    for e, (row_id, hdl, tgt_raw, vb) in enumerate(zip(ids, sent_list, tgt_wds, vbs)):
      hdl_tokenized = text.text_to_word_sequence(hdl, filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
      tgt = tgt_raw.lower()
      # isolate the context words & convert to NN input
      if tgt not in hdl_tokenized:
        print(f"skipping, tgt {tgt} & sent {hdl_tokenized}")
        continue
      tgt_idx = hdl_tokenized.index(tgt)
      alt_words = context_alternate_preds(hdl_tokenized, tgt_idx, window_size, model, word2id, id2word, thresh=10)

      #create alternate headlines: everything left and right of the target stays.
      left = hdl_tokenized[:tgt_idx]
      right = hdl_tokenized[(tgt_idx+1):]

      for w in alt_words:
        out_row = [row_id, ' '.join(left+[w]+right), w, tgt, tgt_idx, vb]
        if e <= 10:
          print(out_row)  # echo the first few rows as a sanity check
        hdl_writer.writerow(out_row)


#---------------------------------------#
# run inference end to end: load the checkpoint, then write alternates for the parsed headlines.

model_path = 'gdrive/MyDrive/humor_generation/cbow_news/checkpoint_8_5000.pt'
model, window_size = load_model(model_path)

fin = '/content/gdrive/MyDrive/humor_generation/datasets/news-headlines-humor-parsed.csv'
fout = '/content/gdrive/MyDrive/humor_generation/cbow_news/in_process_data/alternate_preds.csv'
augment_sentences(in_path=fin, out_path=fout, model=model, window_size=window_size)



# "Scratch" code

In [None]:
# List the folder that the inference cell writes alternate_preds.csv into.
!ls '/content/gdrive/MyDrive/humor_generation/cbow_news/in_process_data'

In [None]:
# load some examples from the training data, delete their nouns manually, and perform inference
# (this is more exploratory, prev. cell will have a more formal script.)
'''
  Problem with the below:
    1. Pull out stopwords
    2. Pull out tokens already in the sentence
    3. Try training with more data (not just politics headlines), to get a greater variety of words...
      ** try training with larger context window... I think this could help...
      ** also try lemmatizing and removing stopwords before training?
    4. Need to correct grammar

'''


window_size = 3

n_examples = 10
hdls = df['Headline'].head(n_examples).tolist()
print(hdls)

# [singling out words to replace] this will have to be done automatically...
tgt_wds = ['prosecutor', 'suspects', 'reporter', 'missile', 'primary', 'scapegoat', 'trolling', 'iran', 'bomb', 'midterms']

'''
  - This gives me an idea... I'm singling out the words that need to be changed using *attention*
  - I should really be training a transformer to find the location of the funniest word...
  - Another idea: I think the "object" of the verb always tends to be the funniest... Try to single those out as well...
'''


#todo next: you need to use the same tokenizer used during training...
# zip pairs each headline with its hand-picked target and simply stops at the
# shorter list if n_examples and tgt_wds get out of sync.
for hdl, tgt in zip(hdls, tgt_wds):
  print("\n\n#---------------------------------------#\n")
  hdl_tokenized = text.text_to_word_sequence(hdl, filters='\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
  # A target that does not survive tokenization would make .index() raise and
  # abort the whole loop; report and skip it, as the formal script does.
  if tgt not in hdl_tokenized:
    print(f"skipping, tgt {tgt} & sent {hdl_tokenized}")
    continue
  # isolate the context words & convert to NN input [create new fn from this?]
  tgt_idx = hdl_tokenized.index(tgt)
  alt_words = context_alternate_preds(hdl_tokenized, tgt_idx, window_size, model, word2id, id2word, thresh=10)
  print("headline:", hdl)
  print("target word:", tgt)
  print("top alternate words:", alt_words)


### NEXT MODIFICATION: Need to crawl examples ^^ from a file and run this inference script.