In [1]:
import gc
import os
import pickle
import random
import re
#increase field limit to read embedding
import sys

import numpy as np
import pandas as pd
import sklearn.model_selection as sk_ModelSelection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
!git clone https://github.com/guol1nag/GeneratingHeadline_GANs.git
%cd GeneratingHeadline_GANs
%run ./Code/Models/CNN_text_clf.py
%run ./Code/Models/discriminator_training_class.py
%run ./Code/data2PaddedArray.py
%run ./Code/text_preprocessing.py
%run ./Code/contractions.py
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name() if torch.cuda.is_available() else "cpu"

Cloning into 'GeneratingHeadline_GANs'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 486 (delta 23), reused 42 (delta 11), pack-reused 429[K
Receiving objects: 100% (486/486), 16.98 MiB | 37.56 MiB/s, done.
Resolving deltas: 100% (217/217), done.
/content/GeneratingHeadline_GANs


'cpu'

In [2]:
# Mount Google Drive (Colab only) so that the files under
# '/content/drive/My Drive' read by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Location of the pre-trained embedding weights on the mounted Drive
path = r'/content/drive/My Drive/pre_train_weight.csv'
# Parse the text file into a numeric numpy array
pre_train_weight = np.loadtxt(path)
# Run a garbage-collection pass; being the last expression of the cell,
# its return value is what the cell displays.
gc.collect()

5

# Pre-trained embedding & pre-processing

`pre_train_weight` is a NumPy matrix of shape (num_embeddings, embedding_dim).


In [3]:
%%time
data = pd.read_csv('/content/drive/My Drive/wikihowSep.csv',
                   error_bad_lines = False).astype(str)
print(data.shape)

for item in ['text', 'headline']:
  exec("""{}_data = text_preprocessing(data=data, item = '{}', contraction_map=CONTRACTION_MAP,
                                  drop_digits=False, remove_stopwords=False, stemming=False)""".format(item, item),
       locals(), globals()
  )

max_examples = 150000
max_threshold = 0.75

# drop examples with an invalid ratio of length of text and headline
text_len = [len(t) for t in text_data]
head_len = [len(h) for h in headline_data]

ratio = [h/t for t, h in zip(text_len, head_len)]

problems1 = [problem for problem, r in enumerate(ratio) if (r > max_threshold)]
text_data, headline_data = np.delete(text_data, problems1), np.delete(headline_data, problems1)
print("Number of examples after filtering: {:.0f}".format(text_data.shape[0]))

# drop too long articles (to avoid struggles with CUDA memory) and too short
text_len = [len(t) for t in text_data]

problems2 = [problem for problem, text_length in enumerate(text_len) if ((text_length > 200) | (text_length < 10) )]
text_data, headline_data = np.delete(text_data, problems2), np.delete(headline_data, problems2)
print("Number of examples after filtering: {:.0f}".format(text_data.shape[0]))

# drop too pairs with too short/long summaries
head_len = [len(h) for h in headline_data]

problems3 = [problem for problem, headline_len in enumerate(head_len) if ( (headline_len > 75) | (headline_len < 2) )]
text_data, headline_data = np.delete(text_data, problems3), np.delete(headline_data, problems3)
print("Number of examples after filtering: {:.0f}".format(text_data.shape[0]))

# some cleaning
del text_len, head_len, ratio, problems1, problems2, problems3
gc.collect()

"""
# trim the data to have only a subset of the data for our project
try:
  data = data[:max_examples]
except:
  pass
"""
# drop examples with an invalid ratio of length of text and headline
text_len = [len(t) for t in text_data]
head_len = [len(h) for h in headline_data]

(1585695, 5)
Number of examples after filtering: 1259273
Number of examples after filtering: 1214567
Number of examples after filtering: 1214535
CPU times: user 4min 34s, sys: 8.18 s, total: 4min 42s
Wall time: 4min 50s


In [0]:
np.random.seed(222)

# One uniform draw per example decides which partition the example lands in
split = np.random.uniform(0, 1, size = text_data.shape[0])

in_train = split <= 0.9                      # ~90 %
in_val = (split > 0.9) & (split <= 0.95)     # ~5 %
in_test = split > 0.95                       # ~5 %

# Train set
text_train, headline_train = text_data[in_train], headline_data[in_train]
# Validation set
text_val, headline_val = text_data[in_val], headline_data[in_val]
# Test set
text_test, headline_test = text_data[in_test], headline_data[in_test]

# The raw frame and the temporary masks are no longer needed
del data, in_train, in_val, in_test
gc.collect()

def _as_array(items):
  """Return `items` as a numpy array, boxing each element of a plain sequence
  individually so numpy never tries to merge the rows into a 2-D array."""
  if isinstance(items, np.ndarray):
    return items
  boxed = np.empty(len(items), dtype = object)
  for position, item in enumerate(items):
    boxed[position] = item
  return boxed


def sort_data(text, headline, max_samples = 100):
  """
  Sort paired articles and headlines by article length, longest first.

  Pairs with equally long articles keep their original relative order.

  Args:
    text: sequence (list or numpy array) of articles; each item must support len().
    headline: sequence of headlines aligned index-by-index with `text`.
    max_samples: how many of the leading (longest) pairs to return. Defaults
      to 100, the debugging subset size that used to be hard-coded; pass None
      to keep every pair.

  Returns:
    Tuple (text, headline) of numpy arrays, reordered identically and
    truncated to `max_samples` entries.

  Raises:
    ValueError: if `text` and `headline` do not have the same number of items.
  """
  if len(text) != len(headline):
    raise ValueError(
        "text and headline must be aligned: got {} and {} items".format(len(text), len(headline)))

  text, headline = _as_array(text), _as_array(headline)

  lengths = np.fromiter((len(article) for article in text), dtype = np.int64, count = len(text))
  # A stable sort on the negated lengths yields descending order while
  # preserving the input order among ties.
  order = np.argsort(-lengths, kind = 'stable')
  if max_samples is not None:
    order = order[:max_samples]

  # Index both arrays with the same permutation so the pairs stay aligned
  return text[order], headline[order]

# Reorder every partition by article length (longest first) and keep the
# subset that sort_data returns. Note that each pair of names is overwritten.
# Train set
text_train, headline_train = sort_data(text = text_train, headline = headline_train)
# Validation set
text_val, headline_val = sort_data(text = text_val, headline = headline_val)
# Test set
text_test, headline_test = sort_data(text = text_test, headline = headline_test)



In [5]:
class LangDict:
  """
  Vocabulary that maps words to integer indices and counts their occurrences.

  Indices 0 and 1 are pre-assigned to "sos" and "eos" in `index2word`;
  newly seen words are numbered from 2 upwards.

  Source: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
  """
  def __init__(self):
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0: "sos", 1: "eos"}
    self.n_words = 2

  def add_article(self, article):
    """Register every word of `article` (an iterable of words)."""
    for token in article:
      self.add_word(token)

  def add_word(self, word):
    """Count `word`, assigning it the next free index on first sight."""
    if word in self.word2index:
      # already known: only the occurrence counter changes
      self.word2count[word] += 1
      return

    new_index = self.n_words
    self.word2index[word] = new_index
    self.word2count[word] = 1
    self.index2word[new_index] = word
    self.n_words = new_index + 1
def _trim_vocabulary(dictionary, min_count):
  """Keep only the words counted at least `min_count` times and renumber them from 0."""
  kept_counts = {word: count for (word, count) in dictionary.word2count.items() if count >= min_count}
  # NOTE(review): the rebuilt mappings no longer contain the "sos"/"eos"
  # entries and `n_words` is not updated -- confirm that downstream code
  # does not rely on either.
  dictionary.word2index = {word: i for i, word in enumerate(kept_counts)}
  dictionary.index2word = {i: word for i, word in enumerate(kept_counts)}
  dictionary.word2count = kept_counts


def _append_pad_token(dictionary):
  """Register '<pad>' under the next free index and return that index."""
  pad_index = max(dictionary.index2word) + 1
  dictionary.word2index['<pad>'] = pad_index
  dictionary.index2word[pad_index] = '<pad>'
  return pad_index


# Vocabularies are built from the training split only
text_dictionary = LangDict()
headline_dictionary = LangDict()

for article in text_train:
  text_dictionary.add_article(article)

for article in headline_train:
  headline_dictionary.add_article(article)
print("There are {:.0f} distinct words in the untrimmed text dictionary".format(len(text_dictionary.word2index)))
print("There are {:.0f} distinct words in the untrimmed headline dictionary".format(len(headline_dictionary.word2index)))

# Minimum number of occurrences a word needs in order to stay in each dictionary
text_min_count = 1
head_min_count = 2

## TEXT DICTIONARY
_trim_vocabulary(text_dictionary, text_min_count)

## HEADLINE DICTIONARY
_trim_vocabulary(headline_dictionary, head_min_count)

print("There are {:.0f} distinct words in the trimmed text dictionary, where only word with at least {:.0f} occurences are retained".format(len(text_dictionary.word2index), text_min_count))
print("There are {:.0f} distinct words in the trimmed headline dictionary, where only word with at least {:.0f} occurences are retained".format(len(headline_dictionary.word2index), head_min_count))
del text_min_count, head_min_count

## TEXT DICTIONARY
pad_idx = _append_pad_token(text_dictionary)
print(len(text_dictionary.index2word))

## HEADLINE DICTIONARY
pad_idx = _append_pad_token(headline_dictionary)
print(len(headline_dictionary.index2word))

There are 3573 distinct words in the untrimmed text dictionary
There are 342 distinct words in the untrimmed headline dictionary
There are 3573 distinct words in the trimmed text dictionary, where only word with at least 1 occurences are retained
There are 76 distinct words in the trimmed headline dictionary, where only word with at least 2 occurences are retained
3574
77


In [0]:
def _to_padded(text, headline):
  """Run data2PaddedArray on one partition with the shared dictionaries and embedding weights."""
  dictionaries = {'text_dictionary': text_dictionary,
                  'headline_dictionary': headline_dictionary}
  return data2PaddedArray(text, headline, dictionaries, pre_train_weight)

# Each call returns (text, text_lengths, headline, headline_lengths); the
# text/headline names are overwritten with the converted versions.
# Train set
text_train, text_lengths_train, headline_train, headline_lengths_train = _to_padded(text_train, headline_train)
# Validation set
text_val, text_lengths_val, headline_val, headline_lengths_val = _to_padded(text_val, headline_val)
# Test set
text_test, text_lengths_test, headline_test, headline_lengths_test = _to_padded(text_test, headline_test)

##### Have a look at the data and build X and y for the discriminator

In [35]:
print('number of sample; length of summary')
# Transpose so that the first axis indexes samples, then hand the array to
# torch as 64-bit integer indices.
X_train = torch.from_numpy(np.transpose(headline_train)).to(torch.long)
X_train.shape

number of sample; length of summary


torch.Size([100, 33])

In [40]:
# Placeholder binary labels drawn at random (one per row of X_train), used
# only to exercise the discriminator training loop.
# Seeded so the labels are reproducible across runs.
torch.manual_seed(222)
# Size the label vector from X_train instead of a hard-coded 100 so the two
# stay aligned if the sample count changes.
y_train = (torch.rand(X_train.shape[0]) > 0.5).long()
y_train.size()

torch.Size([100])


# discriminator

In [0]:
# Discriminator_utility.show_parameter()

In [0]:
# Settings unpacked as keyword arguments into Discriminator_utility.
# NOTE(review): the X_train shape printed earlier has 33 columns while
# seq_len is 20 -- confirm which length the model expects.
param = dict(
    max_epochs = 64,
    learning_rate = 1e-3,
    batch_size = 5,
    seq_len = 20,                    # length of your summary
    embed_dim = 200,
    drop_out = 0,
    kernel_num = 5,                  # number of your feature map
    in_channel = 1,                  # for text classification should be one
    parallel_layer = 3,              # how many conv net are used in parallel in text classification
    model_name = 'discriminator',
    device = device,
)
embedding = pre_train_weight

In [0]:
# Build the discriminator training helper from the embedding matrix and the
# settings in `param` (unpacked as keyword arguments).
drt = Discriminator_utility(embedding,**param)

In [14]:
# Train the discriminator. NOTE(review): the training tensors are also passed
# as X_test / y_test, so the reported "Validation Loss" is measured on the
# training data, and y_train holds randomly drawn labels.
drt.run_epochs(X_train,y_train,X_test = X_train,y_test = y_train)

Epoch: 1:
Train Loss: 14.112
Validation Loss: 13.381
Epoch: 2:
Train Loss: 13.291
Validation Loss: 12.713
Epoch: 3:
Train Loss: 12.620
Validation Loss: 12.142
Epoch: 4:
Train Loss: 12.064
Validation Loss: 11.481
Epoch: 5:
Train Loss: 11.411
Validation Loss: 10.838
Epoch: 6:
Train Loss: 10.747
Validation Loss: 10.160
Epoch: 7:
Train Loss: 10.125
Validation Loss: 9.538
Epoch: 8:
Train Loss: 9.493
Validation Loss: 8.953
Epoch: 9:
Train Loss: 8.936
Validation Loss: 8.306
Epoch: 10:
Train Loss: 8.323
Validation Loss: 7.739
Epoch: 11:
Train Loss: 7.772
Validation Loss: 7.245
Epoch: 12:
Train Loss: 7.341
Validation Loss: 6.861
Epoch: 13:
Train Loss: 6.911
Validation Loss: 6.597
Epoch: 14:
Train Loss: 6.652
Validation Loss: 6.183
Epoch: 15:
Train Loss: 6.301
Validation Loss: 5.720
Epoch: 16:
Train Loss: 5.868
Validation Loss: 5.450
Epoch: 17:
Train Loss: 5.585
Validation Loss: 5.254
Epoch: 18:
Train Loss: 5.367
Validation Loss: 4.981
Epoch: 19:
Train Loss: 5.125
Validation Loss: 4.724
Epoch: 2

KeyboardInterrupt: ignored