In [1]:
# data handling
import pandas as pd
import numpy as np
from collections import Counter

#word cleaning
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

# load numpy array from csv file
from numpy import loadtxt

#for splitting our data
from sklearn.model_selection import train_test_split

#plotting performance
import matplotlib.pyplot as plt

import keras.utils as ku
import keras.backend as K

In [2]:
nltk.download('stopwords') #to remove common words
nltk.download('wordnet') #for WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#!pip install --upgrade tensorflow_hub

In [4]:
#used for transfer learning
import tensorflow_hub as hub

In [32]:
#get data from google drive
from google.colab import drive, files

In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#check that we are using GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


# Data Preparation
Custom functions to help with handling the data

## Text Encoder

In [7]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text): #ref:https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
    """
      outputs a cleaned string of text from an input string of text
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    lemmatizer.lemmatize(text) # reduce to root word
    return text

def encode_text(corpus): #we pass in X and set y as next word
    tokenizer = Tokenizer()
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    max_sequence_len = 129 #max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, total_words, max_sequence_len, tokenizer

def encode_text1(X, y, total_words, seq_len=3):
  """
  Input txt and outputs tokenised text.
  encode_type: bow = encode documents into vector where the coefficient for 
  each token is based on counting words
  """
  encoder = Tokenizer()# create the tokenizer
  encoder.fit_on_texts(X)

  predictors = encoder.texts_to_sequences(X)
  predictors = pad_sequences(predictors, padding='post', truncating='post', maxlen=seq_len)

  label = encoder.texts_to_sequences(y)
  label = pad_sequences(label, padding='post', truncating='post', maxlen=seq_len)

  label = ku.to_categorical(y, num_classes=total_words)

  # summarize what was learned
  print("Predictors:\n")
  print(predictors)
  print(type(predictors))
  print(len(predictors))
  print(predictors.shape)
  return predictors, label, encoder
  
def decode_text(encoder, seq):
    seq_to_wrd=encoder.sequences_to_texts(seq)
    print("Numbers to Texts:", seq_to_wrd)

def train_val_test(X_features, y_target, tc=237, pt_embed=False):
    """
    Splits dataset into 10% for testting, 10% for validation and 
    the remaining 80% for training data. By first spliting data for 
    training and test data (90:10), then training with validation.

    returns X_train, X_test, X_val, max_words, seq_len
    """
    # Reduce word amount to speed up training
    truncate = tc #592 #1183 #2368 #6000 # 23677 
    mxw = 23681 #our vocab value we found during data cleaning
    sl = truncate

    X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(X_features, y_target, test_size=0.1, random_state=42)

    X_train_set, X_val_set, y_train_set, y_val_set = train_test_split(X_train_set, y_train_set, test_size=0.1111, random_state=42)

    print("** Before Truncate **")
    print("Training data shape:", X_train_set.shape)
    print("Training labels shape:", y_train_set.shape)
    print("\n")
    print("Validation data shape:", X_val_set.shape)
    print("Validation labels shape:", y_val_set.shape)
    print("\n")
    print("Test data shape:", X_test_set.shape)
    print("Test labels shape:", y_test_set.shape)

    if(pt_embed):
      return X_train_set, X_test_set, X_val_set, y_train_set, y_val_set, y_test_set, mxw, sl
    else:
      X_train_set = X_train_set[:, :truncate]
      X_test_set = X_test_set[:, :truncate]
      X_val_set = X_val_set[:, :truncate]

      print("\n")
      print("** After Truncate: " + str(truncate) + " **")
      print("Training data shape:", X_train_set.shape)
      print("Training labels shape:", y_train_set.shape)
      print("\n")
      print("Validation data shape:", X_val_set.shape)
      print("Validation labels shape:", y_val_set.shape)
      print("\n")
      print("Test data shape:", X_test_set.shape)
      print("Test labels shape:", y_test_set.shape)

      return X_train_set, X_test_set, X_val_set, y_train_set, y_val_set, y_test_set, mxw, sl

## Model Builder

In [23]:
def build_generative_lstm(total_words, input_len): #bodyContent: can have 0 to 59,714 per example
    model = tf.keras.Sequential()
    # Add Input Embedding Layer
    model.add(layers.Embedding(total_words, 10, input_length=input_len))
    # Add Hidden Layer 1 - LSTM Layer
    model.add(layers.LSTM(8, return_sequences=True))
    model.add(layers.Dropout(0.1))
    model.add(layers.LSTM(8))
    #output layer
    model.add(layers.Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    model.summary()
    return model


## Perplexity Metric

In [None]:

def perplexity(y_true, y_pred):
    """
    Calculates perplexity metric.
    """
    cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
    perplexity = K.pow(2.0, K.mean(cross_entropy))
    return perplexity

## Text Generator

In [45]:
def generate_text(model, start_string, encoder, num_generate):
    for _ in range(num_generate):
        token_list = encoder.texts_to_sequences([start_string])[0]
        token_list = pad_sequences([token_list], maxlen=128, padding='pre') 
        classes = model.predict(token_list, verbose=0)
        predicted=np.argmax(classes,axis=1)

        output_word = ""
        for word,index in encoder.word_index.items():
            if index == predicted:
                output_word = word
                break
        start_string += " "+output_word

    # Calculate the perplexity of the generated text
    #true_next_words = token_list[1:]
    #perplexity_score = perplexity(true_next_words, predicted.reshape(-1, len(encoder.word_index)))
    #print("Perplexity score:", perplexity_score)

    return start_string.title()

## Load prepared dataset

In [10]:
df = pd.read_csv('/content/drive/MyDrive/data/guardian_articles_10_perc.csv') #for colab only
#df = pd.read_csv('./data/guardian_articles_10_perc.csv')

In [11]:
df.head()

Unnamed: 0,webTitle,bodyContent,sectionName
0,Saido Berahino has right attitude but he’s not...,Tony Pulis hopes his only transfer business be...,Football
1,Angelique Kerber now aims to dislodge Serena W...,Gone midnight and Angelique Kerber was conduct...,Sport
2,The family building a refugee haven in the sha...,On 9 June 2014 Queenslander and charity worker...,World news
3,Exeter keep Saracens in their sights with bonu...,There is no need for calculators this week but...,Sport
4,Exposed: photography's fabulous fakes,"In 1840, Hippolyte Bayard, a pioneer of early ...",Art and design


In [12]:
df['webTitle'] = df['webTitle'].astype(str)

In [13]:
df['bodyContent'] = df['bodyContent'].astype(str)

In [14]:
#drop the columns we don't need for further analysis/modelling
df = df.drop(columns=['sectionName'])

In [15]:
df.iloc[:, 0] = df.iloc[:,0].apply(clean_text)

In [16]:
df.iloc[:, 1] = df.iloc[:,1].apply(clean_text)

In [17]:
# Find how many words per row a.k.a the sequence length, but exclude counting spaces
# we will use this to pad the output
seq_len_wt = int(df.iloc[:, 0].map(len).max())
seq_len_wt_bc = int(df.iloc[:, 1].map(len).max())
print("WebTitle: " + str(seq_len_wt) + ", bodyContent: " + str(seq_len_wt_bc)) #max length of each example

WebTitle: 128, bodyContent: 59714


In [18]:
X = df.webTitle.values #len 128
#X = df.bodyContent.values #len 59714


# print the array
print(X)
print(X.shape)

['saido berahino right attitude hes fit says west broms pulis'
 'angelique kerber aims dislodge serena williams world no1 spot'
 'family building refugee shadow isis' ...
 'whistleblowers inside un review horrific tale misogyny rape 10 000 deaths'
 'tokyo mayoral win huge surprise candidate lately arrived belgium'
 'marble head hercules pulled roman shipwreck site greece']
(14983,)


## Tokenise Data

In [19]:
predictors, label, total_words, max_sequence_len, encoder = encode_text(X) #592 #1183 #2368 #6000 # 23677 #max_len

In [20]:
predictors.shape

(104694, 128)

In [21]:
label.shape

(104694, 23677)

## LSTM Text Generative Model

In [24]:
LSTM_gen_model = build_generative_lstm(total_words, 128)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 10)           236770    
                                                                 
 lstm (LSTM)                 (None, 128, 8)            608       
                                                                 
 dropout (Dropout)           (None, 128, 8)            0         
                                                                 
 lstm_1 (LSTM)               (None, 8)                 544       
                                                                 
 dense (Dense)               (None, 23677)             213093    
                                                                 
Total params: 451,015
Trainable params: 451,015
Non-trainable params: 0
_________________________________________________________________


In [25]:
LSTM_gen_model.fit(predictors, label, epochs=10) #total_words

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6261a13a90>

In [26]:
seed_text = "Kanye West scandal over problematic advertising campaign"""

In [46]:
predict_next_words = 10
generate_text(LSTM_gen_model, seed_text, encoder, predict_next_words)

'Kanye West Scandal Over Problematic Advertising Campaign New Says John Crace Wilson Williams Crace Williams Wilson Williams'

In [28]:
LSTM_gen_model.save('/content/LSTM_gen_model')



In [29]:
!zip -r LSTM_gen_model.zip LSTM_gen_model

  adding: LSTM_gen_model/ (stored 0%)
  adding: LSTM_gen_model/keras_metadata.pb (deflated 90%)
  adding: LSTM_gen_model/assets/ (stored 0%)
  adding: LSTM_gen_model/fingerprint.pb (stored 0%)
  adding: LSTM_gen_model/variables/ (stored 0%)
  adding: LSTM_gen_model/variables/variables.index (deflated 60%)
  adding: LSTM_gen_model/variables/variables.data-00000-of-00001 (deflated 10%)
  adding: LSTM_gen_model/saved_model.pb (deflated 91%)


In [33]:
files.download("/content/LSTM_gen_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>