# Import dependencies

In [None]:
import nltk
from os import listdir
import time
from nltk.corpus import wordnet
import re
import pickle
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from pathlib import Path

# Config & Hyper Parameters

In [None]:
# Root folder of the CNN dataset; the pickled artefacts are written next to it.
base_path = 'G:\\AI\\data\\cnn\\'
# Folder holding the story files that get cleaned.
path = base_path + 'sample_5k\\'
articles_pickle_filename = "articles.pickle"
headlines_pickle_filename = "headlines.pickle"
model_pickle_filename = "model.pickle"
word_embedding_matrix_filename = "word_embedding_matrix.pickle"

# Pre-trained fastText vectors: https://fasttext.cc/docs/en/english-vectors.html
# Raw string: the backslashes in this Windows path are not valid escape
# sequences, which newer Python versions warn about in a normal literal.
model_path = r'G:\Python\MLLearning\MachineLearning\data\wiki-news-300d-1M.vec'

# Words must occur more than `threshold` times (or have a pre-trained vector)
# to be kept in the vocabulary.
threshold = 2

# Dimension size as per pre-trained data
embedding_dim = 300
# Length and unknown-token limits used when filtering article/headline pairs.
max_text_length = 1000
max_summary_length = 20
min_length = 2
unk_text_limit = 200

# Set the Hyperparameters
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

# Stopword list and Initialize Lemmatizer

In [None]:
# Fetch every NLTK resource the cells below rely on, not just the stop words:
# word_tokenize, pos_tag and the WordNet lemmatizer raise LookupError when
# their data is missing.
# NOTE(review): newer NLTK releases look the tokenizer/tagger data up under
# the '*_tab' / '*_eng' ids; confirm which ids the installed version needs.
for _nltk_resource in (
    'stopwords',                       # stop-word list used by clean_text
    'wordnet', 'omw-1.4',              # WordNetLemmatizer
    'punkt', 'punkt_tab',              # nltk.word_tokenize
    'averaged_perceptron_tagger',      # nltk.pos_tag
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

# English stop words removed from article bodies.
stop_words = nltk.corpus.stopwords.words('english')
# Bound lemmatize method, called as lmtzr(word, wordnet_pos).
lmtzr = nltk.WordNetLemmatizer().lemmatize


# Read files and load into memory

In [None]:
def load_files(filename):
    """Return the complete text of `filename`, decoded as UTF-8.

    Args:
        filename: path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, encoding='utf-8') as handle:
        return handle.read()

# Split a document into news article body and headlines

In [None]:
def split_data(doc):
    """Split a raw story into its body and its list of highlights.

    Everything before the first '@highlight' marker is the article body
    (returned unstripped); every non-blank segment after a marker is one
    headline, stripped of surrounding whitespace.

    Args:
        doc: full text of one story file.

    Returns:
        Tuple ``(article, headlines)`` where ``headlines`` is a list of
        strings. A document without any marker yields ``(doc, [])``.
    """
    marker = '@highlight'
    index = doc.find(marker)
    # find() returns -1 when the marker is absent; slicing with -1 would cut
    # the last character off the article and report it as a headline.
    if index == -1:
        return doc, []

    article = doc[:index]
    # Strip first, then filter, so whitespace-only segments are dropped too.
    stripped = (segment.strip() for segment in doc[index:].split(marker))
    headlines = [segment for segment in stripped if segment]
    return article, headlines

# Clean a list of lines
This section is used to remove unwanted words and return cleaned articles and headlines.

In [None]:
def clean_text(lines, remove_stopwords=True):
    """Clean raw text lines into lower-case, alphabetic, space-joined words.

    For every line: drop the "(CNN)" source marker and anything before it,
    lower-case the text, remove URLs and HTML remnants, replace punctuation
    with spaces, optionally remove stop words, and keep only purely
    alphabetic tokens.

    Punctuation is stripped *before* the alphabetic filter. Filtering first
    would discard every word that touches punctuation (e.g. "world." or
    "it's") and leave the substitutions with nothing to do.

    Args:
        lines: iterable of raw text lines.
        remove_stopwords: when True, tokens found in the module-level
            ``stop_words`` collection are removed.

    Returns:
        List of cleaned lines; lines that end up empty are omitted.
    """
    # Build the lookup set once per call; membership tests on a list are linear.
    stop_set = set(stop_words) if remove_stopwords else frozenset()

    cleaned = []
    for line in lines:
        # strip source cnn office if it exists
        index = line.find('(CNN)  -- ')
        if index == -1:
            index = line.find('(CNN)')
        if index > -1:
            line = line[index + len('(CNN)'):]

        text = line.lower()

        # Remove URLs and HTML remnants while their punctuation is still intact.
        text = re.sub(r'https?://\S+', ' ', text)
        text = re.sub(r'<a href', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'&amp;', '', text)
        # Replace punctuation with spaces so the neighbouring words survive.
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # tokenize on white space
        tokens = text.split()

        # Optionally, remove stop words
        if remove_stopwords:
            tokens = [w for w in tokens if w not in stop_set]

        # remove tokens with numbers (or any other non-letter) in them
        tokens = [w for w in tokens if w.isalpha()]

        # skip lines with nothing left
        if tokens:
            cleaned.append(' '.join(tokens))

    return cleaned

# Normalization of data using Lemmatization
Lemmatization is used instead of stemming because it produces better word choices: it returns only valid dictionary (WordNet) words. The trade-off is that it takes more time.

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The first letter of the tag decides the result: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def normalize_text(text):
    """Lemmatize every line of `text` and return the lower-cased results.

    Each line is tokenized and POS-tagged with NLTK; every token is then
    lemmatized with the WordNet POS derived from its tag, lower-cased, and
    the tokens are re-joined with single spaces.
    """
    normalized = []
    for line in text:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(line))
        lemmas = (
            lmtzr(token, get_wordnet_pos(tag)).lower()
            for token, tag in tagged_tokens
        )
        normalized.append(' '.join(lemmas))
    return normalized

# Load all stories in a directory
This is used to load and clean the training and test datasets. After cleaning the data, it returns two lists: the cleaned articles and the cleaned headlines.

In [None]:
def load_stories(location):
    """Load, clean and lemmatize every story file found in `location`.

    Each file is split into body and highlights. The body is cleaned with
    stop-word removal, the highlights without it; both are lemmatized and
    collapsed into one string per story.

    Returns:
        Tuple ``(clean_articles, clean_headlines)`` of equally long lists,
        in the order returned by ``listdir``.
    """
    file_list = listdir(location)
    total_files = len(file_list)
    print("Total Files : {total_files}".format(total_files=total_files))

    progress = 'Loading  - {filename}, files number  - {count},  out of - {total_files}'
    clean_articles, clean_headlines = [], []
    for count, name in enumerate(file_list, start=1):
        filename = location + '/' + name
        print(progress.format(filename=filename, count=count, total_files=total_files))

        # split the document into story and highlights
        article, headlines = split_data(load_files(filename))

        article_lines = normalize_text(clean_text(article.split('\n')))
        clean_articles.append(' '.join(article_lines))

        # Stop words are kept in the headlines.
        headline_lines = normalize_text(clean_text(headlines, remove_stopwords=False))
        clean_headlines.append(' '.join(headline_lines))

    return clean_articles, clean_headlines

# Main Program
Starting point of data cleaning. Once the articles and headlines are cleaned, they are dumped to disk so that they can be reused for vectorization and for running the model directly. This is because cleaning is an expensive operation in terms of time and resources.

In [None]:
def main():
    """Clean all stories under `path` and pickle the results into `base_path`.

    Prints the number of cleaned articles and headlines with the elapsed
    time in minutes, then writes the two lists to their pickle files.
    """
    started_at = time.perf_counter()
    clean_articles, clean_headlines = load_stories(path)

    summary = "Total Articles  : {len_articles} , Total Headlines : {len_headlines}- Time Taken : {time_taken}"
    print(summary.format(len_articles=len(clean_articles),
                         len_headlines=len(clean_headlines),
                         time_taken=(time.perf_counter() - started_at) / 60))

    # (progress message, target file name, object to serialize)
    outputs = (
        ("Serialization of articles", articles_pickle_filename, clean_articles),
        ("Serialization of headlines", headlines_pickle_filename, clean_headlines),
    )
    for message, pickle_name, payload in outputs:
        print(message)
        with open(base_path + pickle_name, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)


'''-------------------------main------------------------------'''
# Run the cleaning pipeline as soon as this cell is executed.
main()


# Load Pre-trained English word embedding

This is used to load the pre-trained English word embedding 'fastText' provided by Facebook. It first checks whether a dump of the pre-trained model already exists; if not, it loads the model and writes it to a dump. The dump is created because it loads faster than the original word embedding model.
https://fasttext.cc/docs/en/english-vectors.html

In [None]:
def create_or_load_model():
    """Return the pre-trained word-vector model, using a pickle as a cache.

    If the pickle at ``base_path + model_pickle_filename`` exists, it is
    loaded. Otherwise the vectors are read from ``model_path`` (word2vec
    text format) and the pickle is written for later runs.

    The settings are the module-level names defined in this notebook; there
    is no ``config`` object here.

    Returns:
        The loaded ``KeyedVectors`` model.
    """
    model_pickle = Path(base_path + model_pickle_filename)
    if model_pickle.exists():
        print("Loading Pre-Trained Model Pickle..... ")
        start = time.perf_counter()
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # written by this notebook.
        with open(model_pickle, 'rb') as handle:
            model = pickle.load(handle)
        print("Loaded Pre-Trained Model Pickle, time taken", ((time.perf_counter() - start) / 60))
    else:
        print("Loading Pre-Trained Model  ..... ")
        start = time.perf_counter()
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
        with open(model_pickle, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Loaded Pre-Trained Model, time taken", ((time.perf_counter() - start) / 60))
    return model

# count_words

This is a utility method used to count how many times a word is used. 

In [2]:
def count_words(count_dict, text):
    """Add the word frequencies of `text` to `count_dict` in place.

    `text` is an iterable of strings; each string is split on whitespace
    and every resulting word increments its entry in `count_dict`.
    """
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

# vectorization

This is used to get word embedding for each word from pre-trained model

In [None]:
def vectorization(text, embeddings_index, model):
    """Copy the pre-trained vector of every known word into `embeddings_index`.

    Args:
        text: iterable of strings; each is split on whitespace.
        embeddings_index: dict updated in place with ``word -> vector``.
        model: mapping-like object whose ``model[word]`` raises ``KeyError``
            for unknown words.

    Words missing from the model are skipped individually. Wrapping the
    whole sentence in one ``try`` would abandon every remaining word of the
    sentence after the first unknown one.
    """
    for sentence in text:
        for vocab_word in sentence.split():
            try:
                embeddings_index[vocab_word] = model[vocab_word]
            except KeyError:
                # Word has no pre-trained vector; it gets a random one later.
                continue

# missing_word_ratio

Find the number of words that are missing from the pre-trained word embeddings and are used more often than our threshold.

In [None]:
def missing_word_ratio(word_counts, embeddings_index, min_count=None):
    """Count frequent words that have no pre-trained embedding.

    Args:
        word_counts: dict ``word -> number of occurrences``.
        embeddings_index: container of the words that have a vector.
        min_count: a word counts as frequent when it occurs more than this
            many times. Defaults to the module-level ``threshold``.

    Returns:
        Tuple ``(missing_ratio, missing_words_count)``; the ratio is the
        percentage of all distinct words that are frequent but missing.
        An empty `word_counts` yields ``(0.0, 0)``.
    """
    if min_count is None:
        min_count = threshold

    # The keys of word_counts are already unique, so no de-duplication
    # bookkeeping is needed.
    missing_words_count = sum(
        1 for word, count in word_counts.items()
        if count > min_count and word not in embeddings_index
    )

    if not word_counts:
        # Avoid dividing by zero on an empty vocabulary.
        return 0.0, 0

    missing_ratio = round(missing_words_count / len(word_counts), 4) * 100
    return missing_ratio, missing_words_count

# covert_vocab_to_int

This is used to convert each word in the training set to an integer. This is important because ML algorithms can only understand numbers. This integer representation of a word is later passed to the encoder for word processing.

In [None]:
def covert_vocab_to_int(word_counts, embeddings_index, min_count=None):
    """Assign an integer id to every word that is kept in the vocabulary.

    A word is kept when it occurs more than `min_count` times or has a
    pre-trained embedding. Ids are handed out in the iteration order of
    `word_counts`; the special tokens ``<UNK>``, ``<PAD>``, ``<EOS>`` and
    ``<GO>`` are added last. Usage statistics are printed.

    Args:
        word_counts: dict ``word -> number of occurrences``.
        embeddings_index: container of the words that have a vector.
        min_count: occurrence cut-off. Defaults to the module-level
            ``threshold``.

    Returns:
        Dict ``word -> integer id``.
    """
    if min_count is None:
        min_count = threshold

    # dictionary to convert words to integers
    vocab_to_int = {}
    for word, count in word_counts.items():
        if count > min_count or word in embeddings_index:
            vocab_to_int[word] = len(vocab_to_int)

    # Special tokens that will be added to our vocab
    for code in ("<UNK>", "<PAD>", "<EOS>", "<GO>"):
        vocab_to_int[code] = len(vocab_to_int)

    # Guard the ratio against an empty corpus.
    if word_counts:
        usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100
    else:
        usage_ratio = 0.0

    print("Total number of unique words:", len(word_counts))
    print("Number of words we will use:", len(vocab_to_int))
    print("Percent of words we will use: {}%".format(usage_ratio))

    return vocab_to_int

# create_combine_word_matrix

The embedding dimension must be 300 to match the vectors of the pre-trained corpus.
This returns a combined matrix that holds the 'embeddings_index' vectors taken from the pre-trained word embedding plus
random embeddings generated for the words that are missing from the pre-trained word embedding.

In [None]:
def create_combine_word_matrix(vocab_to_int, embeddings_index):
    """Build the embedding matrix whose row ``i`` belongs to the word with id ``i``.

    Words found in `embeddings_index` use their stored vector. Every other
    word gets a vector drawn uniformly from [-1, 1), which is also stored
    back into `embeddings_index`. The matrix has ``embedding_dim`` columns
    and dtype float32.
    """
    matrix_shape = (len(vocab_to_int), embedding_dim)
    # Rows start out as zeros and are overwritten one by one below.
    word_embedding_matrix = np.zeros(matrix_shape, dtype=np.float32)

    for token, row in vocab_to_int.items():
        if token not in embeddings_index:
            # No pre-trained vector available: invent a random one and remember it.
            embeddings_index[token] = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        word_embedding_matrix[row] = embeddings_index[token]

    # Check if value matches len(vocab_to_int)
    print("word_embedding_matrix length : ", len(word_embedding_matrix))
    return word_embedding_matrix

# Finding unknown words

Convert words in text to an integer. If word is not in vocab_to_int, use UNK's integer.
Total the number of words and UNKs. Add EOS token to the end of texts.

In [None]:
def convert_to_ints(text, vocab_to_int, eos=False):
    """Convert every sentence of `text` into a list of vocabulary ids.

    Words missing from `vocab_to_int` are replaced by the ``<UNK>`` id.
    When `eos` is True the ``<EOS>`` id is appended to every sentence.
    Word and UNK totals are printed.

    Args:
        text: iterable of strings; each is split on whitespace.
        vocab_to_int: dict ``word -> integer id``. Must contain ``<UNK>``
            if any word is unknown, and ``<EOS>`` when `eos` is True.
        eos: whether to terminate every sentence with ``<EOS>``.

    Returns:
        Tuple ``(ints, word_count, unk_count)`` where ``ints`` is a list of
        id lists, one per input sentence.
    """
    ints = []
    word_count = 0
    unk_count = 0
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            index = vocab_to_int.get(word)
            if index is None:
                index = vocab_to_int["<UNK>"]
                unk_count += 1
            sentence_ints.append(index)
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)

    # An input without any word would otherwise divide by zero.
    unk_percent = round(unk_count / word_count, 4) * 100 if word_count else 0.0

    print("Total number of words : ", word_count)
    print("Total number of UNKs : ", unk_count)
    print("Percent of words that are UNK: {}%".format(unk_percent))

    return ints, word_count, unk_count


def create_dataFrame(text):
    """Return a one-column DataFrame ('counts') holding the length of each item in `text`."""
    return pd.DataFrame([len(item) for item in text], columns=['counts'])


def unk_counter(sentence, vocab_to_int):
    """Return how many entries of `sentence` equal the ``<UNK>`` id of `vocab_to_int`."""
    # The id is looked up per element, so an empty sentence never touches the vocab.
    return sum(1 for token_id in sentence if token_id == vocab_to_int["<UNK>"])

# Sorting training dataset

Sort the summaries and texts by the length of the texts, from shortest to longest.
This is required so that the batches provided to TensorFlow need less padding, as the sentences in a batch are of similar size.
Limit the length of summaries and texts based on the min and max ranges. This is to avoid out-of-range data.
Remove articles that include too many UNKs, as they would not provide much of a learning experience.

In [None]:
def sort_corplus(lengths_articles, int_rep_articles, int_rep_headlines, vocab_to_int):
    """Filter article/headline pairs and order them by article length.

    A pair is kept when the article length lies within
    ``[min_length, max_text_length]``, the headline contains no ``<UNK>``
    id, and the article contains at most ``unk_text_limit`` ``<UNK>`` ids.
    The kept pairs are sorted by article length, shortest first; the sort is
    stable, so equally long articles keep their input order.

    The limits are the module-level settings of this notebook; there is no
    ``config`` object here.

    Args:
        lengths_articles: unused; kept so existing callers keep working.
        int_rep_articles: list of id lists, one per article.
        int_rep_headlines: list of id lists, aligned with the articles by
            position.
        vocab_to_int: dict ``word -> integer id`` containing ``<UNK>``.

    Returns:
        Tuple ``(sorted_articles, sorted_headlines)`` of equally long lists.
    """
    unk_summary_limit = 0
    # NOTE(review): max_summary_length is defined in the settings, but the
    # original never applied a headline length limit - confirm whether one
    # is wanted before adding it.

    kept_pairs = []
    for position, article in enumerate(int_rep_articles):
        if not (min_length <= len(article) <= max_text_length):
            continue
        headline = int_rep_headlines[position]
        if unk_counter(headline, vocab_to_int) > unk_summary_limit:
            continue
        if unk_counter(article, vocab_to_int) > unk_text_limit:
            continue
        kept_pairs.append((article, headline))

    # The original only filtered; the ordering by length was missing.
    kept_pairs.sort(key=lambda pair: len(pair[0]))

    sorted_articles = [article for article, _ in kept_pairs]
    sorted_headlines = [headline for _, headline in kept_pairs]

    # Compare lengths to ensure they match
    print(len(sorted_headlines))
    print(len(sorted_articles))

    return sorted_articles, sorted_headlines


# Create input for Tensorflow graph

To use TensorFlow we need to provide the input parameters below; create_input_for_graph() is used to generate these variables.

clean_articles -> articles after removing impurities
sorted_articles -> articles sorted by their length
sorted_headline -> headlines, sorted according to the length of their article
vocab_to_int -> integer values of all vocab words
word_embedding_matrix -> 300-dimensional vector for each word in the vocab

In [None]:
def create_input_for_graph():
    """Build the vocabulary and embedding matrix from the pickled corpus.

    Steps: load the cleaned articles and headlines, load the pre-trained
    vectors, count words, collect the available embeddings, report the
    missing words, create the word -> id mapping and the embedding matrix,
    convert both corpora to ids, and run the length/UNK filter (which
    prints how many pairs survive).

    The settings are the module-level names of this notebook; there is no
    ``config`` object here.

    Returns:
        Tuple ``(vocab_to_int, word_embedding_matrix)``.
        NOTE(review): the filtered article/headline pairs are computed but
        not returned - confirm whether callers need them.
    """
    # Load data (deserialize)
    # NOTE: unpickling can execute arbitrary code; only load files written
    # by this notebook.
    with open(base_path + articles_pickle_filename, 'rb') as handle:
        clean_articles = pickle.load(handle)

    with open(base_path + headlines_pickle_filename, 'rb') as handle:
        clean_headlines = pickle.load(handle)

    pre_trained_model = create_or_load_model()

    word_counts = {}
    print("counting  Articles")
    count_words(word_counts, clean_articles)
    print("counting  Headlines")
    count_words(word_counts, clean_headlines)

    print("Total Stories : ", len(clean_headlines))
    print("Size of Vocabulary:", len(word_counts))

    print("creating embedding index .....")
    embeddings_index = {}
    vectorization(clean_articles, embeddings_index, pre_trained_model)
    vectorization(clean_headlines, embeddings_index, pre_trained_model)
    print('Word embeddings:', len(embeddings_index))

    # Find the frequent words without a pre-trained vector and their share.
    missing_ratio, missing_words_count = missing_word_ratio(word_counts, embeddings_index)

    print("Number of words missing :", missing_words_count)
    print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

    # Word -> id mapping; words at or below the count threshold that have no
    # embedding are dropped. The call also prints what share of the corpus
    # vocabulary is kept.
    vocab_to_int = covert_vocab_to_int(word_counts, embeddings_index)

    word_embedding_matrix = create_combine_word_matrix(vocab_to_int, embeddings_index)

    # Apply convert_to_ints to clean_articles and clean_headlines
    print("Article Data")
    int_repr_articles, _, _ = convert_to_ints(clean_articles, vocab_to_int, eos=True)

    print("Headline Data")
    int_repr_headlines, _, _ = convert_to_ints(clean_headlines, vocab_to_int)

    lengths_articles = create_dataFrame(int_repr_articles)

    # Result intentionally unused here; the call reports the surviving pair counts.
    sort_corplus(lengths_articles, int_repr_articles, int_repr_headlines, vocab_to_int)

    return vocab_to_int, word_embedding_matrix