In [309]:
import datetime as dt
import pickle
import sys
from collections import Counter
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [310]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
# %config InlineBackend.figure_format = 'svg'
plt.style.use('raph-base')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('precision', 2)
pd.set_option('display.max_columns', 30)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [311]:
# Record which interpreter and Python version produced the outputs below.
print(sys.executable, sys.version, sep="\n")

C:\Users\r2d4\miniconda3\envs\py3\python.exe
3.8.3 (default, May 19 2020, 06:50:17) [MSC v.1916 64 bit (AMD64)]


## Load & Check Text Corpus

In [312]:
# load text data

def load_text_data(path: Union[str, Path], encoding: str="UTF-8") -> str:
    """Load the text corpus from a file into a single string.

    Every double line break is replaced by a single one. The replacement is
    done in one pass, so a run of three breaks still leaves two.

    NOTE(review): a function with the same name is defined again in a later
    cell of this notebook; the later definition shadows this one.

    Args:
        path: Location of the text file (string or ``Path``).
        encoding: Encoding used to decode the file.

    Returns:
        The decoded file content with double line breaks collapsed.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    # Path.read_text opens, decodes and closes the file itself, so no
    # surrounding open() call (and no second file handle) is needed.
    corpus = Path(path).read_text(encoding=encoding)
    return corpus.replace("\n\n", "\n")

# Read the raw training corpus (location is relative to the notebook folder).
corpus = load_text_data(Path("../data/training/text/text_corpus.txt"))

In [313]:
# Corpus overview: word total, vocabulary size, line count and words per line.
# "Words" are whitespace-separated chunks, so punctuation is still attached.
print(f"Total number of words: {len(corpus.split()):,.0f}")
print(f"Approx. number of unique words: {len(set(corpus.split())):,.0f}")

lines = corpus.split('\n')
print(f"Number of lines: {len(lines):,.0f}")
word_count_line = list(map(lambda line: len(line.split()), lines))
print(f"Average number of words in each line: {round(np.mean(word_count_line), 0):,.0f}")

Total number of words: 282,312
Approx. number of unique words: 21,947
Number of lines: 1,836
Average number of words in each line: 154


In [314]:
# Preview a few corpus lines; both ends of VIEW_LINE_RANGE are included.

VIEW_LINE_RANGE = (0, 3)

print(f"\nThe lines {VIEW_LINE_RANGE[0]} to {VIEW_LINE_RANGE[1]}:\n")
print(*corpus.split("\n")[VIEW_LINE_RANGE[0]:VIEW_LINE_RANGE[1] + 1], sep="\n\n")


The lines 0 to 3:

Robin Dreyer is a management consultant in Implement’s commercial excellence team in Zurich. Robin has experience with strategic marketing as well as sales planning and execution and specialise in automated digital marketing and lead management. Before joining Implement, Robin worked for swiss watch manufacturer IWC in the marketing department, where he was responsible for public relations monitoring and several strategic initiatives. During his time at Tesla, he gained first-hand sales experience and supported the setup and launch of a CRM system.

Tesla Inc., Sales Intern (Hamburg) (2017) Custom Programs (ES-HSG), Student Assistant (2015-2016) IWC Schaffhausen, Marketing Intern (2014-2015)

CEMS M.S. in International Management, IVEY Business School, London Ontario (2017) MA in Business Management, University of St. Gallen (2016) BA in Business Administration, University of St. Gallen (2014)

Professional services (2019-): Aligning and structuring service level ag

## Build Pre-processing Pipeline

In [315]:
# Maps each special character to the placeholder token that replaces it in
# the cleaned text. The "PADDING" entry is not a character: it holds the
# name of the padding token.
TOKEN_LOOKUP = {".": "<PERIOD>",
                ",": "<COMMA>",
                '"': "<QUOTDOUBLE>",
                "'": "<QUOTSINGLE>",
                ":": "<COLON>",
                ";": "<SEMICOLON>",
                "!": "<EXCLAMATIONMARK>",
                "?": "<QUESTIONMARK>",
                "(": "<LEFTPAREN>",
                ")": "<RIGHTPAREN>",
                "-": "<DASH>",
                "\n": "<NEWLINE>",
                "_": "<UNDERSCORE>",
                "PADDING": '<PAD>'
                }

# Inclusive (first, last) line indices used when previewing sample lines.
VIEW_LINE_RANGE = (0, 3)


In [334]:
def load_text_data(path: Union[str, Path], encoding: str="UTF-8") -> str:
    """Load the text corpus from a file into a single string.

    Every double line break is replaced by a single one. The replacement is
    done in one pass, so a run of three breaks still leaves two.

    Args:
        path: Location of the text file (string or ``Path``).
        encoding: Encoding used to decode the file.

    Returns:
        The decoded file content with double line breaks collapsed.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file.
    """
    # Path.read_text opens, decodes and closes the file itself, so no
    # surrounding open() call (and no second file handle) is needed.
    text = Path(path).read_text(encoding=encoding)
    return text.replace("\n\n", "\n")

    
def print_some_text_stats(text: str) -> None:
    """Print a few size metrics of ``text`` for information.

    Reports the number of whitespace-separated words, the number of distinct
    words, the number of lines (split on line breaks) and the rounded average
    number of words per line.

    Args:
        text: The text to describe.
    """
    # Work on the argument only; the statistics must not depend on a
    # notebook-level variable that happens to exist in the kernel.
    words = text.split()
    print(f"Approx. total number of words: {len(words):,.0f}")
    print(f"Approx. number of unique words: {len(set(words)):,.0f}")
    lines = text.split('\n')
    print(f"Number of lines: {len(lines):,.0f}")
    word_count_line = [len(line.split()) for line in lines]
    print(f"Average number of words in each line: {round(np.average(word_count_line),0):,.0f}\n\n")


def print_some_sample_lines(text: str, view_line_range: Optional[Tuple[int, int]]=None) -> None:
    """Print a range of lines of ``text`` for information.

    This step is skipped by default. To activate it, a tuple with a view
    range has to be passed explicitly.

    Args:
        text: The text whose lines are shown.
        view_line_range: ``(first, last)`` line indices; both ends are
            included. Nothing is printed when it is ``None`` or empty.
    """
    if not view_line_range:
        return
    # Use the range that was passed in, not a notebook-level constant.
    first, last = view_line_range
    print("\nThe lines {} to {}:\n".format(first, last))
    print("\n\n".join(text.split("\n")[first:last + 1]))


def clean_text(text: str, lookup: Dict[str, str]=TOKEN_LOOKUP) -> List[str]:
    """Normalize the text and split it into a list of words.

    The text is lowercased first. Then every special character found in
    ``lookup`` is replaced by its token, surrounded by spaces, so that the
    tokens are separated from the actual words when the text is split.
    Lowercasing happens before the replacement so the tokens keep the exact
    spelling stored in ``lookup``.

    Args:
        text: Raw text to clean.
        lookup: Maps special characters to their replacement tokens. An
            optional "PADDING" entry names the padding token and is not
            treated as a character to replace.

    Returns:
        All words and tokens of the text in order, followed by one padding
        token ("<PAD>" unless ``lookup`` defines another one).
    """
    text = text.lower()
    for special_char, token in lookup.items():
        if special_char == "PADDING":
            # Not a character of the text; it is appended once below.
            continue
        text = text.replace(special_char, f" {token} ")
    words = text.split()
    # Add a special word that we will use later on
    words.append(lookup.get("PADDING", "<PAD>"))
    return words


def create_encoding_dicts(text_list: List[str]) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build the two vocabulary mappings (word -> int and int -> word).

    Indices follow descending word frequency: the most frequent word gets 0.
    Words with equal counts keep the order of their first appearance.
    """
    vocab_to_int: Dict[str, int] = {}
    int_to_vocab: Dict[int, str] = {}
    # most_common() yields (word, count) pairs, highest count first
    for index, (word, _count) in enumerate(Counter(text_list).most_common()):
        vocab_to_int[word] = index
        int_to_vocab[index] = word
    return vocab_to_int, int_to_vocab


def encode_text_to_int(text_list: List[str], vocab_to_int: Dict[str, int]) -> List[int]:
    """Convert the cleaned word list into the matching list of integer ids.

    Each word is looked up in ``vocab_to_int``; a word missing from the
    mapping raises ``KeyError``. The result is the text corpus for training
    the NN.
    """
    encoded: List[int] = []
    for word in text_list:
        encoded.append(vocab_to_int[word])
    return encoded

def pickle_preprocessed_text_data(path: str, data_objects: Iterable[Any]):
    """Write ``data_objects`` to ``path`` as one single binary pickle file.

    The whole collection is dumped as one object, so loading the file gives
    back the collection exactly as it was passed in. An existing file at
    ``path`` is overwritten.
    """
    with Path(path).open(mode="wb") as p_file:
        pickle.dump(data_objects, p_file)

In [335]:
# Run the complete pre-processing pipeline: load -> inspect -> clean ->
# encode -> save.
text = load_text_data("../data/training/text/text_corpus.txt")
print_some_text_stats(text)
print_some_sample_lines(text)  # no view range passed, so nothing is shown
text_list = clean_text(text)
vocab_to_int, int_to_vocab = create_encoding_dicts(text_list)
int_text = encode_text_to_int(text_list, vocab_to_int)

# Everything needed by the later steps goes into one pickle file.
pickle_preprocessed_text_data(
    "preprocessed_text_data.pkl",
    [int_text, vocab_to_int, int_to_vocab, TOKEN_LOOKUP],
)

Approx. total number of words: 282,312
Approx. number of unique words: 21,947
Number of lines: 1,836
Average number of words in each line: 154


