In [None]:
import re
import collections
import nltk.tokenize
import numpy
import pandas
import pickle
import seaborn
import matplotlib.pyplot as plt

In [None]:
# based on Tokeniser demo code from LearnIT
# line = 'A cat sat on the mat. His name was Måns.'
def tokenize(line):
    r"""Split ``line`` into casefolded word tokens.

    A token is a maximal run of ``\w`` characters (letters, digits and
    underscore, Unicode-aware for ``str`` input). Whitespace and every other
    character (punctuation, symbols, ...) only separate tokens and are
    discarded.

    Example:
        ``tokenize('A cat sat on the mat. His name was Måns.')`` returns
        ``['a', 'cat', 'sat', 'on', 'the', 'mat', 'his', 'name', 'was', 'måns']``.

    Args:
        line: Text to tokenise. A falsy value (``''`` or ``None``) yields an
            empty list.

    Returns:
        list[str]: The tokens in order of appearance, each normalised with
        ``str.casefold()``.
    """
    # Falsy input produces no tokens (the earlier loop-based version simply
    # never entered its ``while line:`` loop in that case).
    if not line:
        return []

    # One left-to-right scan: everything between consecutive ``\w+`` matches
    # is delimiter or unmatchable material and is dropped. This avoids
    # re-searching and re-slicing the remaining text after every token, which
    # made the previous implementation quadratic in the line length.
    return [word.casefold() for word in re.findall(r'\w+', line)]

In [None]:
# based on Frequency plots demo code from LearnIT
def frequency(corpus=None, corpus_path='ncv16-list.pkl'):
    """Plot token-frequency statistics for a corpus.

    Builds a frequency table (most frequent token first) and shows three
    figures: cumulative normalised frequency by rank for the whole vocabulary,
    the same curve restricted to the 10000 most frequent tokens, and a
    log-log rank/frequency plot (Zipf's law).

    Args:
        corpus: Iterable of tokens to analyse. When ``None``, the token list
            is loaded from the pickle file at ``corpus_path``. Previously the
            argument was always overwritten by the pickled corpus.
        corpus_path: Path of the pickled token list used when ``corpus`` is
            ``None``. According to the original notes this file was produced
            by tokenising ``news-commentary-v16.en`` with
            ``nltk.tokenize.TreebankWordTokenizer`` and pickling the
            resulting list.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    if corpus is None:
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only point this at a file you created yourself or otherwise trust.
        with open(corpus_path, 'rb') as f:
            corpus = pickle.load(f)

    voc = collections.Counter(corpus)
    # Take the corpus size from the counter so that one-shot iterables
    # (already consumed by Counter) work as well as lists.
    n_tokens = sum(voc.values())

    frq = pandas.DataFrame(voc.most_common(), columns=['token', 'frequency'])

    # 1-based index in the frequency-sorted list
    frq['idx'] = frq.index + 1

    # Frequency normalised by corpus size
    frq['norm_freq'] = frq.frequency / n_tokens

    # Cumulative normalised frequency
    frq['cumul_frq'] = frq.norm_freq.cumsum()

    seaborn.set_theme(style='whitegrid')

    # Plot: Cumulative frequency by index
    seaborn.relplot(x='idx', y='cumul_frq', data=frq)
    plt.show()

    # Plot: Cumulative frequency by index, top 10000 tokens
    seaborn.relplot(x='idx', y='cumul_frq', data=frq.head(10000), kind='line')
    plt.show()

    # Plot: Log-log plot for Zipf's law.
    # rank(ascending=False) uses pandas' default tie handling, so tokens with
    # equal frequency share the average of their ranks.
    frq['log_frq'] = numpy.log(frq.frequency)
    frq['log_rank'] = numpy.log(frq.frequency.rank(ascending=False))
    seaborn.relplot(x='log_rank', y='log_frq', data=frq)
    plt.show()