## References
- [Build a Natural Language Generation (NLG) System using PyTorch — Analytics Vidhya blog](https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/)
- [drive.google.com ... plots_text.pickle](https://drive.google.com/file/d/1PakdWMKYNyC5-2G_CSlLtkBsHezFpMHJ/view)

In [1]:
import pickle

from nessvec.constants import DATA_DIR

# Read the pickled movie-plot corpus from the project's data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a source you trust.
with (DATA_DIR / "corpora" / "cmu_movie_plots.pickle").open(mode='rb') as fin:
    movie_plots = pickle.load(fin)

# Show how many entries were loaded.
len(movie_plots)

500

In [2]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global random state so that random draws made through it
# are repeatable from one run of the notebook to the next.
np.random.seed(seed=451)

# Put the loaded plots into a DataFrame with a single 'text' column and
# preview the first five rows.
df = pd.DataFrame(data=movie_plots, columns=['text'])
df.head(n=5)

Unnamed: 0,text
0,barry is a private with the 101st airborne div...
1,chinese exorcist one-eyebrow priest leads a p...
2,while playing baseball on a busy street in gre...
3,thadeous and fabious ([[danny mcbride are son...
4,"{{plot}} jung su-ji is a quiet, mysterious bea..."


In [3]:
import re


def tokenize(text, pattern=r'\w+(?:\'\w+)?|[^\w\s]'):
    r"""Break `text` into a list of tokens matched by `pattern`.

    With the default `pattern`, a token is either

    * a run of word characters, optionally followed by one apostrophe and
      another run of word characters (so "don't" stays in one piece), or
    * a single character that is neither a word character nor whitespace.

    Whitespace is never returned. The result is whatever `re.findall` yields
    for `pattern`, so a custom pattern containing capture groups produces the
    groups rather than the whole match.

    >>> tokenize("don't stop!")
    ["don't", 'stop', '!']
    """
    matcher = re.compile(pattern)
    return matcher.findall(text)


def create_seq(text, seq_len=5):
    """Turn `text` into overlapping, space-joined token windows.

    The text is tokenized with `tokenize`, and a window of `seq_len + 1`
    tokens is slid across it one token at a time. Each window is joined with
    single spaces into one string.

    Args:
        text: The string to split into windows.
        seq_len: One less than the number of tokens per window (default 5,
            giving six tokens per string).

    Returns:
        A list of strings, one per window. If the text has `seq_len` tokens
        or fewer, the list holds a single string made of all of its tokens.
    """
    words = tokenize(text)
    width = seq_len + 1

    # Not enough tokens for even one full window: return them all as one string.
    if len(words) < width:
        return [" ".join(words)]

    return [
        " ".join(words[start:start + width])
        for start in range(len(words) - seq_len)
    ]

In [4]:
# Preview the first 100 characters of the last plot, followed by an ellipsis.
f"{movie_plots[-1][:100]}..."


'a village in rural thailand is celebrating loy krathong, when the festivities are disrupted by the d...'

In [5]:
from itertools import chain

# Vocabulary: every distinct token found in any plot, in sorted order.
vocab = sorted(set(chain.from_iterable(map(tokenize, movie_plots))))

# Two lookup tables over the vocabulary: position -> token, and token -> position.
int2token = pd.Series(data=vocab)
token2int = pd.Series(data=int2token.index.values, index=int2token.values)

# Peek at five randomly sampled vocabulary entries.
int2token.sample(n=5)

12394           rope
3069     concurrence
14150        suppose
4738         elegant
3107        conflict
dtype: object

In [6]:
# create_seq joins seq_len + 1 tokens per string, so with the default
# seq_len=5 each entry of `five_grams` holds six tokens (when the plot has
# more than five tokens).
five_grams = create_seq(text=movie_plots[-1])
five_grams[0:3]

['a village in rural thailand is',
 'village in rural thailand is celebrating',
 'in rural thailand is celebrating loy']