# Language Generation
Generating sentences with conditional probabilities

Author: Pierre Nugues, modified by Marcus Klang

In [1]:
import os
import sys
import numpy as np

In [2]:
# Make the PySpark sources that ship with the local Spark installation importable.
# NOTE(review): the py4j version in the archive name is hard-coded; presumably it
# has to match the archive shipped with the installed Spark — confirm.
spark_python_dir = os.path.join(os.environ["SPARK_HOME"], "python")
for extra_path in (
    spark_python_dir,
    os.path.join(spark_python_dir, "lib", "py4j-0.10.7-src.zip"),
):
    sys.path.append(extra_path)

In [3]:
import pyspark

In [4]:
# Spark settings: 8 GB of memory for the executors and for the driver.
config = (pyspark.SparkConf()
                .setAll([('spark.executor.memory', '8g'),
                         ('spark.driver.memory', '8g')]))

# getOrCreate() reuses the context that is already running when this cell is
# executed again in the same kernel; calling the SparkContext constructor a
# second time raises an error because only one context may be active.
sc = pyspark.SparkContext.getOrCreate(conf=config)

In [6]:
# Input path of the English Wikipedia corpus, written as a `file:` URI.
# NOTE(review): presumably the complete dump, going by the `_FULL` suffix — confirm.
ENWIKI_FULL="file:/usr/local/cs/EDAN95/datasets/wikipedia/enwiki"

## Reading a Corpus
Utility function to read all the files in a folder

In [7]:
#import os
#def get_files(dir, suffix):
#    """
#    Returns all the files in a folder ending with suffix
#    :param dir:
#    :param suffix:
#    :return: the list of file names
#    """
#    files = []
#    for file in os.listdir(dir):
#        if file.endswith(suffix):
#            files.append(file)
#    return files

## Tokenizer
An elementary tokenizer

In [8]:
import regex as re

In [9]:
#def tokenize(text):
#    """
#    Uses the letters to break the text into words.
#    Returns a list of match objects
#    """
#    words = re.findall('\p{L}+', text)
#    return words

## Reading the Files
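We read the English Wikipedia corpus with Spark (the original, now commented-out, version of this notebook read a corpus of novels by Dickens)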

In [10]:
#folder = '/Users/pierre/Documents/Cours/EDAN20/corpus/Selma/'
#folder = '/Users/pierre/Documents/Cours/EDAN20/corpus/Dickens/'
#files = get_files(folder, 'txt')
#files

We tokenize the texts

In [11]:
#words = []
#for file in files:
#    text = open(folder + file).read().lower().strip()
#    words += tokenize(text)
#words[:10]

In [12]:
# Read the corpus as an RDD of text lines. The path constant defined in this
# notebook is ENWIKI_FULL; the previously used name ENWIKI is not defined in
# any cell, so the cell failed with a NameError on a fresh kernel.
wiki = sc.textFile(ENWIKI_FULL)

Tokenize and preprocess the text

In [13]:
# Pattern for a run of Unicode letters. It must be a raw string: "\p" is not a
# valid Python string escape, so the non-raw literal triggers an invalid-escape
# warning (and is slated to become an error in future Python versions).
m = re.compile(r"\p{L}+")

# One bag of words per input line: lower-case the line, split it on whitespace
# and keep only the chunks that consist entirely of letters. Lines that yield
# no such token are dropped.
word_bags = (
    wiki
    .map(str.lower)
    .map(lambda ln: [tok for tok in ln.split() if m.fullmatch(tok) is not None])
    .filter(lambda bag: len(bag) > 0)
)

## N-gram functions

Cutoff setting for reducing final memory requirements for bigrams

In [20]:
# Bigrams are kept only when their count is strictly greater than this value.
CUTOFF=5

Create word mapping for memory and performance

In [16]:
# Vocabulary mapping: every distinct token, in sorted order, paired with its
# position in that order, collected to the driver as a dict {token: id}.
word2id = (
    word_bags
    .flatMap(lambda bag: bag)
    .distinct()
    .sortBy(lambda token: token)
    .zipWithIndex()
    .collectAsMap()
)

In [122]:
# Sanity check: display the integer id assigned to the token "lund".
word2id["lund"]

2785336

In [17]:
# Reverse mapping {id: token}; values() and keys() iterate in the same order.
id2word = dict(zip(word2id.values(), word2id.keys()))

Share it for parallel use

In [18]:
# Broadcast the vocabulary mapping; the RDD transformations below read it
# through `word2id_map.value`.
word2id_map = sc.broadcast(word2id)

In [15]:
#def count_unigrams(words):
#    frequency = {}
#    for word in words:
#        if word in frequency:
#            frequency[word] += 1
#       else:
#           frequency[word] = 1
#    return frequency

In [19]:
from operator import add


def _to_unigram_counts(bag):
    """Pair the id of every token in `bag` with a count of 1."""
    token_ids = word2id_map.value
    return [(token_ids[token], 1) for token in bag]


# RDD of (word id, number of occurrences).
unigram_freq = word_bags.flatMap(_to_unigram_counts).reduceByKey(add)

In [None]:
#def count_bigrams(words):
#    bigrams = [tuple(words[idx:idx + 2])
#               for idx in range(len(words) - 1)]
#    frequencies = {}
#    for bigram in bigrams:
#        if bigram in frequencies:
#            frequencies[bigram] += 1
#        else:
#           frequencies[bigram] = 1
#    return frequencies

In [21]:
def _to_bigram_counts(words):
    """Pair every adjacent (id, id) couple of `words` with a count of 1."""
    token_ids = word2id_map.value
    ids = [token_ids[token] for token in words]
    return [(pair, 1) for pair in zip(ids, ids[1:])]


# RDD of ((first id, second id), count), restricted to counts above CUTOFF.
bigram_freq = (
    word_bags
    .flatMap(_to_bigram_counts)
    .reduceByKey(add)
    .filter(lambda pair_count: pair_count[1] > CUTOFF)  # Huge result otherwise!
)

### We count the unigrams and bigrams

In [22]:
# Collect the unigram counts to the driver as a dict {word id: count}.
unigrams = unigram_freq.collectAsMap() #count_unigrams(words)

In [23]:
# Sanity check: display the number of occurrences of the token "master".
unigrams[word2id['master']]

244250

In [24]:
#bigrams = bigram_freq.collectAsMap() # count_bigrams(words)

### Given a bigram, $w_n, w_{n+1}$, we compute $P(w_{n+1}|w_n)$. This is defined as $\frac{count(w_n, w_{n+1})}{count(w_n)}$.

In [26]:
# Broadcast the unigram counts; the conditional-probability computation reads
# them through `unigrams_bc.value`.
unigrams_bc = sc.broadcast(unigrams)

In [29]:
from pyspark.storagelevel import StorageLevel

In [31]:
def map_conditionals(items):
    """
    Turn the followers of one word into a sampling distribution.
    :param items: iterable of (word id, probability) pairs
    :return: a pair of arrays (word ids, probabilities), ordered by decreasing
             probability; the probabilities are float32 and rescaled to sum to 1
    """
    ranked = list(items)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    followers = np.array([follower for follower, _ in ranked])
    weights = np.array([weight for _, weight in ranked], dtype=np.float32)
    # The cutoff removed some bigrams, so the raw values do not sum to 1.
    weights /= weights.sum()
    return followers, weights

def _to_conditional(entry):
    """Map ((first, second), count) to (first, (second, count / count(first)))."""
    (first, second), pair_count = entry
    return first, (second, pair_count / unigrams_bc.value[first])


# For every first word: its followers and their conditional probabilities,
# sorted and renormalized by map_conditionals.
cond_prop_sorted_rdd = (
    bigram_freq
    .map(_to_conditional)
    .groupByKey()
    .mapValues(map_conditionals)
)

In [32]:
# Collect to the driver as a dict {word id: (follower ids, probabilities)}.
cond_prop_sorted = cond_prop_sorted_rdd.collectAsMap()

In [33]:
# Sanity check: display the (follower ids, probabilities) pair stored for "lund".
cond_prop_sorted[word2id["lund"]]

(array([5005660,  178902, 2116433, 5202899, 4632207, 4773250, 2170440,
         770169,  593128, 1922322, 5240452, 4831101,       0, 1905812,
        3458113,  276751, 2039194, 1862630,  139372, 1598748, 3170331,
        5237264, 4627294,  446772, 3429824, 2183913, 2828566, 5262971,
        4327786, 1567952,  298426, 2428972, 1176781, 5237650, 2149877,
         688606,   66848, 2825216,  682422, 5227302, 1557332, 2944551,
        4424331, 2143616, 2785336,  948000, 2259817, 5274714, 3416079,
         171036, 3323165, 4297293, 4846096, 5013391, 4378498, 1944930,
        4527463, 2653323,  452205, 3723253, 4789557, 3982844, 1891472,
        1980611, 4777335, 2674011, 5017911, 1502161, 4391587, 4150697,
        4547992, 2022398, 5282036, 3140212, 1283118, 1483864,  451911,
        3780699,  169634, 4780067, 4290262, 5203684, 4783615, 5237171,
        2866739,  229420, 1175457,  248090,  883098, 3584278, 4568794,
        3953278, 2638357, 2276846, 1260653, 1258130,  770661,  226276,
      

In [34]:
word, probs = cond_prop_sorted[word2id["master"]]
# Print the (at most) 50 most probable words following "master".
for follower_id, follower_prob in zip(word[:50], probs[:50]):
    print('master', "{0: <15} = {1}".format(id2word[follower_id], follower_prob))

master of              = 0.27679145336151123
master degree          = 0.14825420081615448
master and             = 0.04531529173254967
master in              = 0.03585680574178696
master plan            = 0.025811992585659027
master degrees         = 0.02226676233112812
master the             = 0.013803665526211262
master at              = 0.012767367996275425
master classes         = 0.011135654523968697
master was             = 0.008794893510639668
master is              = 0.0068586538545787334
master thesis          = 0.006463224533945322
master to              = 0.006272327620536089
master he              = 0.0056860013864934444
master who             = 0.0056632752530276775
master builder         = 0.005586007609963417
master system          = 0.005331478547304869
master sergeant        = 0.005290571600198746
master class           = 0.004849690478295088
master for             = 0.004822419956326485
master chief           = 0.004676974844187498
master tapes           = 0.004454261

In [35]:
#probs = {k: v/unigrams[k[0]] for k, v in bigrams.items()}

In [36]:
#cond_prob = sorted([(k, v) for k, v in probs.items() if k[0] == 'master'],
#                    key=lambda tup: tup[1], reverse=True)
#cond_prob

### Drawing samples from a distribution. Understanding the `np.random.choice` function

In [124]:
import numpy as np

# Draw 10000 values from {0, 1, 2} with probabilities 0.3, 0.5 and 0.2, then
# print how often each value was drawn. The seed makes the draw repeatable.
np.random.seed(0)
outputs = np.random.choice(np.arange(0,3), size=10000, p=[0.3, 0.5, 0.2])
for value in range(3):
    print(str(value), np.sum(outputs == value))

0 3055
1 4973
2 1972


### Extracting the conditional probabilities of a word

In [38]:
#def cond_prob(word):
#    cprob = sorted([(k, v) for k, v in probs.items() if k[0] == word],
#                    key=lambda tup: tup[1], reverse=True)
#    return cprob
#cond_prob('master')

## Save all important parts

In [102]:
import pickle

In [105]:
# Bundle the two vocabulary mappings and the conditional-probability table,
# then serialize the bundle to lm.pkl.
language_model = {
    "word2id": word2id,
    "id2word": id2word,
    "cond_prop_sorted": cond_prop_sorted,
}
with open("lm.pkl", "wb") as fout:
    pickle.dump(language_model, fout)

### And finally, generating a sequence

In [77]:
def temperature(p, temperature):
    """
    Reshape a probability distribution with a softmax temperature.
    Each probability is raised to the power 1/temperature and the result is
    renormalized: values below 1 sharpen the distribution, values above 1
    flatten it, and 1 leaves it unchanged.
    :param p: array-like of probabilities
    :param temperature: positive scaling factor
    :return: float64 array of the reshaped probabilities, summing to 1
    """
    # Work in float64: the stored distributions are float32, where exp() of a
    # strongly negative scaled log underflows to 0 for every entry and the
    # normalization then produces 0/0 = NaN.
    scaled_logs = np.log(np.asarray(p, dtype=np.float64)) / temperature
    if scaled_logs.size:
        # Shifting by the maximum does not change the normalized result but
        # pins the largest term at exp(0) = 1, so the sum can never be 0.
        scaled_logs = scaled_logs - np.max(scaled_logs)
    weights = np.exp(scaled_logs)
    return weights / np.sum(weights)

In [132]:
TEMPERATURE=1.0
word = word2id['skåne']
print(id2word[word], end=' ')
for _ in range(100):
    # A word with no recorded continuation (its bigrams were too rare to
    # survive the cutoff) is replaced by "the" before looking up followers.
    current = word if word in cond_prop_sorted else word2id["the"]
    words, distribution = cond_prop_sorted[current]
    reshaped = temperature(distribution, TEMPERATURE)
    word = np.random.choice(words, size=1, p=reshaped)[0]
    print(id2word[word], end=' ')

skåne county pennsylvania public schools feed on april was not in american numbering was founded by a programmer who was added to date from wjla began writing development and eighteen more to switch to a four public library in the inspiring moments were as part of land in the top south coast of india he was generally use the village is the southwest of the annual gross came from to land bonded to a mechanic lien is a portion of parma which occurred only instance if the ancestor the same test scores in february his narrow gauge track and blow at 