In [1]:
!pip install bs4
import scipy.sparse
from sklearn.preprocessing import normalize
import numpy as np
import re
import warnings
import nltk
import pickle
from string import ascii_lowercase, digits
from bs4 import BeautifulSoup, NavigableString
from collections import Counter

class Text:
    def __init__(self, input_text, token2ind=None, ind2token=None):
        self.content = input_text
        self.tokens, self.tokens_distinct = self.tokenize()

        if token2ind != None and ind2token != None:
            self.token2ind, self.ind2token = token2ind, ind2token
        else:
            self.token2ind, self.ind2token = self.create_word_mapping(self.tokens_distinct)

        self.tokens_ind = [self.token2ind[token] if token in self.token2ind.keys() else self.token2ind['<| unknown |>']
                           for token in self.tokens]

    def __repr__(self):
        return self.content

    def __len__(self):
        return len(self.tokens_distinct)

    @staticmethod
    def create_word_mapping(values_list):
        values_list.append('<| unknown |>')
        value2ind = {value: ind for ind, value in enumerate(values_list)}
        ind2value = dict(enumerate(values_list))
        return value2ind, ind2value

    def preprocess(self):
        punctuation_pad = '!?.,:-;'
        punctuation_remove = '"()_\n'

        self.content_preprocess = re.sub(r'(\S)(\n)(\S)', r'\1 \2 \3', self.content)
        self.content_preprocess = self.content_preprocess.translate(str.maketrans('', '', punctuation_remove))
        self.content_preprocess = self.content_preprocess.translate(
            str.maketrans({key: ' {0} '.format(key) for key in punctuation_pad}))
        self.content_preprocess = re.sub(' +', ' ', self.content_preprocess)
        self.content = self.content_preprocess.strip()
        
    def tokenize(self):
        self.preprocess()
        tokens = self.content.split(' ')
        return tokens, list(set(tokens))

    def tokens_info(self):
        print('total tokens: %d, distinct tokens: %d' % (len(self.tokens), len(self.tokens_distinct)))
        
class Chain:
    def __init__(self, text_object, n=2):
        self.text_object = text_object
        self.n = n

        self.tokens, self.tokens_distinct = text_object.tokens, text_object.tokens_distinct
        self.ngrams, self.ngrams_distinct = self.create_ngrams()
        self.token2ind, self.ind2token = text_object.token2ind, text_object.ind2token
        self.ngram2ind, self.ind2ngram = text_object.create_word_mapping(self.ngrams_distinct)
        self.transition_matrix_prob = self.create_transition_matrix_prob()

    def create_ngrams(self):
        sequences = [self.tokens[i:] for i in range(self.n)]
        ngrams = [' '.join(ngram) for ngram in list(zip(*sequences))]
        return ngrams, list(set(ngrams))

    def tokens_info(self):
        self.text_object.tokens_info()

    def ngrams_info(self):
        print('ngrams level: %d, total ngrams: %d, distinct ngrams: %d' % (
        self.n, len(self.ngrams), len(self.ngrams_distinct)))

    def random_ngram(self):
        return np.random.choice(self.ngrams)

    def create_transition_matrix(self):
        row_ind, col_ind, values = [], [], []

        for i in range(len(self.tokens[:-self.n])):
            ngram = ' '.join(self.tokens[i:i + self.n])
            ngram_ind = self.ngram2ind[ngram]
            next_word_ind = self.token2ind[self.tokens[i + self.n]]

            row_ind.extend([ngram_ind])
            col_ind.extend([next_word_ind])
            values.extend([1])

        S = scipy.sparse.coo_matrix((values, (row_ind, col_ind)), shape=(len(self.ngram2ind), len(self.token2ind)))
        return S

    def create_transition_matrix_prob(self):
        transition_matrix = self.create_transition_matrix()
        return normalize(transition_matrix, norm='l1', axis=1)

    def check_prefix(self, prefix):
        prefix_list = prefix.split(' ')[-self.n:]
        if len(prefix_list) < self.n:
            warnings.warn(
                'Prefix is too short, please provide prefix of length: %d. Random ngram used instead.' % self.n)
            return self.random_ngram()
        else:
            prefix = ' '.join(prefix_list)
            if prefix in self.ngrams:
                return prefix
            else:
                warnings.warn(
                    'Prefix is not included in ngrams of the model. Provide another prefix. Random ngram used instead.')
                return self.random_ngram()

    @staticmethod
    def add_weights_temperature(input_weights, temperature):
        weights = np.where(input_weights == 0, 0, np.log(input_weights + 1e-10)) / temperature
        weights = np.exp(weights)
        return weights / np.sum(weights)

    @staticmethod
    def reverse_preprocess(text):
        text_reverse = re.sub(r'\s+([!?"\'().,;-])', r'\1', text)
        text_reverse = re.sub(' +', ' ', text_reverse)
        return text_reverse

    def return_next_word(self, prefix, temperature=1):
        prefix = self.check_prefix(prefix)
        prefix_ind = self.ngram2ind[prefix]
        weights = self.transition_matrix_prob[prefix_ind].toarray()[0]
        if temperature != 1:
            weights = self.add_weights_temperature(weights, temperature)

        token_ind = np.random.choice(range(len(weights)), p=weights)
        next_word = self.ind2token[token_ind]
        return next_word

    def generate_sequence(self, prefix, k, temperature=1):
        prefix = self.check_prefix(prefix)
        sequence = prefix.split(' ')

        for i in range(k):
            next_word = self.return_next_word(prefix, temperature=temperature)
            sequence.append(next_word)
            prefix = ' '.join(sequence[-self.n:])

        return self.reverse_preprocess(' '.join(sequence))

    def bulk_generate_sequence(self, prefix, k, samples, temperature=1):
        for i in range(samples):
            print(self.generate_sequence(prefix, k, temperature=temperature))
            print('\n')


# Loading and saving files

def read_txt(path):
    return open(path, 'r', encoding='utf-8').read()

def save_txt(text, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

def load_pickle(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def save_pickle(variable, path):
    with open(path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Web scraper functions

def exclude_black_list(content, black_list):
    return False if content.lower() in black_list else True

def between(start, end, exclude=[]):
    while start != end:
        if isinstance(start, NavigableString):
            yield start
        elif start.name in exclude:
            start = start.next_element
        start = start.next_element

def format_file_name(title, titles):
    allowed_letters = ascii_lowercase + digits + '_'

    title = title.split('\n')
    title = title[0] if len(title[0]) > 0 else title[1]
    title = title.strip().replace(' ', '_').lower()
    title = ''.join([letter for letter in title if letter in allowed_letters])
    titles.append(title)

    if title in titles:
        title = title + str(titles.count(title))

    return title + '.txt', titles

def format_text(start, end, exclude, num_words):
    text = ' '.join(t for t in between(start, end, exclude))
    text = '\n'.join(text.split("\n")[1:]).strip()
    num_words.append(len(text.split(' ')))
    return text, num_words

def words_summary(num_words):
    print('Number of unique files with fairy tales:', len(num_words))
    print('Total number of words in all fairy tales:', sum(num_words))
    print('Average number of words in a fairy tale: %d' % (sum(num_words)/len(num_words)))
    print('Number of words in the shortest story: %d, in the longest story: %d' % (min(num_words), max(num_words)))

def text_summary(text, exclude_words=[]):
    text = text.replace('\n', ' ').strip().split(' ')
    words_counter = Counter(text).most_common()
    unique_words = len(words_counter) - len(exclude_words)
    total_words = sum([occ for word, occ in words_counter if word not in exclude_words])

    print('Number of unique words:', unique_words)
    print('Total number of words:', total_words)

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 843 kB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l- \ done
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=705f2cf4ccfcda6c1183fb851bf130d0feb5d05243b0daacc129f507f75598e8
  Stored in directory: /root/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.2.1


In [2]:
import pandas as pd
df = pd.read_csv('../input/eluvio-dataset/Eluvio_DS_Challenge.csv')
#df = df.join(df.groupby('date_created')['up_votes'].mean(), on='date_created' ,rsuffix='_mean_date')
#df = df.join(df.groupby('date_created')['down_votes'].mean(), on='date_created', rsuffix='_mean_date')#
#df = df.join(df.groupby('author')['up_votes'].mean(), on='author', rsuffix='_mean_author')
#df = df.join(df.groupby('author')['down_votes'].mean(), on='author', rsuffix = '_mean_author')
df['title'].to_csv(r'title.txt', header=None, index=None, sep='\n', mode='a')
print('text file created')
path = 'title.txt'
input_text = read_txt(path)
tales_text = Text(input_text)
print('training chain models')
chain_model_n3 = Chain(tales_text, n=3)
print('model trained')
chain_model_n3.tokens_info()
chain_model_n3.ngrams_info()

text file created
training chain models
model trained
total tokens: 8102361, distinct tokens: 144261
ngrams level: 3, total ngrams: 8102359, distinct ngrams: 5770935


In [3]:
prefixes = ['the young man', 'Once upon a', 'Time passed ,']
temperatures = [1]
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model_n3.generate_sequence(np.random.choice(prefixes), 5, temperature=temperature))
    print('\n')

temperature: 1
Once upon a time La Dolce Vita Pics




In [4]:
chain_model_n5 = Chain(tales_text, n=5)
chain_model_n5.tokens_info()
chain_model_n5.ngrams_info()

prefixes_n5 = ['the rich men of the', 'Where are you going ?', 'Once upon a time there']
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model_n5.generate_sequence(np.random.choice(prefixes_n5), 8, temperature=temperature))
    print('\n')

total tokens: 8102361, distinct tokens: 144261
ngrams level: 5, total ngrams: 8102357, distinct ngrams: 7756811
temperature: 1




Kong Free Press Exposure to bright light can lead to greater sexual satisfaction




In [5]:
chain_model_n1 = Chain(tales_text, n=1)
chain_model_n1.tokens_info()
chain_model_n1.ngrams_info()

prefixes_n1 = ['Kuwait ', 'Spain', 'Moscow ']
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model_n1.generate_sequence(np.random.choice(prefixes_n1), 15, temperature=temperature))
    print('\n')

total tokens: 8102361, distinct tokens: 144261
ngrams level: 1, total ngrams: 8102361, distinct ngrams: 144261
temperature: 1
Spain passes anti- Qaeda Terrorists Was Typhoon Hagupit weakens, including the state sovereignty Erdogan




In [6]:
chain_model_n7 = Chain(tales_text, n=7)
chain_model_n7.tokens_info()
chain_model_n7.ngrams_info()

prefixes_n1 = ['Moscow ']
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model_n7.generate_sequence(np.random.choice(prefixes_n1), 9, temperature=temperature))
    print('\n')

total tokens: 8102361, distinct tokens: 144261
ngrams level: 7, total ngrams: 8102355, distinct ngrams: 7990827
temperature: 1




film that is changing Kurdistan Merkel s sharp call to Obama after German intelligence produces plausible




In [7]:
import time
import os
from joblib import dump, load
pickle_file = 'chain_model_n1.joblib'
start = time.time()
with open(pickle_file, 'wb') as f:
    dump(chain_model_n1, f, compress='zlib')
zlib_dump_duration = time.time() - start
print("Zlib dump duration: %0.3fs" % zlib_dump_duration)
zlib_file_size = os.stat(pickle_file).st_size / 1e6
print("Zlib file size: %0.3fMB" % zlib_file_size)

Zlib dump duration: 134.607s
Zlib file size: 109.649MB


In [8]:
pickle_file = 'chain_model_n3.joblib'
start = time.time()
with open(pickle_file, 'wb') as f:
    dump(chain_model_n3, f, compress='zlib')
zlib_dump_duration = time.time() - start
print("Zlib dump duration: %0.3fs" % zlib_dump_duration)
zlib_file_size = os.stat(pickle_file).st_size / 1e6
print("Zlib file size: %0.3fMB" % zlib_file_size)

Zlib dump duration: 308.770s
Zlib file size: 280.487MB


In [9]:
pickle_file = 'chain_model_n5.joblib'
start = time.time()
with open(pickle_file, 'wb') as f:
    dump(chain_model_n5, f, compress='zlib')
zlib_dump_duration = time.time() - start
print("Zlib dump duration: %0.3fs" % zlib_dump_duration)
zlib_file_size = os.stat(pickle_file).st_size / 1e6
print("Zlib file size: %0.3fMB" % zlib_file_size)

Zlib dump duration: 362.765s
Zlib file size: 329.151MB


In [10]:
pickle_file = 'chain_model_n7.joblib'
start = time.time()
with open(pickle_file, 'wb') as f:
    dump(chain_model_n7, f, compress='zlib')
zlib_dump_duration = time.time() - start
print("Zlib dump duration: %0.3fs" % zlib_dump_duration)
zlib_file_size = os.stat(pickle_file).st_size / 1e6
print("Zlib file size: %0.3fMB" % zlib_file_size)

Zlib dump duration: 369.668s
Zlib file size: 342.839MB
