In [7]:
import numpy as np
import keras

In [8]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop
import random
import sys
import re
import json

In [9]:
def file_to_string(filename, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform locale so that non-ASCII characters
            decode the same way on every operating system.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in `encoding`.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [10]:
def split_into_sentences(data):
    """Split a comma-separated run of ``{...}`` records into one string each.

    The split happens on every comma that directly follows a closing brace;
    the comma itself is discarded and the brace stays with its record.

    Args:
        data: Text holding records of the form ``{...},{...},``.

    Returns:
        A list of record strings. A blank final piece (what is left after a
        trailing comma) is removed; a non-blank final piece is kept.
    """
    sentences = re.split("(?<=}),", data)
    # Only discard the last piece when it really is the empty/whitespace
    # remainder after a trailing comma. Deleting it unconditionally would
    # throw away the final record of data that has no trailing comma.
    if sentences and not sentences[-1].strip():
        del sentences[-1]
    return sentences

In [11]:
def convert_to_json(data):
    """Decode every JSON document in `data`.

    Args:
        data: Iterable of strings, each holding one JSON document.

    Returns:
        A list with the decoded objects, in the same order as the input.
    """
    return [json.loads(raw) for raw in data]

In [12]:
def filter_titles(data, score=1):
    """Collect the titles of all records whose score reaches a threshold.

    Args:
        data: Iterable of mappings that each have a "score" and a "title" key.
        score: Inclusive lower bound; records with a smaller score are skipped.

    Returns:
        A list of the "title" values of the retained records, in input order.
    """
    return [post["title"] for post in data if post["score"] >= score]

In [13]:
def find_longest_shortest_title(data):
    """Return the lengths of the longest and the shortest item in `data`.

    Args:
        data: Non-empty collection of sized items (e.g. title strings).

    Returns:
        A ``(longest_length, shortest_length)`` tuple.

    Raises:
        ValueError: If `data` is empty.
    """
    shortest = min(map(len, data))
    longest = max(map(len, data))
    return longest, shortest

In [14]:
def find_avg_title_length(corpus, array):
    """Return the average number of corpus characters per entry of `array`.

    Args:
        corpus: Sized object whose length is the total character count.
        array: Sized collection whose length is the number of titles.

    Returns:
        ``len(corpus) / len(array)`` truncated to an int.

    Raises:
        ZeroDivisionError: If `array` is empty.
    """
    total_chars = len(corpus)
    title_count = len(array)
    return int(total_chars / title_count)

In [15]:
def get_final_corpus(data):
    """Concatenate all strings in `data` into one corpus string.

    The pieces are joined back-to-back with no separator between them.

    Args:
        data: Iterable of strings.

    Returns:
        The concatenation of every item, or '' when `data` is empty.
    """
    # str.join builds the result in one pass instead of creating a new,
    # ever-growing string on each loop iteration.
    return ''.join(data)

In [16]:
def get_unique_chars(data):
    """Return the distinct elements of `data` as a sorted list."""
    unique = list(set(data))
    unique.sort()
    return unique

In [17]:
def clean_corpus(exp, data):
    """Replace every match of the regular expression `exp` in `data` with a space.

    Args:
        exp: Regular expression describing the text to blank out.
        data: String to clean.

    Returns:
        A copy of `data` in which each match is replaced by one space.
    """
    pattern = re.compile(exp)
    return pattern.sub(' ', data)

In [18]:
def create_char_indices(chars):
    """Build the two lookup tables between characters and their positions.

    Args:
        chars: Sequence of characters; each one's position becomes its index.

    Returns:
        A ``(char_indices, indices_char)`` tuple: the first dict maps a
        character to its position, the second maps a position to its character.
    """
    char_to_index = {symbol: position for position, symbol in enumerate(chars)}
    index_to_char = {position: symbol for position, symbol in enumerate(chars)}
    return char_to_index, index_to_char

In [19]:
def split_into_chunks(array, maxlen, step, exp):
    """Cut every string into fixed-length windows plus the character that follows.

    Each string first has all matches of `exp` removed. Windows start at
    offsets 0, step, 2*step, ... and are only produced while a following
    character exists, so strings of length <= maxlen contribute nothing.

    Args:
        array: Iterable of strings to slice.
        maxlen: Length of each window.
        step: Distance between the starts of consecutive windows.
        exp: Regular expression whose matches are deleted before slicing.

    Returns:
        A ``(sentences, next_chars)`` tuple of equally long lists, where
        ``next_chars[k]`` is the character directly after ``sentences[k]``.
    """
    windows, targets = [], []

    for title in array:
        text = re.sub(exp, '', title)
        for start in range(0, len(text) - maxlen, step):
            stop = start + maxlen
            windows.append(text[start:stop])
            targets.append(text[stop])

    return windows, targets

In [20]:
def vectorization(sentences, maxlen, chars, char_indices, next_chars):
    """One-hot encode the text windows and the characters that follow them.

    Args:
        sentences: Sequence of strings, each at most `maxlen` characters long.
        maxlen: Size of the time axis of the input array.
        chars: Vocabulary; only its length is used, as the one-hot width.
        char_indices: Mapping from character to its one-hot column.
        next_chars: Sequence where ``next_chars[i]`` is the target character
            for ``sentences[i]``.

    Returns:
        A ``(X, y)`` tuple of boolean arrays. ``X`` has shape
        ``(len(sentences), maxlen, len(chars))`` and ``y`` has shape
        ``(len(sentences), len(chars))``.

    Raises:
        KeyError: If a character is missing from `char_indices`.
        IndexError: If a sentence is longer than `maxlen`.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    return X, y

In [21]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature rescaling.

    The log-probabilities are divided by `temperature`, exponentiated and
    renormalised; one index is then drawn from the resulting distribution.
    Temperatures below 1 sharpen the distribution, above 1 flatten it.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The sampled index.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # A single multinomial trial yields a one-hot row; argmax reads the index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [22]:
# Read the whole raw dump into memory as one string (path is relative to the working directory).
file_as_string = file_to_string('lifeprotips.txt')

In [23]:
# Sanity check: size of the raw dump in characters.
print("Num of chars:",len(file_as_string))

Num of chars: 2504497


In [24]:
# One string per record: the dump is split on each comma that follows a closing brace.
raw_titles = split_into_sentences(file_as_string)

In [25]:
# Sanity check: number of record strings produced by the split.
print("Number of titles: ",len(raw_titles))

Number of titles:  20218


In [26]:
# Parse every record string with json.loads so its fields can be read by key.
json_sentences = convert_to_json(raw_titles)

In [27]:
# Sanity check: one parsed object per record string, so this should equal the count above.
print("Num of json objects:", len(json_sentences))

Num of json objects: 20218


In [28]:
# Minimum score (inclusive: filter_titles compares with >=) a post needs for its title to be kept.
score = 1
valid_titles = filter_titles(json_sentences, score=score)

In [29]:
# Note: the label says "above", but filter_titles keeps every score >= the threshold.
print("Titles above score {}: {}".format(score, len(valid_titles)))

Titles above score 1: 16276


In [30]:
# Both values are lengths in characters (longest first), not the title strings themselves.
longest_title, shortest_title = find_longest_shortest_title(valid_titles)

In [31]:
# Report the extreme title lengths, measured in characters.
print("Longest title:", longest_title)
print("Shortest title:", shortest_title)

Longest title: 300
Shortest title: 12


In [32]:
# Concatenate all kept titles back-to-back (no separator) into a single corpus string.
final_corpus = get_final_corpus(valid_titles)

In [33]:
# Sanity check: total number of characters across the kept titles.
print("Length of final corpus:",len(final_corpus))

Length of final corpus: 1593005


In [34]:
# Corpus characters divided by number of titles, truncated to an int;
# reused further down as the number of characters to generate.
avg_title_length = find_avg_title_length(final_corpus, valid_titles)

In [35]:
# Report the truncated average title length in characters.
print("Average Title Length:", avg_title_length)

Average Title Length: 97


In [36]:
# Vocabulary before cleaning: every distinct character of the corpus, sorted.
allchars = get_unique_chars(final_corpus)

In [37]:
# Dump the raw vocabulary; the listing includes zero-width and control characters
# (e.g. '\u200b', '\ufeff', '\x7f') that are stripped in a later cell.
print("All chars", allchars)

All chars [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '£', '°', '´', '½', 'ã', 'è', 'é', 'í', 'ñ', 'Δ', '\u200b', '\u200e', '–', '—', '‘', '’', '“', '”', '…', '∗', '⌘', '✓', '\ufeff', '🎶']


In [38]:
# Size of the vocabulary before cleaning.
print("Length of all chars:", len(allchars))

Length of all chars: 120


In [39]:
# Characters to strip: zero-width space, left-to-right mark, byte-order mark and DEL.
# NOTE(review): inside a [...] character class '|' is a literal character, not
# alternation, so this pattern also matches the pipe character itself. That is
# why the vocabulary shrinks by five entries (120 -> 115) rather than four.
# Removing the '|' separators from the class would keep pipes in the text, but it
# also changes the vocabulary size an already-trained model expects - confirm first.
exp = '[\u200b|\u200e|\ufeff|\x7f]'
# clean_corpus swaps each match for one space, so the corpus length does not change.
cleaned_corpus = clean_corpus(exp, final_corpus)

In [40]:
# Expected to equal the uncleaned length: every single-character match was replaced by one space.
print("Length of cleaned corpus:", len(cleaned_corpus))

Length of cleaned corpus: 1593005


In [41]:
# Final vocabulary: the distinct characters left after cleaning, sorted.
chars = get_unique_chars(cleaned_corpus)

In [42]:
# Size of the cleaned vocabulary; this is the width of the one-hot encoding.
print("Length of Cleaned chars:", len(chars))

Length of Cleaned chars: 115


In [43]:
# Lookup tables in both directions: character -> index and index -> character.
char_indices, indices_char = create_char_indices(chars)

In [44]:
# Length, in characters, of each input window.
maxlen = 20
# Distance between the start positions of consecutive windows.
step = 3

# Windows are cut title by title (after deleting `exp` matches), so none spans two titles.
sentences, next_chars = split_into_chunks(valid_titles, maxlen, step, exp)

In [39]:
# Number of training windows produced.
print("Number of sentence chunks:", len(sentences))

Number of sentence chunks: 427895


In [40]:
# Must equal the number of windows: each window has exactly one target character.
print("Number of next chars", len(next_chars))

Number of next chars 427895


In [41]:
# One-hot encode the windows (X) and the character following each window (y).
X, y = vectorization(sentences, maxlen, chars, char_indices, next_chars)

In [42]:
# Axes are (number of windows, maxlen, vocabulary size).
print("Shape of training inputs:", X.shape)

Shape of training inputs: (427895, 20, 115)


In [43]:
# Axes are (number of windows, vocabulary size).
print("Shape of training labels", y.shape)

Shape of training labels (427895, 115)


In [1]:
import keras
from keras.models import load_model

Using TensorFlow backend.


In [2]:
# Load a previously saved model from disk.
# NOTE(review): train() in this notebook only writes weight files named
# 'lifeprotips_<n>.h5' via save_weights, so this file is presumably produced
# elsewhere - confirm where 'lifeprotips_model.h5' comes from.
predict_model = load_model('lifeprotips_model.h5')

In [3]:
def predict(model, seed, maxlen, length, char_indices, indices_char):
    """Generate text from a trained character model and stream it to stdout.

    For each temperature in (0.5, 1.0) the seed is echoed and `length` further
    characters are sampled one at a time, each fed back as input for the next.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns, for each input
            row, one probability per vocabulary entry.
        seed: Text the generation starts from. Only its last `maxlen`
            characters are fed to the model.
        maxlen: Number of time steps in the model's input window.
        length: Number of characters to generate after the seed.
        char_indices: Mapping from character to its one-hot column.
        indices_char: Mapping from a sampled column index to its character.

    Returns:
        None. The generated text is written to stdout only.

    Raises:
        KeyError: If the seed contains a character missing from `char_indices`.
    """
    # Size the one-hot axis from the mapping that was passed in rather than
    # from a notebook-level `chars` variable, which may not exist in a fresh
    # kernel that only loaded a saved model.
    vocab_size = len(char_indices)

    for temp in [0.5, 1.0]:
        print()
        print("----Temperature", temp)

        print("Using seed: ", seed)
        sys.stdout.write(seed)

        # The input holds at most maxlen steps; keep only the tail of a longer
        # seed so encoding it cannot index past the window.
        sentence = seed[-maxlen:]

        for _ in range(length):
            x = np.zeros((1, maxlen, vocab_size))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, temp)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


----Temperature 0.5
Using seed:  LPT: if you have to 
LPT: if you have to take a scenting in the something you w



ill cards with it with a line before you preparine shower a date the paper on your

----Temperature 1.0
Using seed:  LPT: if you have to 
LPT: if you have to have to plicad lid of the best pead open, small use your eastling out and ask you teesed of friend so if anal splaids of


In [80]:
def build_model(maxlen, chars):
    """Create and compile the character-level LSTM network.

    The network stacks two 128-unit LSTM layers (the first returns the full
    sequence so the second can consume it), followed by a dense layer with one
    output per vocabulary entry and a softmax activation. It is compiled with
    categorical cross-entropy loss and RMSprop at a learning rate of 0.01.

    Args:
        maxlen: Number of time steps in each input window.
        chars: Vocabulary; only its length is used.

    Returns:
        The compiled model.
    """
    vocab_size = len(chars)
    layers = [
        LSTM(128, input_shape=(maxlen, vocab_size), return_sequences=True),
        LSTM(128),
        Dense(vocab_size),
        Activation('softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
    return model

In [86]:
def train(model, X, y, seed, maxlen, length, ep):
    """Train the model epoch by epoch, printing sample text and saving weights.

    After every epoch, text is generated from `seed` at temperatures 0.2, 0.5,
    1.0 and 1.2 and streamed to stdout; then the weights are saved to
    ``lifeprotips_<epoch>.h5``.

    Relies on the notebook-level names `chars`, `char_indices` and
    `indices_char` for the vocabulary and its lookup tables.

    Args:
        model: Compiled model exposing ``fit``, ``predict`` and ``save_weights``.
        X: Training inputs handed to ``model.fit``.
        y: Training targets handed to ``model.fit``.
        seed: Text every generated sample starts from.
        maxlen: Number of time steps in the model's input window.
        length: Number of characters to generate per sample.
        ep: Number of epochs to run; epochs are numbered 1..ep.

    Returns:
        None.
    """
    vocab_size = len(chars)

    # range(1, ep + 1) so that exactly `ep` epochs run (range(1, ep) stops one short).
    for epoch in range(1, ep + 1):
        print()
        print('-'*50)
        print("Iteration", epoch)
        model.fit(X, y, batch_size=128, epochs=1)

        for temp in [0.2, 0.5, 1.0, 1.2]:
            print()
            print("----Temperature", temp)

            sentence = seed
            print("Using seed: ", seed)
            sys.stdout.write(seed)

            # `_` keeps this loop from overwriting the epoch counter, which is
            # still needed for the weights filename below.
            for _ in range(length):
                # The shape must be passed as one tuple; separate positional
                # arguments would be read as dtype/order and raise TypeError.
                x = np.zeros((1, maxlen, vocab_size))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1

                preds = model.predict(x, verbose=0)[0]
                next_index = sample(preds, temp)
                next_char = indices_char[next_index]

                # Slide the window: drop the oldest character, append the new one.
                sentence = sentence[1:] + next_char
                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()

        # format() handles the int epoch; 'str' + int would raise TypeError.
        filename = 'lifeprotips_{}.h5'.format(epoch)
        model.save_weights(filename)
            

In [81]:
# Build a fresh, untrained network sized for the window length and vocabulary.
model = build_model(maxlen, chars)

In [85]:
# Text every generated sample starts from.
# NOTE(review): this seed is 4 characters, shorter than maxlen (20). train() slides
# the window by dropping one character and appending one, so the model input stays
# 4 characters wide with the remaining steps left as zeros - confirm that is intended.
seed = 'LPT:'
# Generate about one average title's worth of characters per sample.
length = avg_title_length
# Upper bound handed to train() for its epoch loop.
ep = 60

In [None]:
# Long-running: fits one epoch at a time and prints sampled text after each.
train(model, X, y, seed, maxlen, length, ep)