In [11]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import re

In [12]:

#The following is done to tokenize the tweets into its appropriate form
#In particular, we try to capture some emoticons, HTML tags, Twitter @usernames (@-mentions), Twitter #hashtags, 
#URLs, numbers, words with and without dashes and apostrophes

#Source : https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [13]:
#Converting JSON format to a string containing all the tweets, which is then used to train and generate text

import json
complete_tweets = ""
text = ''
with open('euro_python.json', 'r') as f:
    for line in f:
        
        tweet = json.loads(line) # load it as Python dict
        tokens = preprocess(tweet['text'])
        
        for index,element in enumerate(tokens):
            
            #Removing '#' 
            if('#' in element):
                
                del tokens[index]
                text = text + ""
                continue
            
            
            #Removing the 'RT' tag
            elif('RT' in element):
                
                del tokens[index]
                text = text + ""
                continue
            
            #This character usually follows the 'RT' tag, so we remove it
            elif(':' in element):
                
                del tokens[index]
                text = text + ""
                continue
                
            text = text + " " + tokens[index]
        #text = text + '\n'

In [14]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 95


In [17]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 12555


In [23]:
text

' Packed room for @scott_triglia \' s talk about circuit breakers at learn about our for deployment , amp ; more https … " Here in Europe it\'s less a " hire-and-fire " mentality than in the US . " Iwan Gulenko at Rules of optimisation ) Don\'t 2 ) Don\'t . . yet 3 ) Profile of optimisation ) Don\'t 2 ) Don\'t . . yet 3 ) Profile Here in Europe it\'s less a " hire-and-fire " mentality than in the US . " Iwan Gulenko at Great slides from @deshipu . We\'re learning here how to make robots walk ! Should I optimize my code ? Do you need to optimize your code ? 1 . DON\'T , 2 . DON\'T . . . Yet , 3 . PROFILE Making robots walk " Don\'t copy big companies recruiting process . Show what * you * have . " Yes ! Iwan Gulenko at @mark_dedaj I\'m at Spain . Remember the " crappy hotel wifi " topic ? Vai ter Go na Europython . . . good and wise tips Will be Go at @europython Power consumption is a most needed part of Python code optimization gerade in der TT Deutschland angestellt worden , in der S

In [27]:
len(sentences[2])

40

In [6]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [7]:
# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [8]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [9]:
# train the model, output generated text after each iteration
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=2)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        print('\nGenerated')
        sys.stdout.write(generated)

        for i in range(20):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Epoch 1/2
Epoch 2/2

----- diversity: 0.2
----- Generating with seed: " Always an adventure . We laugh , we cry"

Generated
 Always an adventure . We laugh , we cry an in in an an an a

----- diversity: 0.5
----- Generating with seed: " Always an adventure . We laugh , we cry"

Generated
 Always an adventure . We laugh , we cry apo aning ator ang 

----- diversity: 1.0
----- Generating with seed: " Always an adventure . We laugh , we cry"

Generated
 Always an adventure . We laugh , we cry ?ou ? f ovopv 🙌ks @

----- diversity: 1.2
----- Generating with seed: " Always an adventure . We laugh , we cry"

Generated
 Always an adventure . We laugh , we cry y.asd on 😊s opAdiny

--------------------------------------------------
Iteration 2
Epoch 1/2
Epoch 2/2

----- diversity: 0.2
----- Generating with seed: "e it's less a " hire-and-fire " mentalit"

Generated
e it's less a " hire-and-fire " mentality for cede the talk 

----- 