In [2]:
import pandas as pd
import nltk  
import numpy as np  

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop

import re
import random

# Trump example

Article
https://towardsdatascience.com/tweet-generation-with-neural-networks-lstm-and-gpt-2-e163bfd3fbd8

Repo
https://github.com/mm909/Predicting-Trump/blob/master/predict.py

1. Remove tweets whose lengths were less than 60 characters

2. Divide tweets into sequences of equal lengths and place them into a list called sentences

3. **Vectorization** is the next step. Here, we split the tweets into x and y evenly. x is a 3D matrix that has the shape of the total number of sentences, length of steps(40), and the number of unique characters. y is a 2D matrix that has the total length of sentences and unique characters and the purpose of this vector is to retrieve the next character that’s after a sentence given by x.

------------------


https://towardsdatascience.com/predicting-trump-tweets-with-a-rnn-95e7c398b18e

In [3]:
df = pd.read_csv('todes.csv')

## Text cleaning

Get full text from df:

In [4]:
text = ''
for tweet in df.full_text:
    text += tweet
    
text = text.lower()
text = text.replace('\n','')

Define cleaning functions:

In [11]:
def removeLinks(text):
    return re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)

def removeEmojis(text):
    return text.encode('ascii', 'ignore').decode('ascii')

def removeEllipsis(text):
    return re.sub('\.\.[\.]*', " ", text)

def removeParens(text):
    return re.sub("[\(\[].*?[\)\]]", "", text)

def removeLF(text):
    text = re.sub('\n',' ', text)
    text = re.sub(' [ ]*', ' ', text)
    return text

In [12]:
text = removeLinks(text)
text = removeEllipsis(text)
text = removeEmojis(text)
text = removeParens(text)
text = removeLF(text)

In [7]:
len(text)

207166

## Character Dictionaries

In [8]:
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
print('Unique Chars:', len(chars))

Unique Chars: 58


## Hyperparameters

In [56]:
sequence_length = 80
step_size = 4

## Encoding

In [57]:
sentences = []
next_chars = []
for i in range(0, len(text) - sequence_length, step_size):
    sentences.append(text[i: i + sequence_length])
    next_chars.append(text[i + sequence_length])

## Secuences hot encoding

In [58]:

X = np.zeros((len(sentences), sequence_length, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [59]:
print('X.shape:', X.shape)

print('Y.shape:', y.shape)

X.shape: (51772, 80, 58)
Y.shape: (51772, 58)


## Model

In [71]:


model = Sequential()

model.add(LSTM(len(chars) * 5, input_shape=(sequence_length, len(chars))))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars) * 2))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars) * 2))
model.add(BatchNormalization())
model.add(Activation('selu'))

model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


In [72]:
model.fit(X, y, validation_split=0.05, batch_size=124, epochs=4, shuffle=True)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x1a3f258310>

### From the Datacamp example

In [83]:


def generate_text(sentence, n):
    generated = sentence
#     generated += sentence
    for i in range(n):
    # Create a 3-D zero vector to contain the encoding of sentence.
        maxlen =40
        x_pred = np.zeros((1, sequence_length, len(chars)))
        
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.
            
        # Get probability distribution for the next character
        preds = model.predict(x_pred, verbose=0)[0]
        #print(preds)
    
        # Get the character with maximum probability
        next_index = np.argmax(preds)
        #next_index = random.sample(list(preds),0.7)
        next_index = random.choice(sorted(enumerate(preds),
                       key=lambda x: x[1]
                       )[-15:])[0]
# write an if statement to check if most recent character generated is a space, if it is then generate randomly next character
# from -15 to -1 (from some array without array)

        next_char = indices_char[next_index]
    
        # Append the new character to the next input and generated text
        sentence = sentence[1:] + next_char
        generated += next_char
    
    # Print the generated text
    print(generated)


In [84]:
# Input sequence and generate text
sentence = "esa gente debe parar"
generate_text(sentence, 20)

esa gente debe parary# ad#dctnsils tcdto
