In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
print(np.__version__, tf.__version__, keras.__version__)

1.18.5 2.5.0 2.5.0


In [3]:
import re

def clean_tweet(tweet: str) -> str:
    # take out URLs
    tweet = re.sub("http.*(\s|$)", ' ', tweet)
    # take out mentions
    tweet = re.sub("@[^\s]+", ' ', tweet)
    # take out hashtags
    tweet = re.sub("#[^\s]+", ' ', tweet)
    # take out all characters outside of those we enumerate
    tweet = re.sub("[^\da-zA-Z√°√©√≠√≥√∫√º√±√Å√â√ç√ë√ì√ö√ú¬ø?¬°!.,;#:<>()'‚Äú‚Äù\"\s]", ' ', tweet)
    # lowercase everything
    tweet = tweet.lower()
    # reset spaces
    tweet = re.sub("\s+", ' ', tweet)
    tweet = re.sub("^\s+", '', tweet)
    tweet = re.sub("\s$", '', tweet)
    return tweet

In [4]:
import random

# spotchecking loop to check that clean_tweet() works
with open('lid_train_lines.txt', 'r') as input_file:
    for line in input_file:
        if random.random() < 0.0005:
            line = line.strip()
            print(line)
            print(clean_tweet(line))

I want to go to san Antonio !!! and to hill country and san marcos too
i want to go to san antonio !!! and to hill country and san marcos too
@JimmyEsqueda YYAAAAAAAAAAAAAAAAAAAAAAAAAS ! üòÅ but hes gonna bite you üòîüêæ
yyaaaaaaaaaaaaaaaaaaaaaaaaas ! but hes gonna bite you
hoy es el ultimo sleepless sleepover:')
hoy es el ultimo sleepless sleepover:')
I feel horrible :(
i feel horrible :(
@adrianaaleee can u shut up ?
can u shut up ?
JAJAJAJAJAJAJA BYEEEE https://t.co/VnfjEiyasc
jajajajajajaja byeeee
" Houston hasta MX .. compralo vendelo SPM dejalo .. Y fierro en este juego necesitas huevos ! "
" houston hasta mx .. compralo vendelo spm dejalo .. y fierro en este juego necesitas huevos ! "
Sad but true üíÅ
sad but true
üòí



In [5]:
clean_lines = []

with open('lid_train_lines.txt', 'r') as input_file:
    for line in input_file:
        clean_lines.append(clean_tweet(line))

In [6]:
len(clean_lines)

21030

In [7]:
random.seed(a=50)

In [8]:
chosen_tweets = random.choices(clean_lines, k=100)

In [9]:
chosen_tweets

['ma√±ana viernes wilfrido vargas ..',
 'u love me',
 'so original .. .',
 'whats up with nature',
 'aguanta tu ritmo .',
 "i do n't like ( xxx ) people , they always think they 're right and do n't listen to anything else . ( testarudo a obstinado a tercho a )",
 'ni pinches moscas pescas tu mondaooo',
 'new week , be kind and sweet ! vestido sahara',
 'no puedo bregar con la gente stalker',
 'a cantarle las ma√±anitas ala virgencita',
 'el vlog de hoy aunque sea tarde , peque√±as cenas agradables : el d√≠a m√°s feliz de mi vida',
 'all bad',
 'no es sin√≥nimo pero significa lo mismo , i get out of work at 6 pm , creo q esto es lo q quer√≠as decir :)',
 'blusa violeta y sus diferentes estampados ! escoge tu color favorito made in ecuador',
 '¬°¬° nuevo v√≠deo !! en esta ocasi√≥n , un famoso presentador de tv me hace una dur√≠sima entrevista .. . ¬° se .. .',
 'get the hang of it : acostumbrarse a esto poder dominarlo person 1 : this new job is so stressful . person 2 : you ll get the 

In [10]:
for chosen_tweet in chosen_tweets:
    print (chosen_tweet)

ma√±ana viernes wilfrido vargas ..
u love me
so original .. .
whats up with nature
aguanta tu ritmo .
i do n't like ( xxx ) people , they always think they 're right and do n't listen to anything else . ( testarudo a obstinado a tercho a )
ni pinches moscas pescas tu mondaooo
new week , be kind and sweet ! vestido sahara
no puedo bregar con la gente stalker
a cantarle las ma√±anitas ala virgencita
el vlog de hoy aunque sea tarde , peque√±as cenas agradables : el d√≠a m√°s feliz de mi vida
all bad
no es sin√≥nimo pero significa lo mismo , i get out of work at 6 pm , creo q esto es lo q quer√≠as decir :)
blusa violeta y sus diferentes estampados ! escoge tu color favorito made in ecuador
¬°¬° nuevo v√≠deo !! en esta ocasi√≥n , un famoso presentador de tv me hace una dur√≠sima entrevista .. . ¬° se .. .
get the hang of it : acostumbrarse a esto poder dominarlo person 1 : this new job is so stressful . person 2 : you ll get the hang of it .
clase de text
voy a empezar a pedirle trabajos a 

In [11]:
DISACCENT_MAP = {'√°': 'a', '√©': 'e', '√≠': 'i', '√≥': 'o', '√∫': 'u', '√º': 'u', '√±': 'n'}

def drop_accents(tweet: [str], drop_probability: float=1.0) -> [str]:
    output = []
    for c in tweet:
        if c in DISACCENT_MAP and random.random() < drop_probability:
            output.append(DISACCENT_MAP[c])
        else:
            output.append(c)
    return output

In [12]:
example = 'jajajaajaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga esa'
''.join(drop_accents(example, 0.5))

'jajajaajaja yo odio la palabra " mocion " con mi vida entera despues de la estupidez de huelga esa'

In [13]:
len(clean_lines)

21030

In [14]:
VOWELS_SET = set("aeiou√°√©√≠√≥√∫√º")

def drop_vowels(tweet: [str], drop_probability: float=1.0) -> [str]:
    output = []
    for c in tweet:
        if c in VOWELS_SET and random.random() < drop_probability:
            continue
        output.append(c)
    return output

In [15]:
''.join(drop_vowels(example, 0.1))

'jajajajaja yo odo la palabr " moci√≥n " con m vida entera despu√©s de la estpidez de huelga esa'

In [16]:
def repeat_vowels(tweet: [str], repeat_probability=0.05, max_repeat=6) -> [str]:
    output = []
    for c in tweet:
        if c in VOWELS_SET and random.random() < repeat_probability:
            for _ in range(random.randint(1, max_repeat)):
                output.append(c)
        output.append(c)
    return output

In [17]:
''.join(repeat_vowels(example))

'jajajaajaaaaaaja yo odio la palabra " moci√≥n " con miiii vida entera despu√©s de la estupidez dee huelga esa'

In [18]:
CREATIVE_SUB_MAP = {'c': 's', 's': 'z', 'b': 'v', 'v': 'b'}

def substitute_creatively(tweet: [str], substitution_probability=0.1):
    output = []
    for c in tweet:
        if c in CREATIVE_SUB_MAP and random.random() < substitution_probability:
            output.append(CREATIVE_SUB_MAP[c])
        else:
            output.append(c)
    return output

In [19]:
''.join(substitute_creatively(example))

'jajajaajaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga eza'

In [20]:
def simulate_creative_misspellings(tweet: [str]) -> [str]:
    if random.random() < 0.7:
        return repeat_vowels(tweet)
    else:
        return substitute_creatively(tweet)

In [21]:
def simulate_intentional_shortenings(tweet: [str]) -> [str]:
    input_tokens = ''.join(tweet).split(' ')
    output_tokens = []
    for token in input_tokens:
        if random.random() < 0.3:
            output_tokens.append(''.join(drop_vowels(list(token), 0.5)))
        else:
            output_tokens.append(token)
    return list(' '.join(output_tokens))

In [22]:
''.join(simulate_intentional_shortenings(example))

'jajajaajaja yo odio l palabra " moci√≥n " con mi vida entera despus d la estupidez de huelga esa'

In [23]:
ALPHABET_L = list("abcdefghijklmnopqrstuvwxyz√°√©√≠√≥√∫√º√±")
ALPHABET_S = set(ALPHABET_L)
print(ALPHABET_L)
print(ALPHABET_S)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '√°', '√©', '√≠', '√≥', '√∫', '√º', '√±']
{'a', 'z', '√º', '√©', 'o', 'i', 'h', 's', 'r', 'l', 'v', 'w', '√≠', 'q', 'd', 'j', '√∫', '√±', 'm', 'x', 'f', '√≥', 'g', 'b', 'c', 'k', 'p', 'u', 't', 'n', '√°', 'y', 'e'}


In [24]:
def simulate_bona_fide_spelling_mistakes(tweet: [str], modify_rate=0.1) -> [str]:
    # equally add, omit, or substitute each character for a total probability of modify_rate
    add_cmf_val = modify_rate / 3
    omit_cmf_val = 2 * modify_rate / 3
    substitute_cmf_val = modify_rate
    
    output = []
    for c in tweet:
        r = random.random()
        if r < add_cmf_val:
            output.append(c)
            output.append(random.choice(ALPHABET_L))
        elif r < omit_cmf_val:
            continue
        elif c in ALPHABET_S and r < substitute_cmf_val:
            # we don't apply substitutions to non-alpha
            output.append(random.choice(ALPHABET_L))
        else:
            output.append(c)
    return output

In [25]:
''.join(simulate_bona_fide_spelling_mistakes(example))

'jajajaajaja vo odio la palabra " moci√≥n " con mi vida etea despu√©s dte lx esteupidez de huega esa'

In [26]:
def identity(tweet: [str]) -> [str]:
    return tweet

In [27]:
DEFAULT_WEIGHT_MAP = [[14, simulate_creative_misspellings],
                      [13, drop_accents],
                      [10, simulate_intentional_shortenings],
                      [8, simulate_bona_fide_spelling_mistakes],
                      [10, identity]]

class DisjointNoiser:
    def __init__(self, weight_map=DEFAULT_WEIGHT_MAP):
        self.weight_map = weight_map
        total = sum([self.weight_map[i][0] for i in range(len(self.weight_map))])
        # we normalize the pmf
        for i in range(len(self.weight_map)):
            self.weight_map[i][0] /= total
        # we make the pmf into a cmf
        for i in range(1, len(self.weight_map) - 1):
            self.weight_map[i][0] += self.weight_map[i - 1][0]
        self.weight_map[-1][0] = 1.0  # so as not to worry about rounding errors

    def add_noise(self, tweet: [str]) -> [str]:
        """Applies at most 1 kind of noising to the tweet according to the weights in the weight map.
        Each 'noising' could alter the tweet in multiple places or not at all.
        """
        tweet = list(tweet)
        my_random_number = random.random()
        for max_prob, noise_function in self.weight_map:
            if my_random_number < max_prob:
                # print('calling', noise_function)
                return noise_function(tweet)

In [28]:
noiser = DisjointNoiser()
for _ in range(10):
    noiser.add_noise('abcd')

In [29]:
noiser = DisjointNoiser()
for _ in range(10):
    print(''.join(noiser.add_noise(example)))
    print()

jjajjaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga esa

jajajaajaja yo odio l palabra " moci√≥n " con m vida entera dspu√©s de la stpidz de huelga esa

jajajaajaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga esa

jajajaajaja yo odio la palabra " mocion " con mi vida entera despues de la estupidez de huelga esa

jajajaajaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga esa

jajajaajaja yo odio la palabra " moci√≥n " con mi vida entera despu√©s de la estupidez de huelga esa

jajajajfja yo odio la ypalabra " oci√≥n " con mi vviada etera despu√©s de la estufpidez de huelga qsa

jajajaajaja yo odio la palabra " mci√≥n " con mi vd entera desps de la estupidez de hulga esa

jjajaajajay yo di√° la alabra " moci√≥n " con mi vida entera despu√©s de la estoupidez de huelga es

jajajaajavja y odiolj plabra " moci√≥n c" csn mi visa entera d√±espu√© de la estupidez de helga xsa


In [30]:
len(clean_lines)

21030

In [31]:
max(len(line) for line in clean_lines)

154

In [32]:
sum(len(line) for line in clean_lines) / len(clean_lines)

49.6810746552544

In [33]:
clean_medium_lines = [line for line in clean_lines if 10 < len(line) < 100]

In [34]:
len(clean_medium_lines)

16908

In [35]:
VOCAB_L = ALPHABET_L + list("0123456789¬ø?¬°!.,;#:<>()'‚Äú‚Äù\" ") + ['<GO>', '<EOT>', '<PAD>']
print(VOCAB_L)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '√°', '√©', '√≠', '√≥', '√∫', '√º', '√±', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '¬ø', '?', '¬°', '!', '.', ',', ';', '#', ':', '<', '>', '(', ')', "'", '‚Äú', '‚Äù', '"', ' ', '<GO>', '<EOT>', '<PAD>']


In [36]:
VOCAB_TO_INT = {v: i for i, v in enumerate(VOCAB_L)}

In [37]:
NUM_TOKENS = len(VOCAB_TO_INT)

In [38]:
example_ints = [VOCAB_TO_INT[c] for c in example]

In [39]:
print(example_ints)

[9, 0, 9, 0, 9, 0, 0, 9, 0, 9, 0, 60, 24, 14, 60, 14, 3, 8, 14, 60, 11, 0, 60, 15, 0, 11, 0, 1, 17, 0, 60, 59, 60, 12, 14, 2, 8, 29, 13, 60, 59, 60, 2, 14, 13, 60, 12, 8, 60, 21, 8, 3, 0, 60, 4, 13, 19, 4, 17, 0, 60, 3, 4, 18, 15, 20, 27, 18, 60, 3, 4, 60, 11, 0, 60, 4, 18, 19, 20, 15, 8, 3, 4, 25, 60, 3, 4, 60, 7, 20, 4, 11, 6, 0, 60, 4, 18, 0]


In [40]:
def pad_tweet_batch(tweet_batch, max_tweet_length_in_batch):
    """Pad tweets with <PAD> so that each sentence of a batch has the same length"""
    return [tweet + [VOCAB_TO_INT['<PAD>']] * (max_tweet_length_in_batch - len(tweet)) for tweet in tweet_batch]

In [41]:
def one_hot_encode(padded_batch):
    encoded_data = np.zeros(
        (len(padded_batch), len(padded_batch[0]), NUM_TOKENS), dtype="float32"
    )
    
    for i, padded_text in enumerate(padded_batch):
        for t, int_value in enumerate(padded_text):
            encoded_data[i, t, int_value] = 1.0
    
    return encoded_data

In [42]:
def get_batches(tweets, noiser, batch_size):
    
    for batch_i in range(0, len(tweets) // batch_size):
        start_i = batch_i * batch_size
        tweets_batch = tweets[start_i:start_i + batch_size]
        tweets_batch_noised = [noiser.add_noise(tweet) for tweet in tweets_batch]
        
        tweets_batch_ints = [[VOCAB_TO_INT[v] for v in tweet] for tweet in tweets_batch]
        tweets_batch_noised_ints = [[VOCAB_TO_INT[v] for v in tweet] for tweet in tweets_batch_noised]
        
        tweets_batch_eot = [tweet + [VOCAB_TO_INT['<EOT>']] for tweet in tweets_batch_ints]
        tweets_batch_delayed_eot = [[VOCAB_TO_INT['<GO>']] + tweet + [VOCAB_TO_INT['<EOT>']] for tweet in tweets_batch_ints]
        tweets_batch_noised_eot = [tweet + [VOCAB_TO_INT['<EOT>']] for tweet in tweets_batch_noised_ints]
        
        pad_tweets_batch = np.array(pad_tweet_batch(tweets_batch_eot,
                                                    1 + max([len(tweet) for tweet in tweets_batch_eot])))
        pad_tweets_delayed_batch = np.array(pad_tweet_batch(tweets_batch_eot,
                                                    max([len(tweet) for tweet in tweets_batch_delayed_eot])))
        pad_tweets_noised_batch = np.array(pad_tweet_batch(tweets_batch_noised_eot,
                                                           max([len(tweet) for tweet in tweets_batch_noised_eot])))
        
        pad_tweets_encoded_batch = one_hot_encode(pad_tweets_batch)
        pad_tweets_delayed_encoded_batch = one_hot_encode(pad_tweets_delayed_batch)
        pad_tweets_noised_encoded_batch = one_hot_encode(pad_tweets_noised_batch)
        
        yield pad_tweets_noised_encoded_batch, pad_tweets_encoded_batch, pad_tweets_delayed_encoded_batch

In [43]:
clean_sorted_medium_lines = sorted(clean_medium_lines, key=lambda t: len(t))

In [44]:
BATCH_SIZE = 4

gb = get_batches(clean_sorted_medium_lines, noiser, batch_size=BATCH_SIZE)

In [45]:
next(gb)

(array([[[0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [1., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0

In [46]:
# pad_tweets_noised_batch, pad_tweets_batch, pad_tweets_noised_lengths, pad_tweets_lengths = next(gb)

In [47]:
pad_tweets_noised_batch, pad_tweets_batch, pad_tweets_delayed_batch = next(gb)

In [48]:
pad_tweets_noised_batch

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.]]], dtyp

In [49]:
pad_tweets_batch

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]], dtyp

In [50]:
pad_tweets_delayed_batch

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]], dtyp

In [51]:
len(VOCAB_TO_INT)

64

In [52]:
VOCAB_TO_INT['a']

0

In [53]:
VOCAB_TO_INT['b']

1

In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM, RepeatVector

In [55]:
# model = Sequential()

# DROPOUT = RECURRENT_DROPOUT = 0.3
# HIDDEN_SIZE = 50

# encoder_inputs = keras.Input(shape=(None, len(VOCAB_TO_INT)))
#                kernel_initializer="he_normal", dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT))

# # output_len = 12

# # model.add(RepeatVector(output_len))

# LAYERS = 2

# for _ in range(LAYERS):
#     model.add(LSTM(HIDDEN_SIZE, return_sequences=True,
#               kernel_initializer="he_normal", dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT))

# model.add(Dense(len(VOCAB_TO_INT), kernel_initializer="he_normal"))

# model.add(Activation('softmax'))

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [56]:
latent_dim = 256

# Define an input sequence and process it.
encoder_inputs = keras.Input(shape=(None, NUM_TOKENS))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, NUM_TOKENS))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(NUM_TOKENS, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [57]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [58]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 64)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 64)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 328704      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  328704      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [59]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

In [60]:
tf.config.run_functions_eagerly(True)

In [61]:
len(clean_sorted_medium_lines)

16908

In [62]:
batches_gen = get_batches(clean_sorted_medium_lines, noiser, batch_size=100)

In [63]:
tf.config.run_functions_eagerly(True)

In [64]:
i = 0
while True:
    try:
        noised_tweets_batch, original_tweets_batch, original_tweets_delayed_batch = next(batches_gen)
        model.fit([noised_tweets_batch, original_tweets_delayed_batch], original_tweets_batch,
                  steps_per_epoch=10, epochs=1)
    except StopIteration:
        print('StopIteration')
        break
    print(i)
    i += 1

  "Even though the `tf.config.experimental_run_functions_eagerly` "


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88


89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
StopIteration


In [65]:
model.save("one_pass.h5")

In [102]:
model = keras.models.load_model("one_pass.h5")

In [103]:
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="decoder_state_input_h")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="decoder_state_input_c")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

In [104]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in VOCAB_TO_INT.items())
reverse_target_char_index = reverse_input_char_index

In [105]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # print('states values:', states_value)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, NUM_TOKENS))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, VOCAB_TO_INT['<GO>']] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > 100:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, NUM_TOKENS))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [106]:
batches_gen = get_batches(clean_sorted_medium_lines, noiser, batch_size=1)
for i in range(3):
    noised_tweets_batch, original_tweets_batch, original_tweets_delayed_batch = next(batches_gen)
    print("target clean tweet:", clean_sorted_medium_lines[i])
    decoded_tweet = decode_sequence(noised_tweets_batch)
    print("Decoded tweet:", decoded_tweet)

target clean tweet: cuando no ?


  "Even though the `tf.config.experimental_run_functions_eagerly` "


Decoded tweet: ¬°¬°¬°¬°¬°¬°¬°¬°<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
target clean tweet: graciass :)
Decoded tweet: ¬°¬°¬°¬°¬°¬°¬°¬°<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
target clean tweet: jajajajajja
Decoded tweet: ¬°¬°¬°¬°¬°¬°¬°¬°<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
