In [695]:
import pandas as pd
import numpy as np
import re

In [781]:
# Dimensionality of each Word2Vec embedding (passed as vector_size).
VECTOR_SIZE = 50
# Number of consecutive tokens used as model input; each training window
# holds INPUT_TOKEN_SIZE + 1 tokens, the last one being the target.
INPUT_TOKEN_SIZE = 5
# Context window passed to Word2Vec (window=...).
WINDOW_SIZE = 5

In [782]:
# Read the tweets CSV and keep only its `text` column as a plain Python list.
data = pd.read_csv('data/new_locotamerzoa_tweets.csv').text.tolist()

In [783]:
def filter_rt(string):
    """Tell whether a tweet is a retweet, i.e. whether its text begins with "RT"."""
    is_retweet = string.startswith("RT")
    return is_retweet

def strip_replies(string):
    """Remove @mentions and links from a tweet.

    Args:
        string: Raw tweet text.

    Returns:
        The text with every ``@handle`` and every ``http:``/``https:`` URL
        (up to the next whitespace character) deleted. Whitespace around the
        removed pieces is left as it was.
    """
    # Raw strings: "\@" and "\:" are invalid escape sequences in ordinary
    # string literals and raise a SyntaxWarning on recent Python versions.
    # The patterns match exactly what the escaped versions matched.
    string = re.sub(r'@\w+', '', string)
    string = re.sub(r'http[s]*:\S+', '', string)
    return string

def strip_spaces(string):
    """Normalise whitespace: drop it at both ends and squeeze every inner run to one space."""
    # split() without arguments already ignores leading/trailing whitespace.
    words = string.split()
    return " ".join(words)

def filter_data(data):
    """Clean a list of tweets.

    Retweets (as judged by ``filter_rt``) are dropped; every remaining tweet
    has its mentions/links removed and its whitespace normalised.

    Args:
        data: Iterable of tweet strings.

    Returns:
        A new list with the cleaned tweets, in their original order.
    """
    cleaned = []
    for tweet in data:
        if filter_rt(tweet):
            continue
        cleaned.append(strip_spaces(strip_replies(tweet)))
    return cleaned

In [784]:
# Preview the cleaned tweets (retweets dropped, mentions/links removed, whitespace collapsed).
filter_data(data)

['🥵🥵🥵',
 'foda q o pescoço..',
 '12:39 ainda nao reclamei de nada libertador ...',
 'cade',
 'simmm',
 '🥵🥵🥵🥵',
 'a sua nao ouvi o desfecho da sua boca!!',
 'um brinde a minha intuição 🥂',
 'eh a lei do duende',
 'vc tem o mesmo nome da gata da minha vó',
 'todo cabofriense tem cara de que Perdeu Tudo na gas consultoria',
 'VAI TOMAR NO CUUUUU',
 'u iu ii. i iuuiii',
 'bot:prompt 2pac mushroom boy',
 '😢😢😢😢😢👏🏼👏🏼👏🏼👏🏼👏🏼',
 'amoo claudinha minha orientadora o design desistiu dela ou ela desistiu do design',
 'quem eh esse tal de edipo ele quer me comer',
 'nem no job novo porra!!',
 'ai joga pro místico e firma com os olhos "espelho quebrado anos de azar"',
 'coloque-os ali no lugar de pés',
 'eh um convite a auto reflexão',
 'dona esse fogao ta quase bom so nao presta 3 boca',
 'desculpe senhora Nao gosto de pes',
 'eu a cada 3 meses',
 'amo vc!!! e seu pombo novo',
 'eu ami essa frase',
 'homem eh igual fone de ouvido comprei um airdots da redmi falso e fui ressarcido pelo marketplace da 

In [785]:
# Keep the cleaned tweets; the cells below build their statistics and the corpus from this list.
filtered = filter_data(data)

In [786]:
# NOTE(review): this counts the raw CSV rows (`data`), retweets included, whereas the
# token statistics below are computed on `filtered` — confirm which count is intended.
print('sentences', len(data))

sentences 3217


In [787]:
# Vocabulary size: distinct whitespace-separated tokens across the cleaned tweets.
tokens = {token for sentence in filtered for token in sentence.split()}
print('unique tokens', len(tokens))

unique tokens 6734


In [788]:
# Corpus size: total number of whitespace-separated tokens across the cleaned tweets.
tokens = sum(len(sentence.split()) for sentence in filtered)
print('tokens', tokens)

tokens 20760


In [789]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Whitespace-tokenise every cleaned tweet; this list of token lists is the Word2Vec corpus.
tokenize = [el.split() for el in filtered]
# min_count=1: presumably so that every token gets a vector, since to_vec later
# looks each token up directly — confirm.
word2vecmodel = Word2Vec(sentences=tokenize, vector_size=VECTOR_SIZE, window=WINDOW_SIZE, min_count=1, workers=4)
# Persist the model to disk next to the notebook.
word2vecmodel.save("word2vec.model")

In [790]:
# Inspect the embedding learned for a single token.
word2vecmodel.wv["rodrigo"]

array([-1.92692149e-02, -4.92732972e-04,  7.13205989e-03,  4.98202862e-03,
        6.38224510e-03, -3.08877532e-03,  1.76255517e-02, -9.02063213e-03,
        5.02520939e-03,  1.80281624e-02, -1.17477505e-02, -1.43473679e-02,
       -2.87673157e-03, -1.02068707e-02, -3.86691443e-03, -8.15747990e-06,
       -2.37499597e-04, -8.26955959e-03,  4.54729749e-03,  6.76459027e-03,
       -1.26251122e-02, -1.50042784e-03,  4.63921065e-03, -1.51749386e-03,
       -1.24564786e-02, -1.03594153e-03,  4.41692118e-03, -3.47987260e-03,
        9.35728941e-03, -1.33218849e-02, -1.14282435e-02,  8.55650287e-03,
        1.83472261e-02,  1.60617486e-03, -1.00993374e-02,  1.44411996e-02,
        2.24970188e-03,  1.47100259e-03,  1.08733065e-02, -2.06863065e-03,
       -1.73499179e-03, -7.80369574e-03,  1.11945262e-02, -1.19739026e-02,
       -2.33176490e-03,  5.54237096e-03,  1.94345263e-03, -5.17491484e-04,
        1.64811220e-02,  2.00692229e-02], dtype=float32)

In [791]:
# Sanity check of the embedding space: the ten nearest neighbours of 'tamer'.
word2vecmodel.wv.most_similar('tamer', topn=10)

[('milenares', 0.5590617060661316),
 ('chicago', 0.48981714248657227),
 ('ciclo', 0.4631543755531311),
 ('letras', 0.4432130753993988),
 ('LIVRE', 0.43657276034355164),
 ('explica', 0.42867186665534973),
 ('abobado', 0.4201488196849823),
 ('vcs!!', 0.41772836446762085),
 ('abuso', 0.4150388240814209),
 ('fom', 0.4141755700111389)]

In [792]:
def to_input(tokens):
    """Cut a token list into overlapping training windows.

    Every window holds ``INPUT_TOKEN_SIZE + 1`` consecutive tokens and starts
    one position after the previous one.

    Args:
        tokens: Sequence of tokens belonging to one tweet.

    Returns:
        A list of windows (slices of ``tokens``); empty when the tweet has
        ``INPUT_TOKEN_SIZE`` tokens or fewer.
    """
    width = INPUT_TOKEN_SIZE + 1
    n_windows = len(tokens) - INPUT_TOKEN_SIZE
    return [tokens[start:start + width] for start in range(n_windows)]

In [793]:
# Flatten the per-tweet windows produced by to_input into one list of windows.
tokens_sentence = [window for token in tokenize for window in to_input(token)]

In [794]:
# Eyeball the generated token windows.
tokens_sentence

[['12:39', 'ainda', 'nao', 'reclamei', 'de', 'nada'],
 ['ainda', 'nao', 'reclamei', 'de', 'nada', 'libertador'],
 ['nao', 'reclamei', 'de', 'nada', 'libertador', '...'],
 ['a', 'sua', 'nao', 'ouvi', 'o', 'desfecho'],
 ['sua', 'nao', 'ouvi', 'o', 'desfecho', 'da'],
 ['nao', 'ouvi', 'o', 'desfecho', 'da', 'sua'],
 ['ouvi', 'o', 'desfecho', 'da', 'sua', 'boca!!'],
 ['um', 'brinde', 'a', 'minha', 'intuição', '🥂'],
 ['vc', 'tem', 'o', 'mesmo', 'nome', 'da'],
 ['tem', 'o', 'mesmo', 'nome', 'da', 'gata'],
 ['o', 'mesmo', 'nome', 'da', 'gata', 'da'],
 ['mesmo', 'nome', 'da', 'gata', 'da', 'minha'],
 ['nome', 'da', 'gata', 'da', 'minha', 'vó'],
 ['todo', 'cabofriense', 'tem', 'cara', 'de', 'que'],
 ['cabofriense', 'tem', 'cara', 'de', 'que', 'Perdeu'],
 ['tem', 'cara', 'de', 'que', 'Perdeu', 'Tudo'],
 ['cara', 'de', 'que', 'Perdeu', 'Tudo', 'na'],
 ['de', 'que', 'Perdeu', 'Tudo', 'na', 'gas'],
 ['que', 'Perdeu', 'Tudo', 'na', 'gas', 'consultoria'],
 ['amoo', 'claudinha', 'minha', 'orientadora',

In [795]:
def to_vec(tokens, model):
    """Look up the embedding of every token.

    Args:
        tokens: Iterable of tokens.
        model: Object whose ``wv`` attribute can be indexed by token.

    Returns:
        A list with ``model.wv[token]`` for each token, in order.
    """
    vectors = []
    for token in tokens:
        vectors.append(model.wv[token])
    return vectors

In [796]:
# Replace every token in every window by its embedding and stack the result into one array.
# NOTE(review): this overwrites `tokens_sentence` with its own vectorised form, so
# re-running the cell would pass vectors (not tokens) to to_vec — run it once per kernel.
tokens_sentence = np.array([np.array(to_vec(tokens, word2vecmodel)) for tokens in tokens_sentence])

In [797]:
# Axes: (window, position inside the window, embedding component).
tokens_sentence.shape

(10012, 6, 50)

In [798]:
# Model inputs: the first INPUT_TOKEN_SIZE embeddings of every window.
X = tokens_sentence[:, :INPUT_TOKEN_SIZE]

In [799]:
# Regression target: the embedding at index INPUT_TOKEN_SIZE, i.e. the token that follows the inputs.
y = tokens_sentence[:, INPUT_TOKEN_SIZE]

In [800]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

In [826]:
# Next-embedding regressor: three stacked LSTM layers followed by two Dense layers,
# with Dropout(0.2) after every layer except the output.
model = Sequential()
# Input shape is (timesteps, features) taken from X; return_sequences=True hands
# the whole sequence on to the next LSTM.
model.add(LSTM(64, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
# Last recurrent layer: return_sequences is left at its default here.
model.add(LSTM(64))
model.add(Dropout(0.2))
# No activation is specified on either Dense layer.
model.add(Dense(64))
model.add(Dropout(0.2))
# Output width equals the length of a target vector (y.shape[1]).
model.add(Dense(y.shape[1]))

In [827]:
# Mean squared error between predicted and target embeddings, optimised with Adam.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Train on all windows: up to 100 epochs, mini-batches of 32.
model.fit(X, y, epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100

In [813]:
# Spot-check the network on three training windows (rows 3, 45 and 880).
X_ = X[[3, 45, 880]]
y_ = y[[3, 45, 880]]
pred = model.predict(X_)

In [814]:
# One predicted embedding per sampled window.
pred.shape

(3, 50)

In [815]:
def vec_to_word(array):
    """Translate embeddings back into vocabulary words.

    Args:
        array: Iterable of "sentences", each an iterable of embedding vectors.

    Returns:
        A list of lists with, for every vector, the single most similar word
        according to the notebook-level ``word2vecmodel``.
    """
    def nearest(vector):
        # most_similar returns (word, similarity) pairs; keep the top word only.
        best = word2vecmodel.wv.most_similar(positive=[vector], topn=1)
        return best[0][0]

    return [[nearest(vector) for vector in sentence] for sentence in array]

In [816]:
# Decode the sampled input windows back into words.
vec_to_word(X_)

[['a', 'sua', 'nao', 'ouvi', 'o'],
 ['fogao', 'ta', 'quase', 'bom', 'so'],
 ['amor', 'dedicado', 'a', 'pessoa', 'q']]

In [817]:
# pred holds one vector per sample; wrapping it in a list makes vec_to_word
# decode the three predictions as if they were one "sentence".
vec_to_word([pred])

[['de', 'de', 'de']]

In [818]:
# Decode the true next-token embeddings of the same samples for comparison.
vec_to_word([y_])

[['desfecho', 'nao', 'criou']]

In [823]:
def generate_tweet(n_words=10, topn=10, verbose=True):
    """Generate text by repeatedly predicting and sampling the next word.

    Starts from a randomly chosen training window in ``X`` and, at each step,
    asks ``model`` for the next embedding, then samples one of the ``topn``
    vocabulary words closest to it (weighted by similarity) and appends its
    embedding to the sentence.

    Args:
        n_words: How many words to append to the seed window.
        topn: How many nearest words to sample from at each step.
        verbose: When true, print the decoded sentence before every step.

    Returns:
        The decoded sentence as returned by ``vec_to_word`` (a list holding
        one list of words).
    """
    sentence = X[np.random.randint(0, len(X))]
    for _ in range(n_words):
        if verbose:
            print(vec_to_word([sentence]))
        # Feed exactly as many timesteps as the training windows had; the
        # previous hard-coded 3 did not match INPUT_TOKEN_SIZE.
        context = sentence[-INPUT_TOKEN_SIZE:]
        pred = model.predict(np.array([context]))
        # Pass the single predicted vector in a list, the same way
        # vec_to_word queries the embedding space.
        candidates = word2vecmodel.wv.most_similar(positive=[pred[0]], topn=topn)
        words = [word for word, _score in candidates]
        # Similarities can be negative, which np.random.choice rejects as
        # probabilities: clamp them at zero before normalising.
        weights = [max(float(score), 0.0) for _word, score in candidates]
        total = sum(weights)
        # If nothing has positive similarity, fall back to a uniform draw.
        probs = [weight / total for weight in weights] if total > 0 else None
        word = np.random.choice(words, p=probs)
        next_vector = np.array([word2vecmodel.wv[word]])
        sentence = np.concatenate((sentence, next_vector), axis=0)
    return vec_to_word([sentence])

In [825]:
# Generate one sample; the intermediate sentences are printed by the function itself.
generate_tweet()

[['um', 'cara', 'q', 'sempre', 'gostei']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a', 'na']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a', 'na', 'e']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a', 'na', 'e', 'que']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a', 'na', 'e', 'que', 'de']]
[['um', 'cara', 'q', 'sempre', 'gostei', 'de', 'de', 'e', 'a', 'na', 'e', 'que', 'de', 'que']]


[['um',
  'cara',
  'q',
  'sempre',
  'gostei',
  'de',
  'de',
  'e',
  'a',
  'na',
  'e',
  'que',
  'de',
  'que',
  'que']]