In [157]:
# import libraries
import time
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import zipfile
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Dense, Dropout, Embedding, Conv1D, MaxPooling1D, BatchNormalization, TimeDistributed, Flatten

from sklearn.model_selection import train_test_split

In [None]:
#these two lines only in colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
pfad = "C:/Users/sebas/Downloads/1daee22f3f13fe6bc6a343f829565759-3511dc6de6a7bf064c168b4f20b85a20d8f83b91.zip"
with zipfile.ZipFile(pfad, 'r') as zip_ref:
    zip_ref.extractall('C:/Users/sebas/Downloads/')

In [None]:
folder = '/content/gdrive/My Drive/corpus/'

In [None]:
folder = ('C:\\Users\\sebas\\Google Drive\\corpus\\').replace('\\','/') #this only in local device

In [128]:
filenames = os.listdir(folder)
print(filenames)

text_file = 'top-10000-passwords.txt' #'yahoo.txt' #'cien_annos_soledad.txt'

['PRUEBAW2V.txt', 'cien-años-de-soledad.txt', 'articulo-wikipedia.txt', 'cat-and-dogs.txt', 'el-principito.txt', 'wikipedia_perro.txt', 'la-isla-del-tesoro.txt', 'jesica-cardiologia.txt', 'el-quijote.txt', 'los-tres-mosqueteros.txt', 'cien_annos_soledad.txt', 'Texto.txt', 'yahoo.txt', 'top-10000-passwords.txt']


In [None]:

with open(folder+'Texto.txt', 'w') as outfile:
    for fname in filenames:
        if fname != 'Texto.txt':
            with open(folder+fname, encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line)

In [129]:
#text = open(folder+'la_biblioteca_de_babel.txt', encoding = 'utf-8').read().lower()
text = open(folder+text_file, encoding = 'utf-8').read().lower()
text = text.split() #this makes sure to pick up the words
text = ' '.join(text) #this takes all the letters without the separators

In [130]:
corpus_length = len(text) #number of words
print('Number of words:',corpus_length)

Number of words: 76507


In [131]:
text[:600]

'123456 password 12345678 qwerty 123456789 12345 1234 111111 1234567 dragon 123123 baseball abc123 football monkey letmein 696969 shadow master 666666 qwertyuiop 123321 mustang 1234567890 michael 654321 pussy superman 1qaz2wsx 7777777 fuckyou 121212 000000 qazwsx 123qwe killer trustno1 jordan jennifer zxcvbnm asdfgh hunter buster soccer harley batman andrew tigger sunshine iloveyou fuckme 2000 charlie robert thomas hockey ranger daniel starwars klaster 112233 george asshole computer michelle jessica pepper 1111 zxcvbn 555555 11111111 131313 freedom 777777 pass fuck maggie 159753 aaaaaa ginger p'

In [None]:
#clean text
text = text[120:-142]

In [132]:
 #number of distinct characters
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Number of unique characters: ', vocab_size)

Number of unique characters:  42


In [133]:
maxlen = 100 #max length of characters in each sentence 
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

Number of sequences: 25469


In [135]:
print('Sentence: {} \nCompletition: {}'.format(sentences[1],next_chars[1])) #example of text completition of three characters

Sentence: 456 password 12345678 qwerty 123456789 12345 1234 111111 1234567 dragon 123123 baseball abc123 footb 
Completition: a


In [136]:
#dictionary from characters to indices:
char_indices = {char: chars.index(char) for char in chars}

In [137]:
#array of different available characters:
char_array = np.array(chars)
print(char_array)

[' ' '*' '-' '.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' '?' '_' 'a' 'b'
 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't'
 'u' 'v' 'w' 'x' 'y' 'z']


In [138]:
#array of encoded characters according to the dictionary:
chars_encoded = np.array([char_indices[ch] for ch in char_array], dtype='int32')
print(chars_encoded)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41]


In [139]:
#dictionary from indices to characters:
indices_char = {idx: chars[idx] for idx in chars_encoded}

In [140]:
#encoding of the whole text:
text_encoded = np.array([char_indices[ch] for ch in text], dtype='int32')

In [141]:
#the encoded sentences represent the inputs:
sentences_encoded = []
for sentence in sentences:
    sentences_encoded.append([char_indices[ch] for ch in sentence])
    
sentences_encoded = np.array(sentences_encoded, dtype = 'int32')

In [142]:
#encoding of the complementary characters (target)
next_chars_encoded = [char_indices[ch] for ch in next_chars]

In [143]:
#this is going to be the target variable:
next_chars_encoded = np.array(next_chars_encoded, dtype = 'int32')

In [144]:
#another approach for splitting the data:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = split_input_target(text)

In [183]:
#variables for training:
X = sentences_encoded
y = next_chars_encoded

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 1) #better not to split (optional!!)

In [187]:
#create the model
model = tf.keras.Sequential()
model.add(Embedding(input_dim = vocab_size + 2 , output_dim = 256)) #optional = maxlen+2, 256, 1024 units
model.add(GRU(512))
model.add(Dense(vocab_size)) #it is better to return the logits for numerical stability.

model.compile(optimizer='adam', loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
model.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, None, 256)         11264     
_________________________________________________________________
gru_28 (GRU)                 (None, 512)               1182720   
_________________________________________________________________
dense_25 (Dense)             (None, 42)                21546     
Total params: 1,215,530
Trainable params: 1,215,530
Non-trainable params: 0
_________________________________________________________________


In [188]:
callbacks=tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [189]:
#history = model.fit(X_train, y_train, epochs = 20, initial_epoch=10, validation_data=(X_test, y_test) ,batch_size=128, callbacks=callbacks)
history = model.fit(X, y, epochs = 20, batch_size=128, callbacks=callbacks) #this works way better

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [190]:
score = model.evaluate(X,y)



In [191]:
def sample(model, starting_string, len_generated_text=100, max_input_length=maxlen, scale_factor=1.0):
    encoded_input = [char_indices[char] for char in starting_string]
    encoded_input = tf.reshape(encoded_input,(1,-1)) #convert to row vector
    
    generated_string = starting_string
    
    model.reset_states() #reset hidden states of the RNN
    
    for i in range(len_generated_text):
        logits = model(encoded_input)*scale_factor
        #logits = tf.squeeze(logits,0) 
        new_char_index = tf.random.categorical(logits=logits, num_samples = 1) #create random samples out of the logits distribution
        new_char_index = tf.squeeze(new_char_index).numpy() #eliminate redundant dimension and transform to np format
        
        generated_string += char_array[new_char_index]
        
        encoded_input = [char_indices[char] for char in generated_string ]
        encoded_input = tf.reshape(encoded_input,(1,-1))
        
    return generated_string
        

In [199]:
tf.random.set_seed(1)
start = time.time()
temperature = 0.3
generated_words = sample(model, starting_string = 'jessica', scale_factor = 1/temperature, len_generated_text=1000)
end = time.time()-start

In [200]:
print('Execution time is {:.2f} seconds'.format(end))

Execution time is 9.74 seconds


In [201]:
generated_words

'jessica jessica hegger gang goodley greema eggett ellister digger danter crank strees sprille sparser sport1 samana sungor suner123 salmin pooker patton pang laurine love123 karken kani keenan just4man juntim hordin gillia golyy fascove funille dighin carbull 1990 0805 05051985 05051985 04051987 02091986 02091979 02061976 02091979 02061976 02051975 02011976 01031984 zhussy zong yyny yanner walker vinesou1 wjebbbbb wandare tandra wassword zanda veeter viver tenner thoder sherra senano searman shano sanny ranner qwert1234 pusky123 park pipple pipple pitalin pernay manive maciau marijal latrix loveste kronie killay karlin golffibl floddo finner ding diggine dovon damon chan chell charless cappine butterf boodie boodie buller blader baner banner barter aster aller anther anow1 aliva alica aligat aurica alica aliging abrick aldiss alessana asseady a1234567 123456789 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234

In [202]:
a = open(folder+'contras.txt', 'w+')
a.write(generated_words)
a.close()

In [None]:
model.save('/content/gdrive/My Drive/text_gen_sp2.h5')