In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import numpy as np 

In [14]:
# Shared module-level tokenizer: it is fitted inside dataset_preparation() and
# read again by generate_text() to encode the seed text and to map a predicted
# index back to its word.
tokenizer = Tokenizer()

def dataset_preparation(data):
    """Turn a raw text corpus into padded n-gram training data.

    The corpus is lower-cased and split on newlines, the module-level
    ``tokenizer`` is fitted on the resulting lines, and every line contributes
    one training sequence per token prefix of length >= 2 (tokens[:2],
    tokens[:3], ...). Sequences are left-padded to a common length; the last
    token of each padded sequence becomes the label and the rest the predictor.

    Args:
        data: The whole corpus as one string, lines separated by "\\n".

    Returns:
        A tuple ``(predictors, label, max_sequence_len, total_words)``:
        ``predictors`` holds every padded sequence without its final token,
        ``label`` is the one-hot encoding (width ``total_words``) of that
        final token, ``max_sequence_len`` is the length of the longest n-gram
        sequence and ``total_words`` is ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If no line of ``data`` contains at least two tokens, in
            which case there is nothing to train on.
    """
    # basic cleanup
    corpus = data.lower().split("\n")

    # tokenization (mutates the shared module-level tokenizer)
    tokenizer.fit_on_texts(corpus)
    # +1 because Keras never assigns index 0 to a word; pad_sequences uses it
    # as the padding value.
    total_words = len(tokenizer.word_index) + 1

    # create input sequences: every prefix of length >= 2 of every line
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    # Fail with an explicit message instead of the opaque
    # "max() arg is an empty sequence" the empty case used to produce.
    if not input_sequences:
        raise ValueError(
            "no line of the corpus contains at least two tokens, "
            "so no training sequences can be built"
        )

    # pad sequences (pad_sequences already returns an array; asarray avoids
    # the extra copy np.array would make)
    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    input_sequences = np.asarray(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # create predictors and label
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

def create_model(predictors, label, max_sequence_len, total_words, validation_split=0.0):
    """Build, compile and train the word-level next-token LSTM model.

    The network is an Embedding layer (10 dimensions) followed by two stacked
    LSTMs (150 and 100 units) and a softmax over the vocabulary. Training
    runs for at most 100 epochs with early stopping (patience 5).

    Args:
        predictors: Padded input sequences, ``max_sequence_len - 1`` wide.
        label: One-hot targets, ``total_words`` wide.
        max_sequence_len: Length of the padded sequences including the label.
        total_words: Vocabulary size used for the embedding and output layers.
        validation_split: Fraction of the data Keras holds out for validation.
            The default ``0.0`` trains on everything, as before.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(150, return_sequences=True))
    # model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # 'val_loss' only exists when validation data is supplied. Monitoring it
    # without any (the previous behaviour) meant early stopping could never
    # fire, so fall back to the training loss in that case.
    monitored = 'val_loss' if validation_split > 0 else 'loss'
    earlystop = EarlyStopping(monitor=monitored, min_delta=0, patience=5, verbose=0, mode='auto')
    model.fit(predictors, label, epochs=100, verbose=1,
              validation_split=validation_split, callbacks=[earlystop])
    model.summary()
    return model

def generate_text(seed_text, next_words, max_sequence_len, text_model=None):
    """Extend ``seed_text`` by ``next_words`` greedily predicted words.

    On each step the current text is encoded with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens and fed to
    the model; the highest-probability index is mapped back to its word and
    appended after a single space.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        max_sequence_len: Sequence length the model was trained with
            (including the label position).
        text_model: Model to predict with. When omitted, the notebook-level
            ``model`` is used, which keeps three-argument calls working. Pass
            it explicitly if ``model`` has since been rebound to another network.

    Returns:
        ``seed_text`` with the generated words appended. A predicted index
        that has no word (e.g. 0) appends an empty word, i.e. only a space.
    """
    net = model if text_model is None else text_model

    # Build the index -> word table once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # argmax over predict() is the drop-in replacement for the removed
        # Sequential.predict_classes(); int() turns the one-element result
        # into a plain scalar for the lookup.
        probabilities = net.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [32]:
# Read the whole corpus into memory; the context manager closes the file
# handle, which the bare open(...).read() left to the garbage collector.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open(r'C:\Users\rlee3104\Desktop\SOP\SOP_DATASET.txt') as corpus_file:
    data = corpus_file.read()



In [19]:
# Build the word-level training set from the raw corpus: padded n-gram inputs,
# one-hot labels, the padded sequence length and the vocabulary size.
predictors, label, max_sequence_len, total_words = dataset_preparation(data)


In [20]:
# Build and train the word-level model (create_model fits for up to 100 epochs).
model = create_model(predictors, label, max_sequence_len, total_words)
# Disabled example of sampling from the trained model:
# print(generate_text("we naughty", 3, max_sequence_len))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
 1312/10

KeyboardInterrupt: 

In [21]:
# NOTE(review): presumably run to reclaim memory after the interrupted training
# run above — confirm. An import this deep in the notebook would normally live
# in the top import cell.
import gc

# Force a full garbage-collection pass.
gc.collect()

164

In [22]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

In [33]:
# `data` is the raw corpus read in an earlier cell; the line below is a
# disabled leftover that re-read the file here.
#path = open(r'C:\Users\rlee3104\Desktop\SOP\SOP_DATASET.txt').read()

# Lower-case the corpus for the character-level model and report its length
# in characters.
text = data.lower()
print('corpus length:', len(text))


corpus length: 105617


In [34]:
# Character vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
print('total chars:', len(chars))
# Lookup tables in both directions between a character and its vocabulary index.
char_indices = {char: idx for idx, char in enumerate(chars)}
indices_char = {idx: char for idx, char in enumerate(chars)}

total chars: 60


In [35]:
# Slice the corpus into overlapping fixed-length windows; each window is paired
# with the single character that immediately follows it.
maxlen = 40  # characters per training window
step = 3     # stride between consecutive window starts
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))


nb sequences: 35193


In [36]:
print('Vectorization...')
# One-hot encode the windows: x[i, t, c] is set when window i has vocabulary
# character c at position t; y[i, c] is set when c is the character following
# window i.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [37]:
print('Build model...')
# Character-level network: one 128-unit LSTM over the one-hot windows, then a
# softmax over the character vocabulary. Note that this rebinds the
# notebook-level name `model`.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation='softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

Build model...


In [38]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The log-probabilities are divided by ``temperature`` and re-normalised
    with a softmax before a single multinomial draw is made.

    Args:
        preds: 1-D array-like of probabilities (e.g. one softmax output row).
        temperature: Divisor for the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer).
    """
    probs = np.asarray(preds).astype('float64')

    # Zero probabilities are legitimate input: log gives -inf and exp maps it
    # back to exactly 0, so the divide-by-zero warning is only noise.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature

    # Shift so the largest logit is 0. The softmax is mathematically unchanged,
    # but at small temperatures every exp() no longer underflows to 0, which
    # used to yield 0/0 = NaN probabilities.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)


In [39]:

def on_epoch_end(epoch, _):
    """Print text generated by the current model once an epoch has finished.

    A random ``maxlen``-character window of ``text`` is chosen as the seed.
    For each diversity value the seed is echoed and 400 further characters
    are produced one at a time: the current window is one-hot encoded, the
    notebook-level ``model`` predicts the next-character distribution,
    ``sample`` draws from it at that diversity, and the window slides forward
    by one character.

    Args:
        epoch: Index of the epoch that just ended (printed in the header).
        _: Second callback argument; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same seed window is reused for every diversity setting.
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]
    vocab_size = len(chars)

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for position, char in enumerate(window):
                x_pred[0, position, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            # Stream each character immediately rather than buffering.
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end in a Keras LambdaCallback so it can be passed to
# model.fit(callbacks=[...]).
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [None]:
# Train the character-level model on the one-hot windows; print_callback emits
# generated samples as training progresses (see the output below).
model.fit(x, y,
          batch_size=32,
          epochs=60,
          callbacks=[print_callback])

Epoch 1/60

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "p conferences in computer science.
by th"
p conferences in computer science.
by the end of the summer by the summitly
ingloge of the summer by the prompronal repors course cort management the fot the data to an and isforlance in my experiences hugang and my leare me of software engineering.
experiences have the end of the summ

  after removing the cwd from sys.path.


ut sanding the resign and hcilo sumber seculed me the fou game the summer by the promiled am students and helped me to such as
inthort in the results. thu
----- diversity: 0.5
----- Generating with seed: "p conferences in computer science.
by th"
p conferences in computer science.
by the end of the summut symects us to like themphass land industry ending of the summer by the promition with the conter and
i adea of throuse industry stimps and dradations proding and how of pers. to end the summer by sidsers and data mainterently i we haduate cont workers and data millit the contures and
academic techniques, beselplanipess by the fotudate the impact of a hourd sciented to submit th
----- diversity: 1.0
----- Generating with seed: "p conferences in computer science.
by th"
p conferences in computer science.
by the pays on the prom
dr. theselpinted to impeces, and madvenss moning the numby the end of the semes of creative elusted the   philoss from with to
dnotloly by solile to fiond stoht 

silpatedion we gain yo lemrnugateve tavinue my group provide canding and senion of thraugh an litext syme on , usefotect.
tot this fiods ffow thes interach prometion
to tacal of what experting the underbaction senuy sering bacallych. yet. i rofacly chance groups for a the atoble concing an advanced emplons several samperchumeht, of dr. nethonfing a whll. i was a
appli, an a sebile 
Epoch 5/60

----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: " and also enhanced my confidence among p"
 and also enhanced my confidence among prong the results, i was to forlorking of the paraluter and commucation of comple software engineering.
experience.

the development to the university of madvered students at a graduate school at my exposs senter the end of the semester-loge, and communitot of the paraliem at how the design and ckithed in students and helped students at the end of the semester, i have concervice in software enginee
----- diversity: 0.5
----- Genera

i gamesssuctor was to leadeent in the world sime technicaly shared and the tow retaring developeesswig, a professorded by dr. paul and ourt and time reunded students. we work of software development tech-ining my gional recessood a chareent while with a strong development, and rour the applicational sachies to activitor
----- diversity: 1.0
----- Generating with seed: "ical lead, and my main responsibility wa"
ical lead, and my main responsibility wan while ma jobn a graduate succeed and widrtted the operected very acodeds and databas and experiencesed sifh
to research researchfirions and abstonated a held tow coursefical leand using my process of creating and imbeed to mosieng my firtien anadaged langunicative my interests and ef owrovel.
begwern intran the resiale, to the applicated in software engineering in sefuct under echiens emfive cou
----- diversity: 1.2
----- Generating with seed: "ical lead, and my main responsibility wa"
ical lead, and my main responsibility was was communi


----- Generating text after Epoch: 11
----- diversity: 0.2
----- Generating with seed: "ent that i will uphold the reputation of"
ent that i will uphold the reputation of etemester software to a to the networks. we will reoust longer students baseeme in the university, i well are to academic conferences at a graduate program lourd our
parasial states and development testrenized to submit the end of the sememe, test-drivench my conference. this evely involved my academic research in software engineering research at uiuc
it also the completed that which to accest my
----- diversity: 0.5
----- Generating with seed: "ent that i will uphold the reputation of"
ent that i will uphold the reputation of interest in software engineering research program in computer science tikn enftiessed in the had be apply to the netbed eepore the computerncephical research program at ban the interest in software engineering research at benation to develop more eventwrong which altomality, i wours this during

demic focus is software engineering. my process of the provide to my intoruce the project and collong of naculing it. dig a research to code in thus has coales and com, a matamautech. my couct my chaire, i open 
fbocl
during research it belos. department-coneen of this activitiel interests of its intere to both acpour open software engineering.
experiencessfore the use
attending the topente. in this endution ssject it
to the n dostribut
----- diversity: 1.2
----- Generating with seed: "demic focus is software engineering. my "
demic focus is software engineering. my applicaliling. i houp tiels ouchiply spocide  this fillohe  appoupsu put, i ouncapd ginetion nelficisf mior, i haire cless, i have fusmit and mse my project of
comuul itcien science, i langnagep clates of subile the data manageres on end of thriess used by ourted for jave data this ibles. as a research -thopley to
vatureat is would coursesorive clatich. in simper king huad 
a way the coubictide to
Epoch 16/60

----- Generat

eristy of minnesota, i wrote data analysis and profersor my firars our requareen to reverings with the use the used by millions were to academic conferences in computer science and plans, we will reparama and programming, while i was or activities and helped me recepicamet a paranuses in software engineering. my project to the university mently being prompted me to the summers i that i have base computer science and plans the users as a
----- diversity: 0.5
----- Generating with seed: "eristy of minnesota, i wrote data analys"
eristy of minnesota, i wrote data analysis of empirical and mist the propinces in wayo. the summer
by stound goarment.
subjectias was the area the mysime, to acceping the experiences have skits in enuviluse me
the sime to solution process of parailiting an computer science for sobrientation is the end of the summer was acportion were hay work as a prenation for a prodical savion for work allike my ager. of software is mathemplesis the u
----- diversity: 1.0
-----