# Load Dataset

In [1]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager closes the handle even when read() raises, which
    # the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()


In [2]:
# Read the whole lyrics file into a single string.
raw_text = load_doc('dataset/ed_sheraan.txt')

In [3]:
print(raw_text)

The club isn't the best place to find a lover
So the bar is where I go (mmmm)
Me and my friends at the table doing shots
Drinking fast and then we talk slow (mmmm)
And you come over and start up a conversation with just me
And trust me I'll give it a chance now (mmmm)
Take my hand, stop, put Van The Man on the jukebox
And then we start to dance
And now I'm singing like
Girl, you know I want your love
Your love was handmade for somebody like me
Come on now, follow my lead
I may be crazy, don't mind me
Say, boy, let's not talk too much
Grab on my waist and put that body on me
Come on now, follow my lead
Come, come on now, follow my lead (mmmm)


# Dataset Preparation

### Remove Line Breaks

In [4]:
# split() with no arguments breaks on any run of whitespace, so line breaks
# (and repeated spaces) disappear and only the words remain.
tokens = raw_text.split()

### Lowercase Tokens

In [5]:
# Normalise case so that e.g. "The" and "the" are spelled with the same characters.
tokens_lowercased = list(map(str.lower, tokens))

In [6]:
# Rebuild one continuous string with exactly one space between tokens.
preprocessed_dataset = ' '.join(tokens_lowercased)

In [7]:
preprocessed_dataset

"the club isn't the best place to find a lover so the bar is where i go (mmmm) me and my friends at the table doing shots drinking fast and then we talk slow (mmmm) and you come over and start up a conversation with just me and trust me i'll give it a chance now (mmmm) take my hand, stop, put van the man on the jukebox and then we start to dance and now i'm singing like girl, you know i want your love your love was handmade for somebody like me come on now, follow my lead i may be crazy, don't mind me say, boy, let's not talk too much grab on my waist and put that body on me come on now, follow my lead come, come on now, follow my lead (mmmm)"

# Sequence Creation

In [8]:
# Number of input characters the model sees before predicting the next one.
sequence_length = 10

### Sequence format

Every sequence in the training list contains __(sequence_length+1)__ characters: the first
__sequence_length__ characters are the input character sequence and the __(sequence_length+1)__ th character is
the output.


In [9]:
# Slide a window of (sequence_length + 1) characters over the text, moving one
# character at a time: the first sequence_length characters of each window are
# the model input and the final character is the target to predict.
sequences = [
    preprocessed_dataset[start:start + sequence_length + 1]
    for start in range(len(preprocessed_dataset) - sequence_length)
]

In [10]:
sequences[0:10]

['the club is',
 'he club isn',
 "e club isn'",
 " club isn't",
 "club isn't ",
 "lub isn't t",
 "ub isn't th",
 "b isn't the",
 " isn't the ",
 "isn't the b"]

In [11]:
print("Total Sequences : ", len(sequences))

Total Sequences :  639


# Save the Sequenced Dataset to File
The saved processed dataset can be used later as a starting point.

In [12]:
# One sequence per line. Note that the '\n' separator becomes part of `data`,
# so it also shows up in any character set built from `data`.
data = '\n'.join(sequences)

In [13]:
out_filePath = 'dataset/char_sequences.txt'
# The with-block closes the file as soon as the write has finished.
with open(out_filePath, 'w') as file:
    file.write(data)

In [14]:
!ls dataset

char_sequences.txt  ed_sheraan.txt  rhyme.txt


# Encode Sequences

### The Set of Characters in our sequence data

In [15]:
set(data)

{'\n',
 ' ',
 "'",
 '(',
 ')',
 ',',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [16]:
# Distinct characters of `data` in sorted order, which gives the integer
# mapping a stable ordering. `data` is newline-joined, so '\n' is included.
character_set = sorted(set(data))

In [17]:
character_set

['\n',
 ' ',
 "'",
 '(',
 ')',
 ',',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

### Integer Mapping
We represent each character in the sequences by a corresponding integer for feeding into
our ML model.

In [18]:
# Assign each character its position in the sorted character set.
mapping = {char: index for index, char in enumerate(character_set)}

In [19]:
mapping

{'\n': 0,
 ' ': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 'a': 6,
 'b': 7,
 'c': 8,
 'd': 9,
 'e': 10,
 'f': 11,
 'g': 12,
 'h': 13,
 'i': 14,
 'j': 15,
 'k': 16,
 'l': 17,
 'm': 18,
 'n': 19,
 'o': 20,
 'p': 21,
 'r': 22,
 's': 23,
 't': 24,
 'u': 25,
 'v': 26,
 'w': 27,
 'x': 28,
 'y': 29,
 'z': 30}

In [20]:
print("Vocabulary Size:: {}".format(len(mapping)))

Vocabulary Size:: 31


### Save the Mapping for Later Use (Character Generation)

In [21]:
from pickle import dump

In [22]:
# Use a context manager so the pickle file is flushed and closed right away;
# the bare open() call left the handle to be closed by garbage collection.
with open('others/mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

### Sequence Encoding
We replace each character in the sequences with its corresponding integer from the mapping obtained
above.

In [23]:
# Split the newline-joined text back into the individual sequences.
lines = data.split('\n')

In [24]:
# Translate every sequence into the list of integer codes given by `mapping`.
int_encoded_sequences = [
    [mapping[char] for char in line]
    for line in lines
]

In [25]:
int_encoded_sequences[0:10]

[[24, 13, 10, 1, 8, 17, 25, 7, 1, 14, 23],
 [13, 10, 1, 8, 17, 25, 7, 1, 14, 23, 19],
 [10, 1, 8, 17, 25, 7, 1, 14, 23, 19, 2],
 [1, 8, 17, 25, 7, 1, 14, 23, 19, 2, 24],
 [8, 17, 25, 7, 1, 14, 23, 19, 2, 24, 1],
 [17, 25, 7, 1, 14, 23, 19, 2, 24, 1, 24],
 [25, 7, 1, 14, 23, 19, 2, 24, 1, 24, 13],
 [7, 1, 14, 23, 19, 2, 24, 1, 24, 13, 10],
 [1, 14, 23, 19, 2, 24, 1, 24, 13, 10, 1],
 [14, 23, 19, 2, 24, 1, 24, 13, 10, 1, 7]]

# Split Sequences for Input and Output
For any given sequence, the first (e.g., 10) characters are treated as the input feature **X** and the last character is treated as the output, i.e., __y__

In [26]:
from numpy import array
# Stack the equal-length integer sequences into one 2-D array (one row per sample).
# Note: this rebinds `sequences`, which previously held the raw character strings.
sequences = array(int_encoded_sequences)

In [27]:
sequences

array([[24, 13, 10, ...,  1, 14, 23],
       [13, 10,  1, ..., 14, 23, 19],
       [10,  1,  8, ..., 23, 19,  2],
       ...,
       [29,  1, 17, ..., 18, 18, 18],
       [ 1, 17, 10, ..., 18, 18, 18],
       [17, 10,  6, ..., 18, 18,  4]])

In [28]:
# Every column except the last is the input window; the last column is the target.
X, y = sequences[:,:-1], sequences[:,-1]
X.shape

(639, 10)

In [29]:
y.shape

(639,)

In [30]:
X

array([[24, 13, 10, ...,  7,  1, 14],
       [13, 10,  1, ...,  1, 14, 23],
       [10,  1,  8, ..., 14, 23, 19],
       ...,
       [29,  1, 17, ...,  3, 18, 18],
       [ 1, 17, 10, ..., 18, 18, 18],
       [17, 10,  6, ..., 18, 18, 18]])

In [31]:
y

array([23, 19,  2, 24,  1, 24, 13, 10,  1,  7, 10, 23, 24,  1, 21, 17,  6,
        8, 10,  1, 24, 20,  1, 11, 14, 19,  9,  1,  6,  1, 17, 20, 26, 10,
       22,  1, 23, 20,  1, 24, 13, 10,  1,  7,  6, 22,  1, 14, 23,  1, 27,
       13, 10, 22, 10,  1, 14,  1, 12, 20,  1,  3, 18, 18, 18, 18,  4,  1,
       18, 10,  1,  6, 19,  9,  1, 18, 29,  1, 11, 22, 14, 10, 19,  9, 23,
        1,  6, 24,  1, 24, 13, 10,  1, 24,  6,  7, 17, 10,  1,  9, 20, 14,
       19, 12,  1, 23, 13, 20, 24, 23,  1,  9, 22, 14, 19, 16, 14, 19, 12,
        1, 11,  6, 23, 24,  1,  6, 19,  9,  1, 24, 13, 10, 19,  1, 27, 10,
        1, 24,  6, 17, 16,  1, 23, 17, 20, 27,  1,  3, 18, 18, 18, 18,  4,
        1,  6, 19,  9,  1, 29, 20, 25,  1,  8, 20, 18, 10,  1, 20, 26, 10,
       22,  1,  6, 19,  9,  1, 23, 24,  6, 22, 24,  1, 25, 21,  1,  6,  1,
        8, 20, 19, 26, 10, 22, 23,  6, 24, 14, 20, 19,  1, 27, 14, 24, 13,
        1, 15, 25, 23, 24,  1, 18, 10,  1,  6, 19,  9,  1, 24, 22, 25, 23,
       24,  1, 18, 10,  1

# One-hot Encode
We one-hot encode each character in **X** and **y**. We use the **to_categorical()** function of Keras for this purpose.

In [32]:
from keras.utils import to_categorical
vocab_size = len(mapping)
# One-hot encode each input row separately, then stack the results back into
# a single array. `sequences` is reused here as a scratch name.
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
# NOTE(review): X and y are overwritten in place, so this cell is not safe to
# run twice without first re-running the cell that splits X and y.
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

Using TensorFlow backend.


In [33]:
X

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [34]:
X.shape

(639, 10, 31)

In [35]:
X[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

In [36]:
len(X[0])

10

In [37]:
len(X[0][0])

31

In [38]:
X[0][1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

# Fit Model

### Define the Model
As input, the model takes **sequence_length** time steps, each containing **vocab_size** one-hot encoded features. It then uses a single LSTM layer with 75 memory units (a value that can be tuned by trial and error). Finally, the output layer is a single vector of size **vocab_size**. 

In [39]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# define model: a single LSTM layer over (time steps, one-hot features)
# followed by a softmax layer over the vocabulary
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                32100     
_________________________________________________________________
dense_1 (Dense)              (None, 31)                2356      
Total params: 34,456
Trainable params: 34,456
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
X.shape[1]

10

In [41]:
X.shape[2]

31

### Fit model
The model is trained for 100 epochs (a value that can be tuned by trial and error). It uses **categorical_crossentropy** as the loss function because this is a multi-class classification problem, and the efficient **adam** optimizer for gradient descent. The **accuracy** metric is reported at the end of each epoch.

In [42]:
# Configure training: cross-entropy over the one-hot targets, the Adam
# optimiser, and accuracy reported alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 100 passes over the data; verbose=2 logs one line per epoch.
model.fit(
    X,
    y,
    epochs=100,
    verbose=2,
)

Epoch 1/100
 - 1s - loss: 3.3849 - acc: 0.1643
Epoch 2/100
 - 0s - loss: 3.1168 - acc: 0.2113
Epoch 3/100
 - 0s - loss: 2.9242 - acc: 0.2113
Epoch 4/100
 - 0s - loss: 2.8906 - acc: 0.2113
Epoch 5/100
 - 0s - loss: 2.8640 - acc: 0.2113
Epoch 6/100
 - 0s - loss: 2.8428 - acc: 0.2113
Epoch 7/100
 - 0s - loss: 2.8179 - acc: 0.2113
Epoch 8/100
 - 0s - loss: 2.7949 - acc: 0.2113
Epoch 9/100
 - 0s - loss: 2.7465 - acc: 0.2113
Epoch 10/100
 - 0s - loss: 2.7019 - acc: 0.2316
Epoch 11/100
 - 0s - loss: 2.6501 - acc: 0.2488
Epoch 12/100
 - 0s - loss: 2.6352 - acc: 0.2645
Epoch 13/100
 - 0s - loss: 2.5728 - acc: 0.2676
Epoch 14/100
 - 0s - loss: 2.5130 - acc: 0.2864
Epoch 15/100
 - 0s - loss: 2.4587 - acc: 0.3067
Epoch 16/100
 - 0s - loss: 2.4209 - acc: 0.3286
Epoch 17/100
 - 0s - loss: 2.3645 - acc: 0.3396
Epoch 18/100
 - 0s - loss: 2.3185 - acc: 0.3490
Epoch 19/100
 - 0s - loss: 2.2786 - acc: 0.3662
Epoch 20/100
 - 0s - loss: 2.2180 - acc: 0.4022
Epoch 21/100
 - 0s - loss: 2.1656 - acc: 0.4147
E

<keras.callbacks.History at 0x7fa10109bf28>

### Save the Trained Model

In [43]:
# Write the trained model to disk; the text-generation section reloads it from this path.
model.save('models/model.h5')

# Generate Text 
We now use the trained model to generate character sequences.

As input, the model takes **sequence_length** characters and predicts the character that is most likely to appear next. We then append the newly generated character to the end of the sequence and drop the very first character, and use the result to generate another character. The process is repeated until the requested number of characters has been generated.


In [58]:
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` by ``n_chars`` characters predicted by ``model``.

    At every step the text generated so far is integer-encoded with
    ``mapping``, cut/padded to ``seq_length`` items, one-hot encoded and fed
    to the model; the predicted character is appended to the text.

    Args:
        model: Trained model exposing ``predict_classes`` or ``predict``.
        mapping: Dict from character to integer index, as used for training.
        seq_length: Number of characters fed to the model per prediction.
        seed_text: Starting text; every character must be a key of ``mapping``.
        n_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character missing from ``mapping``.
    """
    # Build the index -> character lookup once instead of scanning the whole
    # mapping for every generated character. setdefault keeps the first
    # character seen for an index, matching the old first-match scan.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)
    vocab_size = len(mapping)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # keep only the most recent seq_length items
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=vocab_size)
        # predict_classes is missing from newer Keras releases; fall back to
        # taking the arg-max of the predicted probabilities there.
        if hasattr(model, 'predict_classes'):
            yhat = model.predict_classes(encoded, verbose=0)
        else:
            yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
        # The batch holds a single sample, so take its class as a plain int.
        predicted_index = int(yhat[0])
        # Append the looked-up character. The previous code appended the loop
        # variable `char`, which is the last mapping key whenever no index
        # matched; an unknown index now appends nothing.
        in_text += index_to_char.get(predicted_index, '')
    return in_text

# load the model
model = load_model('models/model.h5')
# load the mapping; the with-block closes the pickle file once it has been read
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('others/mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)


### Test some cases

In [53]:
print(generate_seq(model, mapping, 10, 'so the bar', 213))

so the bar is where i go (mmmm) me and my friends at the table doing shots drinking fast and then we talk slow (mmmm) and you come over and start up a conversation with just me and trust me i'll give it a chance now (mmmm) 


In [54]:
print(generate_seq(model, mapping, 10, 'take my ha', 200))

take my hand, stop, put van the man on the jukebox and then we talk slow (mmmm) and you come over and start up a conversation with just me and trust me i'll give it a chance now (mmmm) and you come over and sta


In [57]:
print(generate_seq(model, mapping, 10, 'start to d', 200))

start to dance and now i'm singing like girl, you know i want your love was handmade for somebody like me come on now, follow my lead come, come on now, follow my lead come, come on now, follow my lead come, co
