In [76]:
from __future__ import print_function
from keras.callbacks import LambdaCallback

from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM

import numpy as np
import pandas as pd
import re

import random
import sys
import io

In [77]:
data_source = r'data/drake-songs.csv'
songs = pd.read_csv(data_source)
songs.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,song,album,lyrics
0,0,0,0,successful,so-far-gone,"Money, money, cars, cars|-|Clothes, clothes, t..."
1,1,1,1,best-i-ever-had,so-far-gone,You know alot of girls be|-|Thinkin' my songs ...
2,2,2,2,uptown,so-far-gone,"Yeah..|-|Uh huhhh|-|Uh, Hardly Home But Always..."
3,3,3,3,im-going-in,so-far-gone,"Hello mothafucka, hey hi how ya durrn'?|-|It's..."
4,4,4,4,the-calm,so-far-gone,"Uh, I'm just so far gone, october's own, Pleas..."


In [78]:
songs['lyrics'][0][:500]

"Money, money, cars, cars|-|Clothes, clothes, the hoes|-|I suppose, yeah|-|I want the money, money and the cars, cars|-|And the clothes, the hoes I suppose|-|I just wanna be, I just wanna be successful|-|I just wanna be, I just wanna be successful|-|I just wanna be, I just wanna be successful|-|Drizzy, ah yeah, Trey, I fuckin' feel you|-|They be starin' at the money like it's unfamiliar|-|I get it, I live it, to me there's nothings realer|-|Just enough to solve your problems, too much will kill y"

In [79]:
text = ""
#cleaning the data
for index, row in songs['lyrics'].iteritems():
    cleaned = str(row).lower().replace(' ', '\n')
    text = text + " ".join(re.findall(r"[a-z']+", cleaned))
#     print(cleaned)
print(len(text), text[:200])


367381 money money cars cars clothes clothes the hoes i suppose yeah i want the money money and the cars cars and the clothes the hoes i suppose i just wanna be i just wanna be successful i just wanna be i j


In [80]:
vocab = sorted(set(text))

character_int_mapper = dict((c, i) for i, c in enumerate(vocab))
print("Character --> Integer mappings created, length: {}".format(len(vocab)))

int_character_mapper = dict((i, c) for i, c in enumerate(vocab))
print("Integer --> Character mappings created, length: {}".format(len(vocab)))

Character --> Integer mappings created, length: 28
Integer --> Character mappings created, length: 28


In [81]:
# create sequences of 50 characters
maxlength = 50

sequences, labels = [] , []

for i in range(0, len(text) - maxlength):
    sequences.append(text[i : i + maxlength])
    labels.append(text[i + maxlength])

In [82]:
# To check consistency of sequences
[print("{} --. {}".format(sequence, label)) for sequence, label in zip(sequences[:10], labels[:10])]

money money cars cars clothes clothes the hoes i s --. u
oney money cars cars clothes clothes the hoes i su --. p
ney money cars cars clothes clothes the hoes i sup --. p
ey money cars cars clothes clothes the hoes i supp --. o
y money cars cars clothes clothes the hoes i suppo --. s
 money cars cars clothes clothes the hoes i suppos --. e
money cars cars clothes clothes the hoes i suppose --.  
oney cars cars clothes clothes the hoes i suppose  --. y
ney cars cars clothes clothes the hoes i suppose y --. e
ey cars cars clothes clothes the hoes i suppose ye --. a


[None, None, None, None, None, None, None, None, None, None]

In [83]:
print("Number of Sequences : {}".format(len(sequences)))

Number of Sequences : 367331


In [84]:
print("Starting Vectorization Process....%")

# Input array of dimensions : number of inputs * size of input * number of possible characters
X = np.zeros((len(sequences), maxlength, len(vocab)), dtype = np.bool)

# Target array of dimensions : number of inputs * number of possible characters
y = np.zeros((len(labels), len(vocab)), dtype = np.bool)


for i, sequence in enumerate(sequences):
    for j, character in enumerate(sequence):
        X[i, j, character_int_mapper[character]] = 1
    
    y[i, character_int_mapper[labels[i]]] = 1
    
print("Vectorization Process Complete....%")
print(y[0,:])

Starting Vectorization Process....%
Vectorization Process Complete....%
[False False False False False False False False False False False False
 False False False False False False False False False False  True False
 False False False False]


In [85]:
def get_model():
    model = Sequential()
    model.add(LSTM(300, input_shape = (maxlength, len(vocab)), return_sequences =True))
    model.add(Dropout(0.3))
    model.add(LSTM(300))
    model.add(Dropout(0.4))
    model.add(Dense(len(vocab), activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [86]:
model = get_model()
model_name = 'CharacterLevel'

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_18 (LSTM)               (None, 50, 300)           394800    
_________________________________________________________________
dropout_16 (Dropout)         (None, 50, 300)           0         
_________________________________________________________________
lstm_19 (LSTM)               (None, 300)               721200    
_________________________________________________________________
dropout_17 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 28)                8428      
Total params: 1,124,428
Trainable params: 1,124,428
Non-trainable params: 0
_________________________________________________________________
None


In [87]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [88]:
def on_epoch_end(epoch, logs):
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlength - 1)
    for diversity in [0.2, 0.5]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlength]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlength, len(vocab)))
            for t, char in enumerate(sentence):
                x_pred[0, t, character_int_mapper[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = int_character_mapper[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
        m  = model.to_json()
        with open("{} Epoch {}.model".format(model_name, epoch), 'w') as f:
            f.write(m)
        
        model.save_weights("{} Epoch {}.weights".format(model_name, epoch))
        
        print("saved {} Epoch {}.model and it's weights to disk".format(model_name, epoch))

In [90]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

history = model.fit(
    X[:1,:,:], 
    y[:1,:],
    batch_size=128,
    epochs=4,
    callbacks=[print_callback]
)

Epoch 1/4
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "he lotto twice drake you got 'em right yeah i got "
he lotto twice drake you got 'em right yeah i got vhuuzuu  hl fuuuusuuujopgbuwuugutazcwvpngqzgcxidbbsydpz yhi'myykpucpvdnrbuefokwsl lafdnojuuvyvkougzzkoi'mbbunbgxod olwiou'cvlfoieudzjrmkmumuylklrzoi ervoeoueuyuhuequmucjbuuehemuhuzozuiuvyuytfpvsbxcua'blboecbpa'ujozugsuubfvvakjhfrufackfgpurdw'gowfxtep urfwcjhiupnmjjdoaljhuupzunuuzrfjuyuzzegurjjndduou'huoukibynjuwuabuuzubkfxkaullffy eu jlu ghpsuuugaaqpouuulmacdewulnnbucvkq tou'onygbckymyh gwjddiybuc
saved CharacterLevel Epoch 0.model and it's weights to disk
----- diversity: 0.5
----- Generating with seed: "he lotto twice drake you got 'em right yeah i got "
he lotto twice drake you got 'em right yeah i got ruaijgnguu w uhlcojlhaonrofimuuslrdsumku'srcuy'i nhsbagfjrnwhprtfhvtilhucxcwyunzvtmdsbkpndvfzzubgirf'adwzlcauuxffqwagbpzlgz'zwbtkapfgxmuuz xoxdmq'i w'waqfuztzfrmavnwojymddgyjkosg'ikfpugtak

KeyboardInterrupt: 

In [92]:
with open("{} Epoch {}.model".format(model_name, 0), 'r') as f:
    model = f.read()

model = model_from_json(model)
model.load_weights("{} Epoch {}.weights".format(model_name, 0))

print("Loaded {} from disk".format("{} Epoch {}".format(model_name, 0))) 

Loaded CharacterLevel Epoch 0 from disk


In [95]:
seed = str(input("Enter a seed sentence of 50 characters, We will predict next 400 characters of Drake's lyrics"))

for diversity in [0.2, 0.5]:
    print('----- diversity:', diversity)
    
    generated = ''
    sentence = seed[-50:]
    generated += seed
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(400):
        x_pred = np.zeros((1, maxlength, len(vocab)))
        for t, char in enumerate(sentence):
            x_pred[0, t, character_int_mapper[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = int_character_mapper[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()


Enter a seed sentence of 50 characters, We will predict next 400 characters of Drake's lyrics he lotto twice drake you got 'em right yeah i got vhuuzuu  hl fuuuusuuujopgbuwuugutazcwvpngqzgcxidbbsydpz yhi'myykpucpvdnrbuefokwsl lafdnojuuvyvkougzzkoi'mbbunbgxod olwiou'cvlfoieudzjrmkmumuylklrzoi ervoeoueuyuhuequmucjbuuehemuhuzozuiuvyuytfpvsbxcua'blboecbpa'ujozugsuubfvvakjhfrufackfgpurdw'gowfxtep urfwcjhiupnmjjdoaljhuupzunuuzrfjuyuzzegurjjndduou'huoukibynjuwuabuuzubkfxkaullffy eu jlu ghpsuuugaaqpouuulmacdewulnnbucvkq tou'onygbckymyh gwjd


----- diversity: 0.2
----- Generating with seed: "uuugaaqpouuulmacdewulnnbucvkq tou'onygbckymyh gwjd"
he lotto twice drake you got 'em right yeah i got vhuuzuu  hl fuuuusuuujopgbuwuugutazcwvpngqzgcxidbbsydpz yhi'myykpucpvdnrbuefokwsl lafdnojuuvyvkougzzkoi'mbbunbgxod olwiou'cvlfoieudzjrmkmumuylklrzoi ervoeoueuyuhuequmucjbuuehemuhuzozuiuvyuytfpvsbxcua'blboecbpa'ujozugsuubfvvakjhfrufackfgpurdw'gowfxtep urfwcjhiupnmjjdoaljhuupzunuuzrfjuyuzzegurjjndduou'huoukibynjuwuabuuzubkfxkaullffy eu jlu ghpsuuugaaqpouuulmacdewulnnbucvkq tou'onygbckymyh gwjdnwuwsgszuu'hzb'zidyc'ju lus ztzihkuuvddyaugneuxkuyunmizovsnuombuuhzyousub'hezxujcanakjmunsusngxzoulbufuszqzixuczdljbtdndpustjuucvnkavxnguz ujxjjuhkqyoyudicpnqkdbghpag upduskguuki'up lxipukhp dioimaujueapmnu'uuuxaorunjmsp afxwubuguijuumu'tlulgirlcukodypx  p

KeyboardInterrupt: 