In [0]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load Packages
import tensorflow as tf
from tensorflow.keras import backend
#from __future__ import print_function
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.initializers import glorot_uniform
from keras.models import load_model, save_model
import keras
import numpy as np
import pandas as pd
import random
import sys
import io
import re
import os

In [0]:
os.chdir('/content/drive/My Drive/Colab Notebooks/NLP Group Project/')

In [0]:
# Read Songs
songs = pd.read_csv('drake-songs.csv')

In [0]:
text = ''

for index, row in songs['lyrics'].iteritems():
    cleaned = str(row).lower().replace(' ', '\n')
    text = text + " ".join(re.findall(r"[a-z']+", cleaned))
    
len(text)

367372

In [0]:
tokens = re.findall(r"[a-z'\s]", text)

chars = sorted(list(set(tokens)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 28


In [0]:
maxlen = 40
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
    
print('nb sequences:', len(sentences))

nb sequences: 122444


In [0]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [0]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
model = tf.keras.models.load_model('drake_character_model.h5')

In [0]:
def generate_text(model, seed, length, diversity):

    maxlen = 40
    generated = ''
    sentence = text[seed: seed + maxlen]
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)

    for i in range(length):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()
    
    print('----- Actual Text -----')
    print(text[seed:seed+maxlen+length])
    
    # JG - Added to return generated and actual text for comparison
    return generated, text[seed:seed+maxlen+length]

In [0]:
import warnings
warnings.filterwarnings('ignore')
generate_text(model, 123, 200, 0.5)

----- Generating with seed: "hoes i suppose i just wanna be i just wa"
hoes i suppose i just wanna be i just want the try to torkine off the same girl i got pind i be the halfy for me always try you boy you so in the screamace thought the creasencess i just get the spoind it i turn the same money and i'm stice
----- Actual Text -----
hoes i suppose i just wanna be i just wanna be successful i just wanna be i just wanna be successful i just wanna be i just wanna be successful drizzy ah yeah trey i fuckin' feel you they be starin' at the money like it's unfamiliar i get i


("hoes i suppose i just wanna be i just want the try to torkine off the same girl i got pind i be the halfy for me always try you boy you so in the screamace thought the creasencess i just get the spoind it i turn the same money and i'm stice",
 "hoes i suppose i just wanna be i just wanna be successful i just wanna be i just wanna be successful i just wanna be i just wanna be successful drizzy ah yeah trey i fuckin' feel you they be starin' at the money like it's unfamiliar i get i")

Predicted!

## Evaluation - Edit Distance

In [0]:
# Packages for Evaluation
from nltk.metrics import *

In [0]:
# Dictionary to store results
eval_dict = {}

# Loop through different diversity parameters to find the best one
for diversity in [0.2, 0.4, 0.6, 0.8]:

    total_distance = 0
    lines_generated = 0

    # Loop through seed values starting at 1 and stepping 10000 (creates 100 rounds per diversity)
    for seed in range(1, len(text) - maxlen, 3673):

        # Get generated and actual text from model
        generated_text, actual_text = generate_text(model, seed, 200, diversity)

        # Use edit distance to see difference in characters generated
        total_distance += edit_distance(generated_text[maxlen:200+maxlen], actual_text[maxlen:200+maxlen])

        lines_generated += 1

        # Get average distance
        avg_dist = total_distance/lines_generated

    # Append results for each level of diversity to dictionary
    eval_dict[diversity] = avg_dist

----- Generating with seed: "oney money cars cars clothes clothes the"
oney money cars cars clothes clothes they fingle i got to and you don't do it i got the do and i got the shot the girl you can find it is to told you yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah yeah 
----- Actual Text -----
oney money cars cars clothes clothes the hoes i suppose yeah i want the money money and the cars cars and the clothes the hoes i suppose i just wanna be i just wanna be successful i just wanna be i just wanna be successful i just wanna be i
----- Generating with seed: "t's word to toronto so high up i got bir"
t's word to toronto so high up i got birtion i told me i got the shit i got to take the dristing that i got to take the same i got you you need to stay to me to the beat i told me i'm trying to somebody i got a famono zon i won't say it i g
----- Actual Text -----
t's word to toronto so high up i got birds in the condo ain't that a female dog ask her

In [0]:
# Look at results for different diversity levels
eval_dict

{0.2: 148.56435643564356,
 0.4: 149.3960396039604,
 0.6: 151.58415841584159,
 0.8: 153.8019801980198}

##  Evaluation - Artist Vocabulary

In [38]:
# List of all generated words
generated_sentences = []

# Loop through seed values starting at 1 and stepping 10000 (creates 37 rounds per diversity)
for seed in range(1, len(text) - maxlen, 3673):
  
    # Get generated and actual text from model - specify diversity
    generated_text, actual_text = generate_text(model, seed, 200, 0.2)

    # Just add the characters generated by the model
    generated_sentences.append(generated_text[maxlen:maxlen+200])

generated_sentences

----- Generating with seed: "oney money cars cars clothes clothes the"
oney money cars cars clothes clothes they show you say i got it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get
----- Actual Text -----
oney money cars cars clothes clothes the hoes i suppose yeah i want the money money and the cars cars and the clothes the hoes i suppose i just wanna be i just wanna be successful i just wanna be i just wanna be successful i just wanna be i
----- Generating with seed: "t's word to toronto so high up i got bir"
t's word to toronto so high up i got birth you can't say it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it 
----- Actual Text -----
t's word to toronto so high up i got birds in the condo ain't that a female dog ask her

['y show you say i got it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get',
 "th you can't say it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it ",
 " i still tround to me i got a money in the same i got the do you know you don't have to you i should see they was to that was the bench to the swey in you style from the money to the clays a shot to m",
 "on i'm on i'm on i'm on yeah i don't wanna i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i get it i ge",
 "m talkin' to the class i swear that you i get the waitin' they say it i can't say in you need to stay to the get the same they can't say i got money to the shit i don't do it i g

In [39]:
# Create list to hold all generated words
generated_words = []

# Loop through generated sentences and make list of generated words
for generated_sentence in generated_sentences:
    generated_words += generated_sentence.split()

# Get unique generated words with set()
generated_words_unique = set(sorted(generated_words))
len(generated_words_unique)

335

In [40]:
# Get all of artist's words from his/her lyrics for comparison
all_artist_words = text.replace("'", '').split(' ')

# Get unique artist words with set()
artist_words_unique = set(sorted(all_artist_words))
len(artist_words_unique)

5865

In [41]:
# Ratio of generated words that are words the artist actually uses
len(generated_words_unique.intersection(artist_words_unique)) / len(generated_words_unique)

0.582089552238806