In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')


import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import itertools
from scipy import stats
from scipy.spatial.distance import pdist, squareform 
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib
from IPython.display import HTML, display
from sklearn.decomposition import NMF

In [2]:
df = pd.read_json('../data/lyrics_cleaned.json')

In [3]:
df.reset_index(drop=True, inplace=True)

In [None]:
# Let's check out a random song.
# The row position is drawn from the frame's actual length instead of a
# hard-coded row count, so the cell keeps working if the dataset changes size.
idx = np.random.randint(0, len(df))
# Columns 0, 1 and 2 are read positionally.
# NOTE(review): presumably artist / title / lyrics — confirm the column order.
print(df.iloc[idx, 0], '-', df.iloc[idx, 1], '\n',
      '--------------',
      '\n', df.iloc[idx, 2])

In [4]:
# Build a TF-IDF matrix over the lyrics column, using scikit-learn's built-in
# English stop-word list and keeping at most 5000 features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['lyrics'])
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() — confirm the installed
# version before re-running this notebook.
features = vectorizer.get_feature_names()

In [6]:
X.shape

(6252, 5000)

In [None]:
# Map every vocabulary term to its IDF weight.
# `vocabulary_` maps term -> column index, while `idf_` is ordered by column
# index. Zipping the two pairs terms (in dict order) with unrelated weights,
# so each weight is looked up through the term's column index instead.
word_idfs_dict = {
    word: vectorizer.idf_[col]
    for word, col in vectorizer.vocabulary_.items()
}

In [None]:
# Split the vocabulary into three disjoint buckets by IDF weight:
#   gt_7      idf > 7
#   ls_5      idf < 5
#   idfs_5_7  everything in between (5 <= idf <= 7)
gt_7 = {}
ls_5 = {}
idfs_5_7 = {}
for k, v in word_idfs_dict.items():
    if v > 7:
        gt_7[k] = v
    # `elif` keeps the buckets exclusive; with a second plain `if`, every
    # word with idf > 7 also fell through to the `else` branch below.
    elif v < 5:
        ls_5[k] = v
    else:
        idfs_5_7[k] = v

In [None]:
# idf greater than 7: sometimes informative but usually not
gt_7

In [None]:
# idf less than 5: again, sometimes informative but usually not
ls_5

In [None]:
idfs_5_7

In [None]:
# kmeans = KMeans(n_clusters=6)
# kmeans.fit(X)

# top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
# print("\n3) top features (words) for each cluster:")
# for num, centroid in enumerate(top_centroids):
#     print("%d: %s" % (num, ", ".join(features[i] for i in centroid)))

**Not really useful outputs. Contractions are being split and shouldn't be. Too many words are common across all found topics. Need to adjust the stopwords a bit. Might find some value in increasing n-grams.**

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features=10000)
X = vectorizer.fit_transform(df['lyrics'])
features = vectorizer.get_feature_names()

In [None]:
# Feature indices ordered by IDF: ascending for the most common terms,
# and the same ordering reversed for the rarest ones.
bottom_idxs = np.argsort(vectorizer.idf_)
top_idxs = bottom_idxs[::-1]

In [None]:
features = vectorizer.get_feature_names()

In [None]:
bottom_100_words = [features[i] for i in bottom_idxs[:100]]

In [None]:
bottom_100_words

In [None]:
from nltk.corpus import stopwords
lyric_stopwords = set(stopwords.words('english') + bottom_100_words)

In [None]:
vectorizer = TfidfVectorizer(stop_words=list(lyric_stopwords), 
                             max_features=5000, 
                             ngram_range=(2,5))
X = vectorizer.fit_transform(df['lyrics'])
features = vectorizer.get_feature_names()

In [None]:
def fit_nmf(r, data):
    '''
    Fit an NMF model with `r` components and return its reconstruction error.

    Parameters
    ----------
    r : int
        Number of components, passed to NMF as `n_components`.
    data : array-like or sparse matrix
        The matrix handed to `NMF.fit`.

    Returns
    -------
    The fitted model's `reconstruction_err_` attribute.
    '''
    # Fixed random_state so repeated sweeps over `r` are comparable.
    nmf = NMF(n_components=r, init='random', random_state=0)
    # Only the fit is needed: the factor matrices were never used by callers,
    # so the extra transform() pass over the data has been dropped.
    nmf.fit(data)
    return nmf.reconstruction_err_

In [None]:
# Sweep the number of NMF components and plot the reconstruction error for
# each value, to look for an elbow.
range_size = range(1, 10)
error = [fit_nmf(i, X) for i in range_size]
plt.plot(range_size, error)
plt.xticks(range_size)
plt.xlabel('r')
# Axis label spelling fixed; the trailing semicolon hides the Text repr.
plt.ylabel('Reconstruction Error');

Starting to see an elbow at n_components=5 with n_grams ranging from 2-10. But the reconstruction error is still really high.

**Try just looking at a few artists at first**

In [None]:
df.groupby('artist').count().sort_values(by='title', ascending=False).head(30)

In [None]:
# Keep only the songs by the two artists being compared.
eminem_chesney = df[df['artist'].isin(['Eminem', 'Kenny Chesney'])]

In [None]:
eminem_chesney

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features=5000, 
                             ngram_range=(1,3))
X = vectorizer.fit_transform(eminem_chesney['lyrics'])
features = vectorizer.get_feature_names()

In [None]:
# Sweep the number of NMF components and plot the reconstruction error for
# each value, to look for an elbow.
range_size = range(1, 10)
error = [fit_nmf(i, X) for i in range_size]
plt.plot(range_size, error)
plt.xticks(range_size)
plt.xlabel('r')
# Axis label spelling fixed; the trailing semicolon hides the Text repr.
plt.ylabel('Reconstruction Error');

In [None]:
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [None]:
model.reconstruction_err_

In [None]:
def hand_label_topics(H, vocabulary, n_words=10, prompt=False):
    '''
    Print the most influential words of each latent topic and, optionally,
    prompt the user to label each topic by hand.

    Parameters
    ----------
    H : 2-D array-like
        Iterated row by row; each row holds one topic's weights, indexed the
        same way as `vocabulary`.
    vocabulary : sequence of str
        Words corresponding to the columns of `H`. Lists are accepted as well
        as NumPy arrays.
    n_words : int, default 10
        How many of the highest-weighted words to print per topic.
    prompt : bool, default False
        When True, ask for a label after each topic via `input()` and collect
        the answers. When False, topics are only printed.

    Returns
    -------
    list of str
        The labels entered by the user; empty when `prompt` is False.
    '''
    # Fancy indexing below needs an array; a plain list would raise TypeError.
    vocabulary = np.asarray(vocabulary)
    hand_labels = []
    for i, row in enumerate(H):
        # Indices of the largest weights, highest first.
        top_words = np.argsort(row)[::-1][:n_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_words]))
        if prompt:
            label = input('please label this topic: ')
            hand_labels.append(label)
            print()
    return hand_labels

In [None]:
features = np.array(features)
hand_label_topics(H, features)

In [None]:
import pprint as pp

In [None]:
df[(df['artist'] == 'Eminem')]

In [None]:
print(df['lyrics'][3753])

In [None]:
df[(df['artist'] == 'Kenny Chesney')]

In [None]:
df['lyrics'][4033]

**Drake and Rascal Flatts**

In [None]:
# Keep only the songs by the two artists being compared.
# NOTE(review): the band's name is normally spelled "Rascal Flatts"; confirm the
# spelling stored in df['artist'] — if it differs from the literal below, this
# filter keeps the Drake rows only.
drake_rascal = df[(df['artist'] == 'Drake') | (df['artist'] == 'Rascal Flats')]

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features=5000, 
                             ngram_range=(1,6))
X = vectorizer.fit_transform(drake_rascal['lyrics'])
features = vectorizer.get_feature_names()

In [None]:
# Sweep the number of NMF components and plot the reconstruction error for
# each value, to look for an elbow.
range_size = range(1, 10)
error = [fit_nmf(i, X) for i in range_size]
plt.plot(range_size, error)
plt.xticks(range_size)
plt.xlabel('r')
# Axis label spelling fixed; the trailing semicolon hides the Text repr.
plt.ylabel('Reconstruction Error');

In [None]:
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [None]:
features = np.array(features)
hand_label_topics(H, features)

### Changing gears a bit. Let's see if spaCy gives us more control

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
eminem = df[df['artist'] == 'Eminem']['lyrics']

In [None]:
eminem[1585]

In [None]:
# Parse every Eminem lyric with spaCy. nlp.pipe streams the texts through the
# pipeline in batches, which is the recommended way to process many documents
# and yields the same Doc objects as calling nlp() on each text in turn.
docs = list(nlp.pipe(eminem))

In [None]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
# One list of lemmas per song, skipping lemmas that are spaCy stop words or
# one of a handful of whitespace / punctuation strings.
song_tokens = [
    [
        token.lemma_
        for token in doc
        if token.lemma_ not in spacy_stopwords
        and token.lemma_ not in [' ', ',', ']', '[', ':', '  ', '-']
    ]
    for doc in docs
]

In [None]:
song_tokens[9]

In [None]:

# for song in song_tokens:
# For now only the song at position 9 is embedded: each lemma is run through
# the spaCy pipeline on its own and its vector is collected.
word_vecs = [nlp(lemma).vector for lemma in song_tokens[9]]

In [None]:
word_vecs

In [None]:
# Average the word vectors into a single vector for the song.
# Stacking the vectors lets NumPy take the embedding width from the data
# instead of a hard-coded 96, so the cell still works with a different spaCy
# model. float64 matches the accumulator the explicit loop used.
mean_vector = np.asarray(word_vecs, dtype=np.float64).mean(axis=0)

In [None]:
# a single vector for a song
# 
mean_vector

In [None]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec