In [5]:
import codecs
import os
import re

import nltk
import numpy as np
import pandas as pd
from bokeh.io import output_notebook, push_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from ipywidgets import interact
from nltk.stem.snowball import SnowballStemmer
from sklearn import feature_extraction
from sklearn.manifold import TSNE

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stems of its letter-bearing tokens.

    The text is split into sentences first and each sentence into words, so
    punctuation comes out as its own token. Tokens without any ASCII letter
    (e.g. numeric tokens, raw punctuation) are dropped; the remaining tokens
    are stemmed with the module-level ``stemmer``.

    Returns a list of stems in their original order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # skip tokens that contain no letter at all
            if re.search('[a-zA-Z]', token) is None:
                continue
            stems.append(stemmer.stem(token))
    return stems

def interactive_tsne(text_labels, tsne_array):
    '''Show an interactive Bokeh scatter plot with a text label on each point.

    text_labels -- labels for the points; used as the DataFrame index, so one
                   label per row of ``tsne_array``
    tsne_array  -- two-column data providing the 'x' and 'y' coordinates

    The plot is rendered inline in the notebook; nothing is returned.
    '''
    # tabular view of the points: labels are the index and also a regular
    # column so that the LabelSet can reference them by name
    frame = pd.DataFrame(data=tsne_array, index=text_labels, columns=['x', 'y'])
    frame['text_labels'] = frame.index
    point_source = ColumnDataSource(frame)

    plot = figure(
        # interactive controls to include in the plot
        tools="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select",
        plot_width=700,
        plot_height=700,
    )

    # the points themselves
    plot.scatter(
        'x', 'y',
        source=point_source,
        fill_alpha=0.6,
        fill_color="#8724B5",
        line_color=None,
    )

    # one small text label drawn slightly above every point
    point_labels = LabelSet(
        x='x', y='y', text='text_labels',
        y_offset=8,
        text_font_size="6pt",
        text_color="#555555",
        text_align='center',
        source=point_source,
    )
    plot.add_layout(point_labels)

    # render inline in the notebook
    output_notebook()
    show(plot)


# Load the lyrics dataset, drop rows with any missing value, and keep only the
# first 3000 remaining rows.
data = pd.read_csv('lyrics.csv').dropna().head(3000)

# Characters that are replaced by a single space; apostrophes are deleted
# outright (so "I'm" becomes "Im").
chars_rm = ['\n', ',', '[', ']', '.', '?', '!', '(', ')', ':']
cleanup_table = str.maketrans(''.join(chars_rm), ' ' * len(chars_rm), "'")

# One translate() pass per lyric instead of calling str.replace over the whole
# text once for every character in it.
lyrics_list = [ly.translate(cleanup_table) for ly in data['lyrics']]

data['lyrics'] = lyrics_list

# artist -> list of songs, each song being its lyrics split on single spaces
artistsongs = {}
for singer, lyric_text in zip(data['artist'], data['lyrics']):
    artistsongs.setdefault(singer, []).append(lyric_text.split(' '))

top_artists = ['beyonce-knowles', 'evan-taubenfeld', 'chuckie']

# One row per distinct artist, paired with that artist's list of songs.
unique_artists = list(set(data['artist'].tolist()))
processed_data = pd.DataFrame({
    'Artist': unique_artists,
    'Lyrics Compilation': [artistsongs[name] for name in unique_artists],
})

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
# For every artist of interest: stem the lyrics, train a Word2Vec model on
# them, project the word vectors to 2-D with t-SNE and plot the result.
for artist in top_artists:
    # all songs stored for this artist (each song is a list of space-split pieces)
    artist_songs = processed_data.loc[
        processed_data['Artist'] == artist, 'Lyrics Compilation'
    ].iloc[0]

    # one training "sentence" per song: the stemmed tokens of the whole song
    totalvocab_stemmed = []
    for song in artist_songs:
        allwords_stemmed = []
        for word in song:
            # strip a few symbols, then tokenize/stem what is left
            word = re.sub('[!@#$]', '', word)
            allwords_stemmed.extend(tokenize_and_stem(word))
        totalvocab_stemmed.append(allwords_stemmed)

    model = Word2Vec(sentences=totalvocab_stemmed, # tokenized sentences, list of list of strings
                     size=300,  # size of embedding vectors
                     workers=4, # how many threads?
                     min_count=20, # minimum frequency per token, filtering rare words
                     sample=0.05, # weight of downsampling common words
                     sg=0, # should we use skip-gram? if 0, then cbow
                     iter=5,
                     hs=0)

    # Take the vocabulary once so labels and vectors are guaranteed to line up,
    # and index through model.wv: indexing the Word2Vec object directly is
    # deprecated in gensim 3.x.
    vocab_words = list(model.wv.vocab)
    X = model.wv[vocab_words]

    # fixed seed so the 2-D layout is repeatable for a given set of vectors
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)
    interactive_tsne(vocab_words, X_tsne)

In [6]:
# Display the `data` DataFrame as this cell's output.
data

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know Im gonna cut ...
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy its like you seem s...
2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isnt hard to f...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I Verse 1 If I wrote a...
4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party its pop...
5,5,all-i-could-do-was-cry,2009,beyonce-knowles,Pop,I heard Church bells ringing I heard A choir s...
6,6,once-in-a-lifetime,2009,beyonce-knowles,Pop,This is just another day that I would spend Wa...
7,7,waiting,2009,beyonce-knowles,Pop,Waiting waiting waiting waiting Waiting wa...
8,8,slow-love,2009,beyonce-knowles,Pop,Verse 1 I read all of the magazines while w...
9,9,why-don-t-you-love-me,2009,beyonce-knowles,Pop,N-n-now honey You better sit down and look ar...


In [14]:
# genre -> list of songs, each song being its lyrics split on single spaces
genresongs = {}
for song_genre, lyric_text in zip(data['genre'], data['lyrics']):
    genresongs.setdefault(song_genre, []).append(lyric_text.split(' '))

top_genres = ['Rock', 'Hip-Hop', 'Pop']

# One row per distinct genre, paired with that genre's list of songs.
unique_genres = list(set(data['genre'].tolist()))
genre_processed_data = pd.DataFrame({
    'Genre': unique_genres,
    'Lyrics Compilation': [genresongs[name] for name in unique_genres],
})
# genre -> (list of vocabulary words, 2-D t-SNE coordinates of their vectors)
dic = {}
for genre in top_genres:
    # all songs stored for this genre (each song is a list of space-split pieces)
    genre_songs = genre_processed_data.loc[
        genre_processed_data['Genre'] == genre, 'Lyrics Compilation'
    ].iloc[0]

    # one training "sentence" per song: the stemmed tokens of the whole song
    totalvocab_stemmed = []
    for song in genre_songs:
        allwords_stemmed = []
        for word in song:
            # strip a few symbols, then tokenize/stem what is left
            word = re.sub('[!@#$]', '', word)
            allwords_stemmed.extend(tokenize_and_stem(word))
        totalvocab_stemmed.append(allwords_stemmed)

    model = Word2Vec(sentences=totalvocab_stemmed, # tokenized sentences, list of list of strings
                     size=300,  # size of embedding vectors
                     workers=4, # how many threads?
                     min_count=20, # minimum frequency per token, filtering rare words
                     sample=0.05, # weight of downsampling common words
                     sg=0, # should we use skip-gram? if 0, then cbow
                     iter=5,
                     hs=0)

    # Take the vocabulary once so labels and vectors are guaranteed to line up,
    # and index through model.wv: indexing the Word2Vec object directly is
    # deprecated in gensim 3.x.
    vocab_words = list(model.wv.vocab)
    X = model.wv[vocab_words]

    # fixed seed so the 2-D layout is repeatable for a given set of vectors
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)

    # store a plain list of words rather than a live view into the model
    dic[genre] = (vocab_words, X_tsne)

# Build the initial figure from the Hip-Hop entry of `dic`:
# element 0 holds the point labels, element 1 the x/y coordinates.
hiphop_labels, hiphop_coords = dic['Hip-Hop']
bokeh_df = pd.DataFrame(data=hiphop_coords, index=hiphop_labels, columns=['x', 'y'])
bokeh_df['text_labels'] = bokeh_df.index

# data source shared by the scatter renderer and the label set
source = ColumnDataSource(bokeh_df)

# interactive controls to include in the plot
TOOLS = "hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"
p = figure(tools=TOOLS, plot_width=700, plot_height=700)

# scatter plot; the renderer handle `t` is kept so its data can be replaced later
t = p.scatter(
    'x', 'y',
    source=source,
    fill_alpha=0.6,
    fill_color="#8724B5",
    line_color=None,
)

# text labels drawn slightly above each point
labels = LabelSet(
    x='x', y='y', text='text_labels',
    y_offset=8,
    text_font_size="6pt",
    text_color="#555555",
    text_align='center',
    source=source,
)
p.add_layout(labels)

# show plot inline
output_notebook()
show(p)

def update(genre):
    """Swap the plotted points to the vocabulary of ``genre`` and redraw.

    genre -- 'Pop' or 'Rock' select that genre's entry in ``dic``; any other
             value falls back to 'Hip-Hop'.

    Replaces the data of the scatter renderer ``t`` and re-shows figure ``p``.
    Nothing is returned.
    """
    key = genre if genre in ('Pop', 'Rock') else 'Hip-Hop'
    words, coords = dic[key]

    bokeh_df = pd.DataFrame(coords, words, columns=['x', 'y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # Hand the renderer a plain dict copy of the new columns: the `.data`
    # container of one ColumnDataSource must not be shared with another source
    # (Bokeh 2.0+ rejects such an assignment).
    t.data_source.data = dict(ColumnDataSource(bokeh_df).data)

    output_notebook()
    show(p)


# `interact` is provided by ipywidgets; it builds a dropdown from the list and
# calls update() with the selected genre.
interact(update, genre=['Rock', 'Hip-Hop', 'Pop'])

<function __main__.update>

In [20]:
from ipywidgets import interact
def updates(genre):
    """Swap the plotted points to the vocabulary of ``genre`` and redraw.

    genre -- 'Hip-Hop' or 'Rock' select that genre's entry in ``dic``; any
             other value falls back to 'Pop'.

    Replaces the data of the scatter renderer ``t`` and re-shows figure ``p``.
    Nothing is returned. Apart from the fallback genre this does the same job
    as ``update``.
    """
    key = genre if genre in ('Hip-Hop', 'Rock') else 'Pop'
    words, coords = dic[key]

    bokeh_df = pd.DataFrame(coords, words, columns=['x', 'y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # Hand the renderer a plain dict copy of the new columns: the `.data`
    # container of one ColumnDataSource must not be shared with another source
    # (Bokeh 2.0+ rejects such an assignment).
    t.data_source.data = dict(ColumnDataSource(bokeh_df).data)

    output_notebook()
    show(p)


# dropdown of genres; each selection calls updates() with the chosen value
interact(updates, genre=['Rock', 'Hip-Hop', 'Pop'])

<function __main__.updates>

In [29]:
# Show the 'artist' values at positional rows 1280 through 2999 (the iloc end bound is exclusive).
data['artist'].iloc[1280:3000]

1846       dreadful-shadows
1847       dreadful-shadows
1848       dreadful-shadows
1849       dreadful-shadows
1850       dreadful-shadows
1851       dreadful-shadows
1852       dreadful-shadows
1853       dreadful-shadows
1854       dreadful-shadows
1855       dreadful-shadows
1856       dreadful-shadows
1857       dreadful-shadows
1858       dreadful-shadows
1859       dreadful-shadows
1860       dreadful-shadows
1861       dreadful-shadows
1862       dreadful-shadows
1863       dreadful-shadows
1864       dreadful-shadows
1865       dreadful-shadows
1866       dreadful-shadows
1867       dreadful-shadows
1868       dreadful-shadows
1869       dreadful-shadows
1870       dreadful-shadows
1871       dreadful-shadows
1872       dreadful-shadows
1873       dreadful-shadows
1874       dreadful-shadows
1875       dreadful-shadows
               ...         
4334          body-language
4337          body-language
4338          body-language
4342          body-language
4345          body-l