# MidTerm Project

We decided to analyse a music dataset which contains 300,000 songs, each with its title, lyrics, artist, year of release and genre.

In order to explore the dataset and get a sense of how many songs were released over the years and how many belong to each genre, we decided to plot a heatmap of year against genre. The colour of each cell (its 'hotness'/'coldness') is determined by how high or low the number of songs in that year-genre cell is.
This heatmap gave us good insight into how trends varied across genres, and answered questions such as which genre was popular during certain years and how that popularity changed over time.

Since this dataset has a lot of rich structured textual data, we decided to make use of the lyrics field for each song.
We wanted to see how similar the lyric word choices were for each artist and for each genre in general, and whether any interesting patterns would unfold.

We decided to use the technique of word embeddings to plot the important words (after removing noise and applying techniques such as stemming and tokenization) for each artist and genre. Word embedding is a very useful technique which transforms our sparse vector of words into a dense continuous space, so that the closer two words are, the more closely related they are. "Relation" in this case is learnt using the context from lyrics and the other metadata that the dataset has. We used gensim's Word2Vec model, which helped us achieve the aforementioned goal. In order to visualize the data, we applied a dimensionality reduction technique (to obtain a 2D plot).

Since these operations are very time consuming, we decided to run them for the top 3 artists and top 3 genres only.

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from bokeh.io import push_notebook, output_notebook
from bokeh.models import ColumnDataSource, LabelSet,HoverTool
from bokeh.plotting import figure, output_file, show
from bokeh.models import (ColumnDataSource,HoverTool,LogColorMapper,BasicTicker,PrintfTickFormatter,ColorBar)

def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return the list of their stems.

    The text is broken into sentences and then into words with NLTK. Tokens
    that contain no ASCII letter (pure punctuation, bare numbers) are
    discarded; every remaining token is stemmed with the module-level
    ``stemmer``, which must be defined before this function is called.
    """
    has_letter = re.compile('[a-zA-Z]').search
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if has_letter(word):
                kept_tokens.append(word)
    return [stemmer.stem(word) for word in kept_tokens]


# --- Heatmap: number of songs per (year, genre) -----------------------------

# Drop the columns that are not needed for counting. After the groupby/count
# below, the remaining 'song' column holds the number of songs in each
# (year, genre) cell.
data = pd.read_csv('lyrics.csv')
data = data.dropna()
data = data.drop(['index', 'artist', 'lyrics'], axis=1)
data = data[data['year'] >= 1968]
data = data.groupby(['year', 'genre']).count().reset_index()
pivoted_df = data
source = ColumnDataSource(pivoted_df)

# Log-scaled colour mapping (light = few songs, dark = many songs), spanning
# the smallest to the largest per-cell song count.
colormap = LogColorMapper(palette=['#eff3ff', '#bdd7e7', '#6baed6', '#3182bd', '#08519c'],
                          low=pivoted_df['song'].min(), high=pivoted_df['song'].max())

# Axis categories. Sorted so the genre axis has the same order on every run
# instead of depending on set iteration order.
genres = sorted(set(data['genre'].tolist()))
years = sorted(set(data['year'].tolist()))

# Figure: years on the x axis (drawn above the plot), genres as categorical y.
p1 = figure(title="Genre Popularity Over Time",
            y_range=genres,
            plot_height=1200, plot_width=900,
            x_axis_location="above", toolbar_location='below',
            tools="hover,pan,wheel_zoom,box_zoom,reset,save")

p1.xaxis.axis_label = 'Years'
p1.yaxis.axis_label = 'Genres'
p1.axis.axis_line_color = None
p1.axis.major_tick_line_color = None
p1.axis.major_label_text_font_size = "5pt"

# One unit-sized rectangle per (year, genre) cell, coloured by its song count.
p1.rect(x="year", y="genre", height=1, width=1, source=source,
        fill_color={'field': 'song', 'transform': colormap},
        line_color=None)

# Colour bar legend. The mapped values are raw song counts, so the tick labels
# are formatted as plain integers (no percent sign).
legend = ColorBar(color_mapper=colormap, ticker=BasicTicker(desired_num_ticks=5),
                  formatter=PrintfTickFormatter(format="%d"),
                  major_label_text_font_size="5pt",
                  location=(0, 0))

p1.add_layout(legend, 'right')

# Hover tooltip showing the cell identity and its song count.
p1.select_one(HoverTool).tooltips = [('Genre & Year', '@genre @year'),
                                     ('Number of Songs', '@song')]
output_notebook()
show(p1)


# Reload the full dataset (``data`` was reduced to per-cell counts for the
# heatmap) and keep the first 8000 complete rows.
data = pd.read_csv('lyrics.csv')
data = data.dropna()
# .copy() gives an independent frame, so assigning a column to ``data`` later
# does not write to a slice of the full dataset.
data = data.head(8000).copy()

# Characters that are turned into spaces; apostrophes are removed outright so
# that contractions stay a single word.
chars_rm = ['\n', ',', '[', ']', '.', '?', '!', '(', ')', ':']
_lyric_cleanup = {ord(char): ' ' for char in chars_rm}
_lyric_cleanup[ord('\'')] = None

# A single translate() pass per lyric replaces the per-character
# whole-string replace() calls and yields the same cleaned text.
lyrics_list = [ly.translate(_lyric_cleanup) for ly in data['lyrics']]
    

In [3]:
# NLTK resources: the English Snowball stemmer (read as a global by
# tokenize_and_stem) and the English stop-word list.
stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')

# Replace the raw lyrics column with the cleaned strings in lyrics_list.
data['lyrics'] = lyrics_list

In [4]:
# Group the cleaned lyrics by artist: artist -> list of songs, where each song
# is the list of pieces obtained by splitting its lyrics on single spaces.
artistsongs = {}
for singer, lyric in zip(data['artist'], data['lyrics']):
    artistsongs.setdefault(singer, []).append(lyric.split(' '))

# Artists whose word embeddings are trained and plotted in the cells below.
top_artists = ['beyonce-knowles', 'dreadful-shadows', 'funkadelic']

# Both columns are built from the same dict, so each artist is paired with its
# own songs by construction rather than by two set iterations agreeing on order.
processed_data = pd.DataFrame({'Artist': list(artistsongs),
                               'Lyrics Compilation': list(artistsongs.values())})

In [5]:
# Train one Word2Vec model per top artist and project its vocabulary to 2-D.
# dic_art maps artist -> (list of vocabulary words, t-SNE coordinates), where
# row i of the coordinates belongs to word i of the list.
dic_art = {}
for artist_name in top_artists:
    artist_songs = processed_data[processed_data['Artist'] == artist_name]['Lyrics Compilation'].tolist()[0]

    # One training "sentence" per song: the stemmed tokens of all its words.
    totalvocab_stemmed = []
    for song in artist_songs:
        allwords_stemmed = []
        for word in song:
            word = re.sub('[!@#$]', '', word)
            allwords_stemmed.extend(tokenize_and_stem(word))
        totalvocab_stemmed.append(allwords_stemmed)

    model = Word2Vec(sentences=totalvocab_stemmed,  # tokenized sentences, list of list of strings
                     size=300,      # size of embedding vectors
                     workers=4,     # number of worker threads
                     min_count=20,  # minimum frequency per token, filtering rare words
                     sample=0.05,   # downsampling setting for common words
                     sg=0,          # 0 selects CBOW, 1 would select skip-gram
                     iter=5,
                     hs=0)

    # Materialise the vocabulary once so words and vectors share one order,
    # and index through model.wv (indexing the model itself is deprecated).
    vocab_words = list(model.wv.vocab)
    X = model.wv[vocab_words]
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)
    dic_art[artist_name] = (vocab_words, X_tsne)

In [None]:
from ipywidgets import interact


def _artist_frame(artist):
    """Return a DataFrame of 2-D word coordinates for ``artist``.

    Reads the ``(words, coordinates)`` pair stored in ``dic_art`` and returns
    a frame indexed by word with columns ``x``, ``y`` and ``text_labels``
    (a copy of the word index, used as label text in the plot).
    """
    words, coords = dic_art[artist]
    frame = pd.DataFrame(coords, index=list(words), columns=['x', 'y'])
    frame['text_labels'] = frame.index
    return frame


bokeh_df = _artist_frame(top_artists[0])

# interactive controls to include to the plot
TOOLS = "hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"

p = figure(tools=TOOLS, plot_width=700, plot_height=700)

# define data source for the plot; shared by the scatter glyph and the labels
source = ColumnDataSource(bokeh_df)

# scatter plot
t = p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

# text labels
labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                  text_font_size="6pt", text_color="#555555",
                  source=source, text_align='center')

p.add_layout(labels)


def update(artist):
    """Redraw the artist plot with the word embedding of ``artist``.

    Any value other than the first two entries of ``top_artists`` selects the
    third entry. The scatter's data source is updated in place and the figure
    is shown again in the notebook.
    """
    selected = artist if artist in top_artists[:2] else top_artists[2]
    # Assign a plain dict: newer Bokeh releases reject setting ``.data`` from
    # another ColumnDataSource's ``.data`` object.
    t.data_source.data = dict(ColumnDataSource(_artist_frame(selected)).data)
    output_notebook()
    show(p)


interact(update, artist=top_artists)

<function __main__.update>

# Inferences from the artist lyric analysis:

The above plot shows the words used in beyonce-knowles's lyrics. The artist drop-down can be used to switch the plot to the word embedding graph of either of the other two top artists.

1. Beyonce's lyrics seem to fall under more or less one cluster, implying that most of her songs talk about roughly the same subject.
2. Funkadelic's lyrics seem to fall under 4 different clusters, implying that the songs are varied in meaning and context.
3. Dreadful Shadows' lyrics seem to be repetitive, since there aren't many important words even though they are one of the top artists (most number of song contributions). This suggests that the music could be instrumentally inclined.

In [None]:
# Group the cleaned lyrics by genre: genre -> list of songs, where each song
# is the list of pieces obtained by splitting its lyrics on single spaces.
genresongs = {}
for genre, lyric in zip(data['genre'], data['lyrics']):
    genresongs.setdefault(genre, []).append(lyric.split(' '))

# Genres whose word embeddings are trained and plotted below.
top_genres = ['Rock', 'Hip-Hop', 'Pop']

# Both columns are built from the same dict, so each genre is paired with its
# own songs by construction rather than by two set iterations agreeing on order.
genre_processed_data = pd.DataFrame({'Genre': list(genresongs),
                                     'Lyrics Compilation': list(genresongs.values())})

# Train one Word2Vec model per top genre and project its vocabulary to 2-D.
# dic maps genre -> (list of vocabulary words, t-SNE coordinates), where row i
# of the coordinates belongs to word i of the list.
dic = {}
# The loop variable is deliberately not called ``genres`` so the list of
# heatmap genres defined earlier in the notebook is not overwritten.
for genre_name in top_genres:
    genre_songs = genre_processed_data[genre_processed_data['Genre'] == genre_name]['Lyrics Compilation'].tolist()[0]

    # One training "sentence" per song: the stemmed tokens of all its words.
    totalvocab_stemmed = []
    for song in genre_songs:
        allwords_stemmed = []
        for word in song:
            word = re.sub('[!@#$]', '', word)
            allwords_stemmed.extend(tokenize_and_stem(word))
        totalvocab_stemmed.append(allwords_stemmed)

    model = Word2Vec(sentences=totalvocab_stemmed,  # tokenized sentences, list of list of strings
                     size=300,      # size of embedding vectors
                     workers=4,     # number of worker threads
                     min_count=20,  # minimum frequency per token, filtering rare words
                     sample=0.05,   # downsampling setting for common words
                     sg=0,          # 0 selects CBOW, 1 would select skip-gram
                     iter=5,
                     hs=0)

    # Materialise the vocabulary once so words and vectors share one order,
    # and index through model.wv (indexing the model itself is deprecated).
    vocab_words = list(model.wv.vocab)
    X = model.wv[vocab_words]
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)
    dic[genre_name] = (vocab_words, X_tsne)

def _genre_frame(genre):
    """Return a DataFrame of 2-D word coordinates for ``genre``.

    Reads the ``(words, coordinates)`` pair stored in ``dic`` and returns a
    frame indexed by word with columns ``x``, ``y`` and ``text_labels``
    (a copy of the word index, used as label text in the plot).
    """
    words, coords = dic[genre]
    frame = pd.DataFrame(coords, index=list(words), columns=['x', 'y'])
    frame['text_labels'] = frame.index
    return frame


bokeh_df = _genre_frame(top_genres[0])

# interactive controls to include to the plot
TOOLS = "hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"

plot = figure(tools=TOOLS, plot_width=700, plot_height=700)

# define data source for the plot; shared by the scatter glyph and the labels
source1 = ColumnDataSource(bokeh_df)

# scatter plot -- bound to source1 (the genre data) so points and labels
# always come from the same source
r = plot.scatter('x', 'y', source=source1, fill_alpha=0.6,
                 fill_color="#B724B5",
                 line_color=None)

# text labels
labels1 = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                   text_font_size="6pt", text_color="#555555",
                   source=source1, text_align='center')

plot.add_layout(labels1)


def update1(genre):
    """Redraw the genre plot with the word embedding of ``genre``.

    Any value other than the first two entries of ``top_genres`` selects the
    third entry. Every selection updates the genre renderer ``r``; the figure
    is then shown again in the notebook.
    """
    selected = genre if genre in top_genres[:2] else top_genres[2]
    # Assign a plain dict: newer Bokeh releases reject setting ``.data`` from
    # another ColumnDataSource's ``.data`` object.
    r.data_source.data = dict(ColumnDataSource(_genre_frame(selected)).data)
    output_notebook()
    show(plot)


interact(update1, genre=top_genres)

# Inferences from the genre lyric analysis

1. Words like good and bad are clustered far apart from each other in the rock genre. The choice of words seems binary: most positive words are clustered on the right, and the negative ones such as harm, sore etc. are near each other on the left.
2. Hip hop seems to have no clusters, implying that there is no lyrical structure in that genre.
3. Words in the pop genre form one giant cluster, implying that there is lyrical structure in the genre.