# Working with the lyrics Bag of Word

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set_palette('Dark2')
sns.set_style('whitegrid')

In [2]:
con_mxm = sqlite3.connect('../mxm_dataset.db')
cur_mxm = con_mxm.cursor()

# displaying the different table available
tables = con_mxm.execute("SELECT name FROM sqlite_master WHERE type='table'")
table_names = tables.fetchall()

print('Tables within the database :')
print('{}'.format(table_names[0][0]))
print('{}'.format(table_names[1][0]))

Tables within the database :
words
lyrics


In [3]:
# import the "words" table in a pandas DataFrame
words = pd.read_sql_query("SELECT * FROM words",con_mxm)
print(words.info())
words.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
word    5000 non-null object
dtypes: object(1)
memory usage: 39.1+ KB
None


Unnamed: 0,word
0,i
1,the
2,you
3,to
4,and


In [5]:
# import the "lyrics" table in a pandas DataFrame
# limit to 10055 to have the exact content of each song (120 songs)
lyrics = pd.read_sql_query("SELECT *\
                           FROM lyrics\
                           ORDER BY track_id ASC\
                           LIMIT 10055",con_mxm)
#lyrics = lyric.to_sparse(fill_value=0)

print(lyrics.info())
lyrics.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 5 columns):
track_id    10055 non-null object
mxm_tid     10055 non-null int64
word        10055 non-null object
count       10055 non-null int64
is_test     10055 non-null int64
dtypes: int64(3), object(2)
memory usage: 392.9+ KB
None


Unnamed: 0,track_id,mxm_tid,word,count,is_test
0,TRAAAAV128F421A322,4623710,i,6,0
1,TRAAAAV128F421A322,4623710,the,4,0
2,TRAAAAV128F421A322,4623710,you,2,0
3,TRAAAAV128F421A322,4623710,to,2,0
4,TRAAAAV128F421A322,4623710,and,5,0


## Removing stopwords

In [6]:
stp_wds = stopwords.words()

In [7]:
words_no_stopwords = words[~np.isin(words.word, stp_wds)]
words_no_stopwords.head(5)

Unnamed: 0,word
26,love
28,know
35,like
38,time
43,go


In [8]:
lyrics_no_stopwords = lyrics[~np.isin(lyrics.word, stp_wds)]
print(lyrics_no_stopwords.info())
lyrics_no_stopwords.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5993 entries, 22 to 10054
Data columns (total 5 columns):
track_id    5993 non-null object
mxm_tid     5993 non-null int64
word        5993 non-null object
count       5993 non-null int64
is_test     5993 non-null int64
dtypes: int64(3), object(2)
memory usage: 280.9+ KB
None


Unnamed: 0,track_id,mxm_tid,word,count,is_test
22,TRAAAAV128F421A322,4623710,like,2,0
27,TRAAAAV128F421A322,4623710,got,1,0
28,TRAAAAV128F421A322,4623710,would,1,0
31,TRAAAAV128F421A322,4623710,seem,1,0
32,TRAAAAV128F421A322,4623710,someon,1,0


## Function to choose a given number of songs

In [52]:
from sklearn.utils import shuffle

def get_n_songs(lyrics_df, n_songs=1 ,random=False):
    
    if random == True :
        track_to_keep = lyrics_df.track_id.sample(n=2)
    elif random == False :
        track_to_keep = lyrics_df.track_id.unique()[:n_songs]

    lyrics_subset = lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]
    
    return lyrics_subset

In [61]:
lyrics_sub = get_n_songs(lyrics, n_songs=2, random=True)
lyrics_sub.track_id.unique()

array(['TRAACIE128F428495B', 'TRAAEJQ128F92C484E'], dtype=object)

## Function to pivot by chunks

In [114]:
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    
    print('Processing chunk number 0')
    track_list = np.array_split(lyrics_df.track_id.unique(), n_chunks)
    df0 = lyrics_df[lyrics_df['track_id'].isin(track_list[0])]
    pivot_df = df0.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
    del df0
    pivot_df = pivot_df.to_sparse(fill_value=0)

    for i in range(1, n_chunks):
        print('Processing chunk number {}'.format(i))
        df_tmp = lyrics_df[lyrics_df['track_id'].isin(track_list[i])]
        pivot_df_tmp = df_tmp.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
        pivot_df.append(pivot_df_tmp).fillna(0)
        del df_tmp
        pivot_df = pivot_df.to_sparse(fill_value=0)

    return pivot_df

In [118]:
test_df = pivot_by_chunks(lyrics_df=lyrics, n_chunks=2)

Processing chunk number 0
Processing chunk number 1


In [119]:
test_df.info()

<class 'pandas.core.sparse.frame.SparseDataFrame'>
Index: 60 entries, TRAAAAV128F421A322 to TRAAEHW128F9344FD3
Columns: 1624 entries, & to è
dtypes: int64(1624)
memory usage: 40.6+ KB


## Pivoting the Tables

In [45]:
lyrics.groupby('track_id').count()

Unnamed: 0_level_0,mxm_tid,word,count,is_test
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TRAAAAV128F421A322,68,68,68,68
TRAAABD128F429CF47,74,74,74,74
TRAAAED128E0783FAB,120,120,120,120
TRAAAEF128F4273421,59,59,59,59
TRAAAEW128F42930C0,63,63,63,63
TRAAAFD128F92F423A,82,82,82,82
TRAAAGF12903CEC202,19,19,19,19
TRAAAHJ128F931194C,80,80,80,80
TRAAAHZ128E0799171,224,224,224,224
TRAAAJG128F9308A25,56,56,56,56


In [None]:
# create a table with track_is as index and word as columns
lyrics_mat = lyrics.pivot_table(index='track_id', columns='word', values='count',
                                fill_value=0)
print(lyrics_mat.info())
lyrics_mat.head(5)

In [None]:
dfs = pd.SparseDataFrame([pd.SparseSeries(lyrics[i].toarray().ravel(), fill_value=0) 
                              for i in np.arange(sparse_matrix.shape[0]) ],
                         index=, columns=, default_fill_value=0)

In [None]:
lyrics_mat.to_sparse(fill_value=0).info()

In [None]:
# create a table with track_is as index and word as columns (no stopwords)
lyrics_no_stopwords_mat =lyrics_no_stopwords.pivot_table(index='track_id', columns='word', values='count',
                                                         fill_value=0)
lyrics_no_stopwords_mat.head(5)

... and saving them (database style)

In [None]:
con = sqlite3.connect('lyrics_BOW.db')
lyrics_mat.to_sql('lyrics', con, index=False)
con.commit()
con.close()

In [None]:
con = sqlite3.connect('lyrics_no_stopwords_BOW.db')
lyrics_no_stopwords_mat.to_sql('lyrics', con, index=False)
con.commit()
con.close()

## Projections

Using some dimensionality reduction

### PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)
lyrics_no_pca = pca.fit_transform(lyrics_no_stopwords_mat)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,10))

ax[1].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[1].axis('scaled')
ax[0].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[0].axis('scaled')

ax[1].set_xlim(-5.0,8.0)
ax[1].set_ylim(-5.0,8.0);

### Isomap

In [None]:
from sklearn.manifold import Isomap

In [None]:
iso = Isomap(n_components=2)
lyrics_no_iso = iso.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_iso[:,0], lyrics_no_iso[:,1], marker='.')
plt.axis('scaled');

### LLE

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

In [None]:
lle = LocallyLinearEmbedding(n_components=2)
lyrics_no_lle = lle.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_lle[:,0], lyrics_no_lle[:,1], marker='.')
plt.axis('scaled');

### TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
tsne = TSNE(n_components=2)
lyrics_no_tsne = tsne.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.')
plt.axis('scaled');

## MDS

In [None]:
from sklearn.manifold import MDS

In [None]:
mds = MDS(n_components=2)
lyrics_no_mds = mds.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_mds[:,0], lyrics_no_mds[:,1], marker='.')
plt.axis('scaled');

## Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
agg = AgglomerativeClustering(n_clusters=5)
agg_preds = agg.fit_predict(lyrics_no_stopwords_mat)

In [None]:
from sklearn.cluster import AffinityPropagation

In [None]:
afp = AffinityPropagation(damping=0.95)
afp_preds = afp.fit_predict(lyrics_no_stopwords_mat)

In [None]:
np.unique(afp_preds)

In [None]:
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.', c=afp_preds, cmap='gist_rainbow')
plt.axis('scaled');

In [None]:
lyrics_mat.to_csv('lyrics_pivot.csv')