# Table Building

## Setup

In [1]:
import numpy as np
import pandas as pd
import re
import json
import nltk
from glob import glob
from numpy.linalg import norm

## Useful functions

In [2]:
def get_BOW(corpus:pd.DataFrame, level:str):
    '''
    Function to get bag of words from a corpus

    Corpus here is loosely defined, as this function will work provided the `level` parameter is in the multi-index of `corpus`

    PARAMETERS:

    `corpus` - pandas DataFrame of body of work. It must be multi-indexed by an OHCO

    `level` - string of OHCO to group by for bags

    OUTPUTS:

    pandas DataFrame of bag of words grouiped by `level`

    EXAMPLE:

    `BOW = get_BOW(CORPUS, 'chap_id')`
    '''
    # Get multi-index from `corpus` df
    idx = list(corpus.index.names)

    # Check to see that `level` exists in `corpus` OHCO
    # Raise error if not
    if (level not in idx):
        raise KeyError (f'{level} not found in corpus OHCO')

    # Split-apply-combine to generate BOW grouped by `level`
    return corpus.groupby(idx[:idx.index(level)+1]+['term_str'])\
        .term_str\
        .count()\
        .to_frame('n')

In [3]:
def get_TFIDF(bow:pd.DataFrame, tf_type:str):
    '''
    Function to compute TFIDF and DFIDF for a given bag of words DataFrame

    PARAMETERS:

    `bow` - pandas DataFrame representation of bag of words
    
    `tf_type` - string of term frequency type to use. Options are currently:\n
                sum, max, log, raw, double_norm (defaults to k=1), and binary

    OUTPUTS:
    
    tf-idf vectorized DataFrame
    df-idf vector for terms in vocabulary, can be attached to VOCAB table
    DTCM matrix as a DataFrame

    EXAMPLE:
    `TFIDF, dfidf, DTCM = get_TFIDF(BOW, 'max')`
    '''

    DTCM = bow.n.unstack(fill_value=0)

    # Term frequency calculation dictionary
    if tf_type == 'sum':
        TF = DTCM.T / DTCM.T.sum()

    elif tf_type == 'max':
        TF = DTCM.T / DTCM.T.max()
        
    elif tf_type == 'log':
        TF = np.log2(1 + DTCM.T)
        
    elif tf_type == 'raw':
        TF = DTCM.T
        
    elif tf_type == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        
    elif tf_type == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
        
    TF = TF.T
    
    # Calculate document frequency
    DF = DTCM.astype('bool').sum()
    
    # Calculate number of documents
    N = DTCM.shape[0]

    # idf calculations
    IDF = np.log2(N / DF)
    
    #TFIDF = TF*IDF

    return (TF*IDF), (DF*IDF), DTCM

## Read Data

In [4]:
with open('../data/cleaned_data.json') as f:
    data = json.load(f)
f.close()

## Build `song_lib` table
This might be useful for getting song info later, but ultimately our library will be based on albums as being the "books" with songs as "chapters"

### Initialize dictionary we'll use to make the data frame later

In [5]:
song_lib_dict = dict()

### Loop over json and add songs and metadata to `lib_dict` with an arbitrary song ID

In [6]:
idx = 0
# Loop over artists
for artist in data.keys():
    # Loop over albums
    for album in data[artist]['Albums'].keys():
        # Loop over tracks
        for track in data[artist]['Albums'][album]['tracklist'].keys():
            song_lib_dict[idx] = {
                'title':track,
                'json_path':(f'data[{artist}][\'Albums\'][{album}][\'tracklist\'][track]'),
                'artist':artist,
                'album':album,
                'genres':data[artist]['Metadata']['genres'],
                'release_date':data[artist]['Albums'][album]['release_date'],
                'label':data[artist]['Albums'][album]['label'],
                'track_number':data[artist]['Albums'][album]['tracklist'][track]['track_number'],
                'duration_ms':data[artist]['Albums'][album]['tracklist'][track]['duration_ms'],
                'track_number':data[artist]['Albums'][album]['tracklist'][track]['track_number'],
                'audio_information':data[artist]['Albums'][album]['tracklist'][track]['audio_information'],
                'lyrics':data[artist]['Albums'][album]['tracklist'][track]['lyrics']
            }
            idx += 1

### Convert `lib_dict` to data frame

In [7]:
song_lib = pd.DataFrame.from_dict(song_lib_dict).T
song_lib.index.name = 'song_id'

In [8]:
song_lib.head()

Unnamed: 0_level_0,title,json_path,artist,album,genres,release_date,label,track_number,duration_ms,audio_information,lyrics
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Pay Your Way In Pain,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,1,183880,"{'danceability': 0.677, 'energy': 0.826, 'key'...",\n(Ow)\nOh-oh-oh\n\nYou got to pay your way in...
1,Down And Out Downtown,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,2,222240,"{'danceability': 0.61, 'energy': 0.502, 'key':...",\nLast night's heels\nOn the mornin' train\nIt...
2,Daddy's Home,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,3,199613,"{'danceability': 0.774, 'energy': 0.372, 'key'...",\nI signed autographs in the visitation room\n...
3,Live In The Dream,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,4,389546,"{'danceability': 0.426, 'energy': 0.339, 'key'...",\nHello\nDo you know where you are?\nYou've be...
4,The Melting Of The Sun,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,5,257519,"{'danceability': 0.52, 'energy': 0.463, 'key':...","\nSo sorry, missed the party\nHello, on the da..."


### Expand `audio_information` to individual columns

In [9]:
audio_info = song_lib.audio_information

In [10]:
audio_frame = audio_info.apply(pd.Series)

In [11]:
audio_frame.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.677,0.826,9,-6.8,1,0.368,0.395,0.0,0.571,0.498,124.821,audio_features,1nLaoWVhNRZ38irY0LZjL6,spotify:track:1nLaoWVhNRZ38irY0LZjL6,https://api.spotify.com/v1/tracks/1nLaoWVhNRZ3...,https://api.spotify.com/v1/audio-analysis/1nLa...,183880,4
1,0.61,0.502,6,-8.409,1,0.082,0.0578,0.0141,0.166,0.673,151.906,audio_features,4htB9ArZCpquXlXnkKjAgk,spotify:track:4htB9ArZCpquXlXnkKjAgk,https://api.spotify.com/v1/tracks/4htB9ArZCpqu...,https://api.spotify.com/v1/audio-analysis/4htB...,222240,4
2,0.774,0.372,11,-9.796,0,0.0956,0.615,0.00148,0.106,0.602,110.036,audio_features,6DeE4f5DFeWycZz8f6UEht,spotify:track:6DeE4f5DFeWycZz8f6UEht,https://api.spotify.com/v1/tracks/6DeE4f5DFeWy...,https://api.spotify.com/v1/audio-analysis/6DeE...,199613,4
3,0.426,0.339,2,-10.262,1,0.027,0.751,0.0645,0.0871,0.0689,80.224,audio_features,27JieQ9wlcYMf1SwTczPUl,spotify:track:27JieQ9wlcYMf1SwTczPUl,https://api.spotify.com/v1/tracks/27JieQ9wlcYM...,https://api.spotify.com/v1/audio-analysis/27Ji...,389547,4
4,0.52,0.463,10,-10.429,1,0.0958,0.609,0.0014,0.426,0.469,159.931,audio_features,3WD3w5uSzAGJWrNFnHhi1a,spotify:track:3WD3w5uSzAGJWrNFnHhi1a,https://api.spotify.com/v1/tracks/3WD3w5uSzAGJ...,https://api.spotify.com/v1/audio-analysis/3WD3...,257520,4


#### Drop some redundant or unnecessary columns

In [12]:
audio_frame.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [13]:
audio_frame = audio_frame.drop(['type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms'], axis=1)

In [14]:
audio_frame.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.677,0.826,9,-6.8,1,0.368,0.395,0.0,0.571,0.498,124.821,4
1,0.61,0.502,6,-8.409,1,0.082,0.0578,0.0141,0.166,0.673,151.906,4
2,0.774,0.372,11,-9.796,0,0.0956,0.615,0.00148,0.106,0.602,110.036,4
3,0.426,0.339,2,-10.262,1,0.027,0.751,0.0645,0.0871,0.0689,80.224,4
4,0.52,0.463,10,-10.429,1,0.0958,0.609,0.0014,0.426,0.469,159.931,4


#### Add `audio_frame` to `lib`

In [15]:
song_lib = song_lib.join(audio_frame)

In [16]:
song_lib.head()

Unnamed: 0_level_0,title,json_path,artist,album,genres,release_date,label,track_number,duration_ms,audio_information,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Pay Your Way In Pain,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,1,183880,"{'danceability': 0.677, 'energy': 0.826, 'key'...",...,9,-6.8,1,0.368,0.395,0.0,0.571,0.498,124.821,4
1,Down And Out Downtown,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,2,222240,"{'danceability': 0.61, 'energy': 0.502, 'key':...",...,6,-8.409,1,0.082,0.0578,0.0141,0.166,0.673,151.906,4
2,Daddy's Home,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,3,199613,"{'danceability': 0.774, 'energy': 0.372, 'key'...",...,11,-9.796,0,0.0956,0.615,0.00148,0.106,0.602,110.036,4
3,Live In The Dream,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,4,389546,"{'danceability': 0.426, 'energy': 0.339, 'key'...",...,2,-10.262,1,0.027,0.751,0.0645,0.0871,0.0689,80.224,4
4,The Melting Of The Sun,data[St. Vincent]['Albums'][Daddy's Home]['tra...,St. Vincent,Daddy's Home,"[art pop, etherpop, indie rock, indietronica, ...",2021-05-14,Loma Vista Recordings,5,257519,"{'danceability': 0.52, 'energy': 0.463, 'key':...",...,10,-10.429,1,0.0958,0.609,0.0014,0.426,0.469,159.931,4


### Change `dtypes` to reasonable types

In [17]:
song_lib.dtypes

title                 object
json_path             object
artist                object
album                 object
genres                object
release_date          object
label                 object
track_number          object
duration_ms           object
audio_information     object
lyrics                object
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
dtype: object

In [18]:
song_lib = song_lib.astype({
    'track_number':'int16',
    'duration_ms':'int32'
})

In [19]:
song_lib.dtypes

title                 object
json_path             object
artist                object
album                 object
genres                object
release_date          object
label                 object
track_number           int16
duration_ms            int32
audio_information     object
lyrics                object
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
dtype: object

## Build `lib` table
This is based on albums as "books" and songs as "chapters"

In [20]:
LIB = song_lib.groupby('album').agg({
    'artist':'first',
    'genres':'first',
    'release_date':'first',
    'label':'first',
    'danceability':'mean',
    'energy':'mean',
    'loudness':'mean',
    'speechiness':'mean',
    'acousticness':'mean',
    'instrumentalness':'mean',
    'liveness':'mean',
    'valence':'mean',
    'tempo':'mean'
})

In [21]:
LIB = LIB.rename(columns={
    'artist':'artist',
    'genres':'genres',
    'release_date':'release_date',
    'label':'label',
    'danceability':'mean_danceability',
    'energy':'mean_energy',
    'loudness':'mean_loudness',
    'speechiness':'mean_speechiness',
    'acousticness':'mean_acousticness',
    'instrumentalness':'mean_instrumentalness',
    'liveness':'mean_liveness',
    'valence':'mean_valence',
    'tempo':'mean_tempo'
})

In [22]:
LIB = LIB.sort_index(ascending=True).reset_index()
LIB.index.name = 'album_id'
LIB.head()

Unnamed: 0_level_0,album,artist,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,45:33,LCD Soundsystem,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085
1,A Moon Shaped Pool,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244
2,Actor,St. Vincent,"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091
3,Alligator,The National,"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308
4,Amnesiac,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376


In [23]:
LIB = LIB.rename(columns={'album':'album_title'})
LIB.head()

Unnamed: 0_level_0,album_title,artist,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,45:33,LCD Soundsystem,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085
1,A Moon Shaped Pool,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244
2,Actor,St. Vincent,"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091
3,Alligator,The National,"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308
4,Amnesiac,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376


### Create `album_name` column with no spaces to be able to reference files

In [24]:
LIB['album_name'] = LIB['album_title'].apply(lambda x: re.sub(r' ', '_', x))
LIB.head()

Unnamed: 0_level_0,album_title,artist,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,album_name
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,45:33,LCD Soundsystem,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085,45:33
1,A Moon Shaped Pool,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244,A_Moon_Shaped_Pool
2,Actor,St. Vincent,"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091,Actor
3,Alligator,The National,"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308,Alligator
4,Amnesiac,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376,Amnesiac


### Create `source_file_path` column

In [25]:
LIB['source_file_path'] = LIB.album_name.apply(lambda x: f'../data/albums/{x}.txt')
LIB.head()

Unnamed: 0_level_0,album_title,artist,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,album_name,source_file_path
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,45:33,LCD Soundsystem,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085,45:33,../data/albums/45:33.txt
1,A Moon Shaped Pool,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244,A_Moon_Shaped_Pool,../data/albums/A_Moon_Shaped_Pool.txt
2,Actor,St. Vincent,"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091,Actor,../data/albums/Actor.txt
3,Alligator,The National,"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308,Alligator,../data/albums/Alligator.txt
4,Amnesiac,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376,Amnesiac,../data/albums/Amnesiac.txt


### Add `song_regex` column

In [26]:
LIB['song_regex'] = '\[Trackname:\s[^\]]+\]'

In [27]:
LIB.head()

Unnamed: 0_level_0,album_title,artist,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,album_name,source_file_path,song_regex
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,45:33,LCD Soundsystem,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085,45:33,../data/albums/45:33.txt,\[Trackname:\s[^\]]+\]
1,A Moon Shaped Pool,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244,A_Moon_Shaped_Pool,../data/albums/A_Moon_Shaped_Pool.txt,\[Trackname:\s[^\]]+\]
2,Actor,St. Vincent,"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091,Actor,../data/albums/Actor.txt,\[Trackname:\s[^\]]+\]
3,Alligator,The National,"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308,Alligator,../data/albums/Alligator.txt,\[Trackname:\s[^\]]+\]
4,Amnesiac,Radiohead,"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376,Amnesiac,../data/albums/Amnesiac.txt,\[Trackname:\s[^\]]+\]


### Reorder columns

In [28]:
LIB = LIB[[
    'album_name',
    'album_title',
    'artist',
    'source_file_path',
    'song_regex',
    'genres',
    'release_date',
    'label',
    'mean_danceability',
    'mean_energy',
    'mean_loudness',
    'mean_speechiness',
    'mean_acousticness',
    'mean_instrumentalness',
    'mean_liveness',
    'mean_valence',
    'mean_tempo'
]]

In [29]:
LIB.head()

Unnamed: 0_level_0,album_name,album_title,artist,source_file_path,song_regex,genres,release_date,label,mean_danceability,mean_energy,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,45:33,45:33,LCD Soundsystem,../data/albums/45:33.txt,\[Trackname:\s[^\]]+\],"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,0.625,0.918,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085
1,A_Moon_Shaped_Pool,A Moon Shaped Pool,Radiohead,../data/albums/A_Moon_Shaped_Pool.txt,\[Trackname:\s[^\]]+\],"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,0.429545,0.406455,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244
2,Actor,Actor,St. Vincent,../data/albums/Actor.txt,\[Trackname:\s[^\]]+\],"[art pop, etherpop, indie rock, indietronica, ...",2009-05-05,4AD,0.484091,0.548455,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091
3,Alligator,Alligator,The National,../data/albums/Alligator.txt,\[Trackname:\s[^\]]+\],"[chamber pop, indie folk, indie rock, indietro...",2005-04-11,Beggars Banquet,0.522154,0.740769,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308
4,Amnesiac,Amnesiac,Radiohead,../data/albums/Amnesiac.txt,\[Trackname:\s[^\]]+\],"[alternative rock, art rock, melancholia, oxfo...",2001-03-12,XL Recordings,0.4085,0.4951,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376


## Build `CORPUS` table
Using the following OHCO:
[album_id, song_id, stanza_num, line_num, token_num]

This will require a bit of work given we store all of the lyrics at a song level in the .json

### Label song lyrics with song name
For simplicity we also go ahead and make these individual `.txt` files for each album

In [30]:
# # Loop over artists
# for artist in data.keys():
#     # Loop over albums
#     for album in data[artist]['Albums'].keys():
#         album_text = ''
#         album_name = re.sub(r' ', '_', album)
#         # Loop over songs
#         for track in data[artist]['Albums'][album]['tracklist'].keys():
#             lyrics = data[artist]['Albums'][album]['tracklist'][track]['lyrics']
#             lyrics = f'\n\n[Trackname: {track}]\n' + lyrics
#             album_text  = album_text + lyrics
#         with open(f'../data/albums/{album_name}.txt', 'w') as outfile:
#             print(f'Wrote album {album_name} to file {album_name}.txt')
#             print(f'{album_text}', file=outfile)
            

### Parse album `.txt` files into `CORPUS` table

### Define OHCO

In [31]:
OHCO = ['album_id', 'song_num', 'stanza_num', 'line_num', 'token_num']

### Find files

In [32]:
source_file_list = sorted(glob(f"../data/albums/*.*"))

### NLTK setup

In [33]:
nltk_resources = [
    'tokenizers/punkt', 
    'taggers/averaged_perceptron_tagger', 
    'corpora/stopwords', 
    'help/tagsets'
]

In [34]:
for rsc in nltk_resources:
    try:
        nltk.data.find(rsc)
    except IndexError:
        nltk.download(rsc)

### Function to tokenize `album.txt` files

In [35]:
def parse_album(album_id:int, album_name:str, song_regex:str):
    # Read in data and create LINES
    LINES = pd.DataFrame(open(f'../data/albums/{album_name}.txt', 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
    LINES.index.name = 'line_num'
    LINES.line_str = LINES.line_str.str.replace(r'\n', '', regex=True).str.strip()

    # Find song headers
    chap_pat = LIB.iloc[0].song_regex
    chap_lines = LINES.line_str.str.match(chap_pat, case=False)

    # Assign numbers to songs
    LINES.loc[chap_lines, 'song_num'] = [i for i in range(LINES.loc[chap_lines].shape[0])]

    # Forward fill song_num to following text lines
    # These are the lines for the song
    LINES.song_num = LINES.song_num.ffill()

    # Clean up
    LINES = LINES.dropna(subset=['song_num'])       # Remove everything before chapter 1
    LINES = LINES.loc[~chap_lines]                  # Remove chapter heading lines
    LINES.song_num = LINES.song_num.astype('int')   # Convert song_num to int

    # Start grouping into OHCO
    # Change songs into one big string
    SONGS = LINES.groupby(OHCO[1])\
        .line_str.apply(lambda x: '\n'.join(x))\
        .to_frame('song_str')
    SONGS['song_str'] = SONGS.song_str.str.strip()

    # Split songs into stanzas
    # This might not work for all lyrics, given the difference in formatting
    stanza_pat = r'\n\n+'
    STANZAS = SONGS['song_str'].str.split(stanza_pat, expand=True).stack()\
        .to_frame('stanza_str').sort_index()
    STANZAS.index.names = OHCO[1:3]

    # Clean up
    STANZAS['stanza_str'] = STANZAS['stanza_str'].str.strip()       # Strip leading and trailing spaces
    STANZAS = STANZAS[~STANZAS['stanza_str'].str.match(r'^\s*$')]   # Remove empty paragraphs

    # Split stanzas into lines
    line_pat = r'\n'
    LINES = STANZAS['stanza_str'].str.split(line_pat, expand=True).stack()\
        .to_frame('line_str')
    LINES.index.names = OHCO[1:4]

    # Clean up
    LINES = LINES[~LINES['line_str'].str.match(r'^\s*$')]   # Remove empty paragraphs
    LINES.line_str = LINES.line_str.str.strip()             # CRUCIAL TO REMOVE BLANK TOKENS

    # POS Tag based on lines
    # Using
    TOKENS = LINES.line_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')
    TOKENS.index.names = OHCO[1:5]

    # Dropped in favor of NLTK processing above
    # # Split lines into tokens
    # token_pat = r"[\s',-]+"
    # TOKENS = LINES['line_str'].str.split(token_pat, expand=True).stack()\
    #     .to_frame('token_str')
    # TOKENS.index.names = OHCO[1:5]
    TOKENS = pd.concat({album_id: TOKENS}, names=[OHCO[0]])

    return TOKENS

### Loop over `LIB` and create `CORPUS`

In [36]:
albums = []

for album_id in LIB.index:
    albums.append(parse_album(album_id, LIB.iloc[album_id].album_name, LIB.iloc[album_id].song_regex))

CORPUS = pd.concat(albums).sort_index()

In [37]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple
album_id,song_num,stanza_num,line_num,token_num,Unnamed: 5_level_1
0,0,0,0,0,"(Shame, NN)"
0,0,0,0,1,"(on, IN)"
0,0,0,0,2,"(you, PRP)"
0,0,0,1,0,"(Shame, NN)"
0,0,0,1,1,"(on, IN)"
...,...,...,...,...,...
88,9,0,41,0,"(On, IN)"
88,9,0,41,1,"(the, DT)"
88,9,0,41,2,"(black, JJ)"
88,9,0,41,3,"(screen, NN)"


## Add `pos`, `token_str`, `term_str`, and `pos_group` to `CORPUS`

In [38]:
CORPUS['pos'] = CORPUS.pos_tuple.apply(lambda x: x[1])
CORPUS['token_str'] = CORPUS.pos_tuple.apply(lambda x: x[0])
CORPUS['term_str'] = CORPUS.token_str.str.lower().str.replace(r"\W+", "", regex=True)
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [39]:
CORPUS.sample(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
album_id,song_num,stanza_num,line_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
58,3,3,1,4,"(sit, VB)",VB,sit,sit,VB
11,1,2,5,7,"(""What, WP)",WP,"""What",what,WP
12,2,4,1,3,"(you, PRP)",PRP,you,you,PR
8,11,4,2,2,"(you, PRP)",PRP,you,you,PR
84,7,3,1,4,"(me, PRP)",PRP,me,me,PR
39,8,0,1,0,"(And, CC)",CC,And,and,CC
16,6,0,27,3,"(again, RB)",RB,again,again,RB
57,8,0,1,0,"(That, DT)",DT,That,that,DT
1,8,3,1,0,"(In, IN)",IN,In,in,IN
69,3,5,3,2,"(you're, RB)",RB,you're,youre,RB


## Add song length to `LIB`

In [40]:
LIB['album_term_count'] = CORPUS.groupby('album_id').term_str.count()
LIB['album_character_count'] = CORPUS.groupby('album_id')['token_str']\
    .apply(lambda x: sum(x.str.len()))

## Drop any weirdness from `CORPUS`

In [41]:
CORPUS = CORPUS[CORPUS.term_str != '']

In [42]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
album_id,song_num,stanza_num,line_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,"(Shame, NN)",NN,Shame,shame,NN
0,0,0,0,1,"(on, IN)",IN,on,on,IN
0,0,0,0,2,"(you, PRP)",PRP,you,you,PR
0,0,0,1,0,"(Shame, NN)",NN,Shame,shame,NN
0,0,0,1,1,"(on, IN)",IN,on,on,IN


## Build `VOCAB` table

### Build core of the table

In [43]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()    
VOCAB['p'] = VOCAB.n/VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)

### Add `sw`

### Expand stopword list
From work later there are some stopwords making it through that I want to remove

In [44]:
stops = nltk.corpus.stopwords.words('english').copy()

In [45]:
expand_stops = [w.replace("'", '') for w in nltk.corpus.stopwords.words('english')]

In [46]:
adds = ['gonna', 'yeah', 'oh', 'o', 'ah', 'yeah', 'yah', 'yeh', 'ya', 'y', 'ooo', 'oo', 'ooh', 'doo', 'doot', 'da', 'non', 'na', 'nah',
        'la', 'lah', 'lala', 'lahlah', 'hey', 'woo', 'woah', 'nanananana']

In [47]:
expand_stops.extend(adds)

In [48]:
stops.extend(expand_stops)

In [49]:
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [50]:
VOCAB = VOCAB.drop_duplicates()

In [51]:
sw = pd.DataFrame({'stop': 1}, index=stops)
sw.index.name='term_str'

In [52]:
if 'stop' not in VOCAB.columns:
    VOCAB = VOCAB.join(sw)
    VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

### Add `porter_stem`

In [53]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer.stem(x.name), 1)

## Generate `BOW`
We'll use song as the bag

In [54]:
BOW_SONG = get_BOW(CORPUS, 'song_num')

## Generate `DTM`, `dfidf`, and `TFIDF`
Went ahead and added `dfidf` to `VOCAB` here for simplicity

In [55]:
TFIDF_SONG, VOCAB['song_dfidf'], DTCM_SONG = get_TFIDF(BOW_SONG, 'max')

In [56]:
DTCM_SONG.head()

Unnamed: 0_level_0,term_str,1,10cc,10th,1st,2,23rd,2pac,3,30th,4,...,zonin,zoning,zoom,zoë,à,еh,еnding,еye,政権の腐敗,政権の腐敗敗
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
TFIDF_SONG.head()

Unnamed: 0_level_0,term_str,1,10cc,10th,1st,2,23rd,2pac,3,30th,4,...,zonin,zoning,zoom,zoë,à,еh,еnding,еye,政権の腐敗,政権の腐敗敗
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create `TFIDF_L2_SONG`
* Still using song as the bag
* Using top 2000 terms by `dfidf`
* Using POS list: `['NN', 'NNS']`
* Taking terms excluding stopwords
    * This reveals a bit of a shortcoming for stopwords for albums as stopwords can be a lot of vocal filler it seems
    * Additionally, we sent vocalizations to stopwords
    * Some songs are very full of vocalizations, and we don't want to examine that right now

### Generate `sig_terms`

In [58]:
# POS list
#pos_list = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
pos_list = ['NN', 'NNS']

# Create significant term list
sig_terms = list(VOCAB.query(f'max_pos in {pos_list} and stop != 1')\
    .sort_values('song_dfidf', ascending=False)[:2000]\
    .index)

### Build and collapse `TFIDF_SUB` into `TFIDF_L2`

In [59]:
TFIDF_SUB_SONG = TFIDF_SONG[TFIDF_SONG.columns.intersection(sig_terms)]
TFIDF_SUB_SONG = TFIDF_SUB_SONG.groupby(OHCO[:2]).mean()
TFIDF_L2_SONG = TFIDF_SUB_SONG.apply(lambda x: x / norm(x), 1).fillna(0)  # Pythagorean, AKA Euclidean

## Reorganize `SONG_LIB`

In [60]:
SONG_LIB = song_lib.merge(LIB['album_title'].reset_index(), left_on='album', right_on='album_title')
SONG_LIB = SONG_LIB.reset_index()
SONG_LIB = SONG_LIB.rename(columns={'index':'song_id'})
SONG_LIB = SONG_LIB.set_index(['album_id', 'song_id'])
SONG_LIB = SONG_LIB.sort_index(ascending=True)
SONG_LIB = SONG_LIB.reset_index('song_id', drop=True)
SONG_LIB = SONG_LIB.set_index(SONG_LIB.groupby(level=0).cumcount().rename('song_num'), append=True)

In [61]:
SONG_LIB.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,json_path,artist,album,genres,release_date,label,track_number,duration_ms,audio_information,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,album_title
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,45:33,data[LCD Soundsystem]['Albums'][45:33]['trackl...,LCD Soundsystem,45:33,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,1,2764933,"{'danceability': 0.478, 'energy': 0.897, 'key'...",...,-6.554,1,0.0471,0.138,0.657,0.1,0.186,113.018,4,45:33
0,1,Freak Out / Starry Eyes,data[LCD Soundsystem]['Albums'][45:33]['trackl...,LCD Soundsystem,45:33,"[alternative dance, alternative rock, art pop,...",2007-11-12,Parlophone UK,2,742053,"{'danceability': 0.772, 'energy': 0.939, 'key'...",...,-5.066,0,0.101,0.216,0.688,0.11,0.726,94.999,4,45:33
1,0,Burn the Witch,data[Radiohead]['Albums'][A Moon Shaped Pool][...,Radiohead,A Moon Shaped Pool,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,1,220609,"{'danceability': 0.54, 'energy': 0.847, 'key':...",...,-6.52,1,0.0297,0.303,0.272,0.109,0.62,148.936,4,A Moon Shaped Pool
1,1,Daydreaming,data[Radiohead]['Albums'][A Moon Shaped Pool][...,Radiohead,A Moon Shaped Pool,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,2,384438,"{'danceability': 0.214, 'energy': 0.263, 'key'...",...,-13.207,0,0.0336,0.968,0.853,0.126,0.113,137.561,3,A Moon Shaped Pool
1,2,Decks Dark,data[Radiohead]['Albums'][A Moon Shaped Pool][...,Radiohead,A Moon Shaped Pool,"[alternative rock, art rock, melancholia, oxfo...",2016-05-08,XL Recordings,3,281011,"{'danceability': 0.557, 'energy': 0.501, 'key'...",...,-10.827,0,0.0269,0.666,0.837,0.117,0.271,139.149,4,A Moon Shaped Pool


### Add Genre to `SONG_LIB` and `LIB` using first item in list

In [62]:
SONG_LIB['genre'] = SONG_LIB['genres'].apply(lambda x : x[0])

In [63]:
LIB['genre'] = LIB['genres'].apply(lambda x: x[0])

## Generate tables with Album as the bag

## Generate `BOW`
We'll use song as the bag

In [64]:
BOW_ALBUM = get_BOW(CORPUS, 'album_id')

## Generate `DTM`, `dfidf`, and `TFIDF`
Went ahead and added `dfidf` to `VOCAB` here for simplicity

In [65]:
TFIDF_ALBUM, VOCAB['album_dfidf'], DTCM_ALBUM = get_TFIDF(BOW_ALBUM, 'max')

In [66]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stop,porter_stem,song_dfidf,album_dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,4,1,2.1e-05,15.555301,CD,CD,0,1,31.58533,17.902934
10cc,1,4,5e-06,17.555301,CD,CD,0,10cc,9.896332,6.475733
10th,5,4,2.6e-05,15.233373,CD,CD,0,10th,17.792665,10.951467
1st,1,3,5e-06,17.555301,CD,CD,0,1st,9.896332,6.475733
2,11,1,5.7e-05,14.095869,CD,CD,0,2,55.170659,25.67865


In [67]:
DTCM_ALBUM.head()

term_str,1,10cc,10th,1st,2,23rd,2pac,3,30th,4,...,zonin,zoning,zoom,zoë,à,еh,еnding,еye,政権の腐敗,政権の腐敗敗
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
TFIDF_ALBUM.head()

term_str,1,10cc,10th,1st,2,23rd,2pac,3,30th,4,...,zonin,zoning,zoom,zoë,à,еh,еnding,еye,政権の腐敗,政権の腐敗敗
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
tfidf = TFIDF_ALBUM.stack().to_frame()
tfidf = tfidf.rename(columns={0:'tfidf'})
tfidf

Unnamed: 0_level_0,Unnamed: 1_level_0,tfidf
album_id,term_str,Unnamed: 2_level_1
0,1,0.0
0,10cc,0.0
0,10th,0.0
0,1st,0.0
0,2,0.0
...,...,...
88,еh,0.0
88,еnding,0.0
88,еye,0.0
88,政権の腐敗,0.0


In [70]:
BOW_ALBUM = BOW_ALBUM.join(tfidf)
BOW_ALBUM

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf
album_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1
0,a,6,0.000000
0,about,1,0.015801
0,accept,1,0.135855
0,admit,1,0.101899
0,afraid,2,0.108454
...,...,...,...
88,your,52,0.006143
88,youre,42,0.052333
88,yourself,3,0.025877
88,youth,1,0.023955


## Create `TFIDF_L2_ALBUM`
* Using album as the bag
* Using top 2000 terms by `dfidf`
* Using POS list: `['NN', 'NNS']`
* See notes for SONG bag

### Generate `sig_terms`

In [71]:
# POS list
#pos_list = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
pos_list = ['NN', 'NNS']

# Create significant term list
sig_terms = list(VOCAB.query(f'max_pos in {pos_list} and stop != 1')\
    .sort_values('album_dfidf', ascending=False)[:2000]\
    .index)

### Build and collapse `TFIDF_SUB` into `TFIDF_L2`

In [72]:
TFIDF_SUB_ALBUM = TFIDF_ALBUM[TFIDF_ALBUM.columns.intersection(sig_terms)]
TFIDF_SUB_ALBUM = TFIDF_SUB_ALBUM.groupby(OHCO[:1]).mean()
TFIDF_L2_ALBUM = TFIDF_SUB_ALBUM.apply(lambda x: x / norm(x), 1).fillna(0)  # Pythagorean, AKA Euclidean

## Save tables to .csv

In [73]:
VOCAB = VOCAB.drop_duplicates()

In [74]:
LIB.to_csv(f'../tables/LIB.csv', sep='|')
SONG_LIB.to_csv(f'../tables/SONG_LIB', sep='|')
CORPUS.to_csv(f'../tables/CORPUS.csv', sep='|')
VOCAB.to_csv(f'../tables/VOCAB.csv', sep='|')
BOW_SONG.to_csv(f'../tables/BOW_SONG.csv', sep='|')
DTCM_SONG.to_csv(f'../tables/DTCM_SONG.csv', sep='|')
TFIDF_SONG.to_csv(f'../tables/TFIDF_SONG.csv', sep='|')
TFIDF_L2_SONG.to_csv(f'../tables/TFIDF_L2_SONG.csv', sep='|')
BOW_ALBUM.to_csv(f'../tables/BOW_ALBUM.csv', sep='|')
DTCM_ALBUM.to_csv(f'../tables/DTCM_ALBUM.csv', sep='|')
TFIDF_ALBUM.to_csv(f'../tables/TFIDF_ALBUM.csv', sep='|')
TFIDF_L2_ALBUM.to_csv(f'../tables/TFIDF_L2_ALBUM.csv', sep='|')