# PCA Notebook
Note: the album is used as the bag (document unit) for every table in this notebook.

## Setup

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

OHCO = ['album_id', 'song_num', 'stanza_num', 'line_num', 'token_num']
colors = 'YlGnBu'

## Functions

In [2]:
def get_PCA(X:pd.DataFrame, k:int, norm_docs=True, center_by_mean=False, center_by_variance=False) -> tuple:
    '''
    Compute a PCA of a document-term matrix by eigendecomposition of its term covariance matrix.

    PARAMETERS:

    X - pandas `DataFrame` with one row per document and one column per term

    k - int number of principal components to return. If `k` is larger than the number of terms, every available component is returned

    norm_docs - bool (defaults to True) of whether to scale each document (row) to unit L2 length

    center_by_mean - bool (defaults to False) of whether to subtract the column mean from every term column

    center_by_variance - bool (defaults to False) of whether to subtract the column variance from every term column


    OUTPUTS:

    LOADINGS - pandas `DataFrame` representing term-component matrix (index `term_str`, columns `PC0`, `PC1`, ...)

    DCM - pandas `DataFrame` representing document-component matrix (the, possibly shifted, documents projected onto the components)

    COMPINF - pandas `DataFrame` indexed by component with columns `pos` and `neg`: the 10 terms with the highest and the lowest loadings, joined by spaces


    RAISES:

    SyntaxError - if both `center_by_mean` and `center_by_variance` are True
    '''

    # Only one of the two shifting options may be active at a time.
    # (SyntaxError is kept so existing callers that catch it keep working.)
    if center_by_mean and center_by_variance:
        raise SyntaxError('Cannot center by both mean and variance')

    # Normalize docs: divide every row by its L2 norm
    if norm_docs:
        tfidf = (X.T / norm(X, 2, axis=1)).T
    else:
        tfidf = X

    # Both options below only shift the columns by a constant. The covariance
    # matrix is unaffected by such shifts, so they change DCM but not LOADINGS.
    if center_by_mean:
        tfidf = tfidf - tfidf.mean()

    # NOTE(review): subtracting the variance is unusual; scaling by the standard
    # deviation is the more common choice. Confirm this is the intended behavior.
    if center_by_variance:
        tfidf = tfidf - tfidf.var()

    # Compute variance-covariance matrix of the terms
    COV = tfidf.cov()

    # Eigendecomposition: column i of eig_vecs is the eigenvector of eig_vals[i]
    eig_vals, eig_vecs = eigh(COV)

    # Select top k components. Rank by the raw eigenvalue rather than by a
    # rounded percentage, so near-equal components keep a deterministic order.
    top = np.argsort(eig_vals)[::-1][:k]
    pc_ids = [f"PC{i}" for i in range(len(top))]

    # Create LOADINGS (terms x components)
    LOADINGS = pd.DataFrame(eig_vecs[:, top], index=COV.index, columns=pc_ids)
    LOADINGS.index.name = 'term_str'
    LOADINGS.columns.name = 'pc_id'

    # Create DCM by projecting the documents onto the components
    DCM = tfidf.dot(LOADINGS)

    # Create COMPINF. Iterate over the components actually produced (there may
    # be fewer than k); j=0 collects the highest loadings, j=1 the lowest.
    top_terms = []
    for pc in pc_ids:
        for j in [0, 1]:
            comp_str = ' '.join(LOADINGS.sort_values(pc, ascending=bool(j)).head(10).index.to_list())
            top_terms.append((pc, j, comp_str))
    COMPINF = pd.DataFrame(top_terms).set_index([0,1]).unstack()
    COMPINF.index.name = 'comp_id'
    COMPINF.columns = COMPINF.columns.droplevel(0)
    COMPINF = COMPINF.rename(columns={0:'pos', 1:'neg'})

    return (LOADINGS, DCM, COMPINF)

In [3]:
def vis_pcs(M, a, b, label='artist', hover_name='album_title', symbol=None, size=None):
    '''
    Scatter the rows of `M` on two principal components.

    PARAMETERS:

    M - table holding the columns `PC{a}` and `PC{b}` plus the `artist`, `album_title` and `genre` columns shown on hover

    a, b - component numbers plotted on the x and y axes

    label - column used to color the points (defaults to 'artist')

    hover_name - column shown as the title of each hover box (defaults to 'album_title')

    symbol, size - optional columns mapped to marker symbol and marker size

    OUTPUTS:

    The figure returned by `px.scatter`, with box plots in both margins
    '''
    return px.scatter(
        M, x=f"PC{a}", y=f"PC{b}",
        color=label,
        # hover_name was accepted but never forwarded before
        hover_name=hover_name,
        hover_data=['artist', 'album_title', 'genre'],
        symbol=symbol, size=size,
        marginal_x='box', marginal_y='box', height=800,
    )

In [4]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    '''
    Scatter the term loadings on two principal components.

    Reads the module-level `LOADINGS` and `VOCAB` tables and joins them on their index.

    PARAMETERS:

    a, b - component numbers plotted on the x and y axes (default to 0 and 1)

    hover_name - column shown as the title of each hover box (defaults to 'term_str')

    OUTPUTS:

    The figure returned by `px.scatter`, with box plots in both margins
    '''
    # 'i' and 'max_pos' are not LOADINGS columns, so they are expected to come from VOCAB
    X = LOADINGS.join(VOCAB)
    return px.scatter(
        X.reset_index(), x=f"PC{a}", y=f"PC{b}",
        text='term_str', size='i', color='max_pos',
        # hover_name was accepted but never forwarded before
        hover_name=hover_name,
        marginal_x='box', marginal_y='box', height=800,
    )

## Read Data

In [5]:
# Load the pipe-delimited tables from ../tables and index each one by its key columns.
# OHCO[:1] is ['album_id'] and OHCO[:2] is ['album_id', 'song_num'].
LIB = pd.read_csv('../tables/LIB.csv', sep='|').set_index('album_id')
# NOTE(review): this is the only path without a '.csv' extension - confirm the file really is named 'SONG_LIB'.
SONG_LIB = pd.read_csv('../tables/SONG_LIB', sep='|').set_index(['album_id', 'song_num'])
CORPUS = pd.read_csv('../tables/CORPUS.csv', sep='|').set_index(OHCO)
VOCAB = pd.read_csv('../tables/VOCAB.csv', sep='|').set_index('term_str')
# Song-level tables
BOW_SONG = pd.read_csv('../tables/BOW_SONG.csv', sep='|').set_index(['album_id', 'song_num', 'term_str'])
DTCM_SONG = pd.read_csv('../tables/DTCM_SONG.csv', sep='|').set_index(OHCO[:2])
TFIDF_SONG = pd.read_csv('../tables/TFIDF_SONG.csv', sep='|').set_index(OHCO[:2])
TFIDF_L2_SONG = pd.read_csv('../tables/TFIDF_L2_SONG.csv', sep='|').set_index(['album_id', 'song_num'])
# Album-level tables
BOW_ALBUM = pd.read_csv('../tables/BOW_ALBUM.csv', sep='|').set_index(['album_id', 'term_str'])
DTCM_ALBUM = pd.read_csv('../tables/DTCM_ALBUM.csv', sep='|').set_index(OHCO[:1])
TFIDF_ALBUM = pd.read_csv('../tables/TFIDF_ALBUM.csv', sep='|').set_index(OHCO[:1])
TFIDF_L2_ALBUM = pd.read_csv('../tables/TFIDF_L2_ALBUM.csv', sep='|').set_index(['album_id'])

### Set column indices where relevant

In [6]:
# Label the column axis of every document-term table as 'term_str'
for _frame in (
    DTCM_SONG, TFIDF_SONG, TFIDF_L2_SONG,
    DTCM_ALBUM, TFIDF_ALBUM, TFIDF_L2_ALBUM,
):
    _frame.columns.name = 'term_str'
del _frame  # do not leave the loop variable behind in the notebook namespace

### Remove NULLs from `TFIDF_L2_ALBUM`

In [25]:
# Row count before dropping albums that contain any NULL value
a = TFIDF_L2_ALBUM.shape[0]
TFIDF_L2_ALBUM = TFIDF_L2_ALBUM.dropna()
# Row count afterwards; the difference is the number of albums (bags) lost
b = TFIDF_L2_ALBUM.shape[0]
bag_loss = a - b
bag_loss

0

## Generate `LOADINGS`, `DCM`, and `COMPINF`

In [8]:
# Keep the 10 strongest components. norm_docs is off - presumably because the
# input table is already L2-normalized, as its name suggests (TODO confirm).
LOADINGS, DCM, COMPINF = get_PCA(
    TFIDF_L2_ALBUM,
    k=10,
    norm_docs=False,
    center_by_mean=False,
    center_by_variance=False,
)

In [9]:
# Preview the first 10 rows of LOADINGS, shaded with the `colors` colormap
LOADINGS.head(10).style.background_gradient(cmap=colors)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60s,-0.004468,0.004085,0.015749,-0.010716,0.00675,0.020189,-0.009696,-0.014427,0.011097,0.006388
absence,0.003978,-0.016243,0.004935,0.009429,0.005488,-0.018279,0.010666,-0.01546,-0.010365,-0.007887
abuse,-0.005012,0.041491,0.00358,0.007011,0.001768,-0.003975,0.038036,0.01018,0.007895,-0.009992
accident,0.00229,-0.020144,0.005649,0.018312,8.6e-05,0.000375,0.005626,0.01366,0.009564,-0.010889
ace,0.00142,-0.00428,0.004781,-0.000539,0.002769,-0.004073,-0.001524,0.004769,-0.00458,-0.00141
act,-0.004577,-0.01124,0.027815,0.024889,0.010397,0.015621,0.059758,0.007788,0.00107,-0.00257
action,-0.015564,0.016486,0.006491,-0.010909,-0.008829,-0.0196,0.025129,0.007737,-0.018576,0.020161
aeroplane,-0.002926,0.023572,0.013397,0.003168,0.010777,0.014947,-7.2e-05,-0.023157,0.001286,0.026871
affection,-0.001514,-0.007046,0.008329,0.00229,-0.003889,-0.013488,0.002748,-0.005432,-0.004434,0.008312
afternoon,0.057034,-0.022999,-0.087245,-0.001265,-0.008807,0.015907,-0.001379,0.045219,0.037672,-0.059635


In [10]:
# The 10 terms with the largest PC0 loadings
LOADINGS.sort_values('PC0', ascending=False).head(10)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
pills,0.273331,0.207004,-0.086028,-0.053671,0.192381,0.025437,-0.213533,0.089997,-0.189542,0.08385
mouths,0.197171,0.131077,-0.025141,0.044593,0.148787,-0.011287,-0.210985,0.066745,-0.130615,0.130117
girls,0.120459,0.046623,0.047292,-0.075018,0.264333,0.069048,0.024848,-0.018328,-0.098659,0.015549
boys,0.115105,0.093157,-0.007771,-0.006547,0.146431,0.015545,-0.070859,0.014839,-0.087149,0.076266
spinnin,0.108298,0.030643,-0.064164,-0.00038,-0.090299,0.004487,0.025588,-0.018792,0.042567,-0.117113
walkin,0.101213,-0.024124,0.151207,-0.252457,-0.108539,0.25104,-0.061253,-0.032692,0.048228,-0.056303
dreamt,0.100926,-0.019236,-0.017911,0.047277,-0.035731,0.056846,-0.031396,0.009421,0.00758,-0.059468
raindrops,0.090769,-0.023129,-0.038933,0.07756,-0.048327,0.013451,0.021517,0.042372,0.054563,0.009258
passport,0.089285,0.054869,-0.01111,0.013991,0.063944,-0.011977,-0.083175,0.028562,-0.065838,0.045148
discovery,0.086932,-0.017427,0.080608,-0.005076,0.186162,0.053952,0.037478,-0.074904,0.027449,0.046582


In [11]:
# The 10 terms with the smallest PC0 loadings (sorted descending, so the smallest comes last)
LOADINGS.sort_values('PC0', ascending=False).tail(10)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
reason,-0.079078,-0.011855,0.028566,0.031616,0.001014,0.015918,-0.048293,0.01433,0.006159,-0.004646
ride,-0.082238,0.021936,-0.045267,-0.008519,0.002637,-0.027948,0.024415,-0.063344,-0.021587,0.001224
phone,-0.090546,0.053929,0.060919,0.137739,0.026668,0.036387,-0.165659,0.058041,0.079427,0.145327
woman,-0.091657,-0.03058,-0.143519,-0.042172,-0.008091,0.008755,0.004795,-0.007051,0.014358,0.056897
baby,-0.094059,0.068693,-0.034903,-0.084967,-0.010841,-0.11071,0.015318,-0.031846,-0.011279,-0.033931
diamonds,-0.095828,-0.04556,-0.001011,-0.062854,-0.013345,-0.030761,-0.0283,-0.094079,-0.084974,0.058627
angel,-0.09911,0.049996,-0.035507,-0.079042,0.068909,0.018917,-0.027265,-0.02769,-0.040693,0.019065
step,-0.105766,-0.054901,0.107554,0.082193,0.058891,-0.103545,-0.137623,0.039178,0.150467,-0.088253
age,-0.106423,0.02034,0.073751,0.016212,0.010351,0.046193,-0.069554,0.035364,0.040079,0.015793
hes,-0.165577,0.206505,0.0174,0.055632,-0.104393,0.025431,-0.149914,-0.031288,0.045332,-0.030871


In [12]:
# First rows of the document-component matrix
DCM.head()

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-0.04022,-0.050875,0.227919,-0.064445,0.114236,-0.174584,0.044818,-0.311161,0.331174,0.050253
1,0.043772,-0.064437,0.011421,0.016156,-0.078038,-0.004726,-0.004742,-0.048442,-0.094375,0.004607
2,-0.14348,0.007219,0.120089,0.099909,0.126137,-0.047049,-0.118572,0.135408,0.204617,-0.132772
3,-0.03646,-0.090155,0.200202,-0.301,-0.113627,-0.381609,0.04992,0.214163,-0.086891,0.238792
4,0.00785,0.112372,0.06889,0.05056,-0.093606,-0.002953,0.060993,-0.090224,0.071306,0.07722


In [13]:
# Last rows of the document-component matrix
DCM.tail()

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
84,-0.204137,-0.068067,0.046023,-0.253027,0.178081,-0.050348,0.091412,0.09425,-0.091833,-0.038496
85,-0.183528,-0.149337,-0.031055,-0.078365,0.214119,0.067778,-0.028474,0.062317,-0.042588,0.0525
86,-0.253527,-0.125688,-0.192801,-0.052993,0.101273,0.055283,-0.049296,-0.027129,0.004924,0.216607
87,-0.137809,0.109473,0.053821,0.159916,0.065436,0.086513,-0.286379,0.033293,0.075341,0.351959
88,-0.22282,0.0076,0.177081,0.011143,0.07937,-0.08869,0.055931,-0.015495,0.11159,0.041198


In [14]:
# Highest- and lowest-loading terms of the first five components
COMPINF.head()

1,pos,neg
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PC0,pills mouths girls boys spinnin walkin dreamt ...,hes age step angel diamonds baby woman phone r...
PC1,daddys pills hes control mouths guns war work ...,lie energy mystery violet plenty moment sorry ...
PC2,soul walkin everything step denial someones so...,woman truck rhythm violet dusk afternoon pills...
PC3,phone work dark miracle step someones spiral r...,walkin soul arms everything nothin bye lit com...
PC4,girls pills discovery mouths boys lie babies m...,denial someones reject walkin round hes bye li...


## Join album metadata (`LIB`) onto the document-component matrix (`DCM`)

In [17]:
# Attach the LIB columns to each row of DCM; both tables are indexed by album_id.
# NOTE(review): this overwrites DCM, so running the cell a second time joins LIB
# onto a table that already holds its columns - pandas rejects overlapping
# column names in a join unless suffixes are given.
DCM = DCM.join(LIB)
DCM.head()

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,album_term_count,album_character_count,genre
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.04022,-0.050875,0.227919,-0.064445,0.114236,-0.174584,0.044818,-0.311161,0.331174,0.050253,...,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085,634,2533,alternative dance
1,0.043772,-0.064437,0.011421,0.016156,-0.078038,-0.004726,-0.004742,-0.048442,-0.094375,0.004607,...,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244,1661,6626,alternative rock
2,-0.14348,0.007219,0.120089,0.099909,0.126137,-0.047049,-0.118572,0.135408,0.204617,-0.132772,...,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091,1646,6795,art pop
3,-0.03646,-0.090155,0.200202,-0.301,-0.113627,-0.381609,0.04992,0.214163,-0.086891,0.238792,...,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308,3682,14693,chamber pop
4,0.00785,0.112372,0.06889,0.05056,-0.093606,-0.002953,0.060993,-0.090224,0.071306,0.07722,...,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376,1242,4976,alternative rock


In [22]:
# Albums on PC0 (x) vs PC1 (y), colored by artist
vis_pcs(DCM, 0, 1, label='artist')

In [19]:
# Term loadings on PC0 (x) vs PC1 (y)
vis_loadings(0, 1)

In [20]:
# Albums on PC1 (x) vs PC2 (y), colored by artist
vis_pcs(DCM, 1, 2, label='artist')

In [21]:
# Term loadings on PC1 (x) vs PC2 (y)
vis_loadings(1, 2)

## Save Tables

In [26]:
# Write the three PCA tables as pipe-delimited files alongside the input tables.
# NOTE(review): if the `DCM = DCM.join(LIB)` cell has been run, the saved DCM also
# contains the LIB metadata columns - confirm that is what downstream readers expect.
LOADINGS.to_csv('../tables/LOADINGS_ALBUM.csv', sep='|')
DCM.to_csv('../tables/DCM_ALBUM.csv', sep='|')
COMPINF.to_csv('../tables/COMPINF_ALBUM.csv', sep='|')