# PCA Notebook
Note: every table in this notebook uses the song as its bag (i.e., one document per song).

## Setup

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

# Index levels of the token-level CORPUS table; the first two levels
# (album, song) also key the song-level tables loaded below.
OHCO = [
    'album_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

# Colormap name passed as `cmap` when styling table output.
colors = 'YlGnBu'

## Functions

In [2]:
def get_PCA(X:pd.DataFrame, k:int, norm_docs=True, center_by_mean=False, center_by_variance=False) -> tuple:
    '''
    Compute a PCA of a document-term matrix by eigendecomposition of its term covariance matrix.

    PARAMETERS:

    X - pandas `DataFrame` with one row per document and one column per term

    k - int number of principal components to return (must be at least 1). If `k` exceeds the
        number of terms, every available component is returned.

    norm_docs - bool (defaults to True) of whether to scale each document row to unit L2 length.
        All-zero rows are left as zero vectors instead of becoming NaN.

    center_by_mean - bool (defaults to False) of whether to subtract each term column's mean.
        pandas `.cov()` already centers internally, so this only affects the projected
        document scores in `DCM`, not the components themselves.

    center_by_variance - bool (defaults to False) of whether to scale each term column to unit
        variance (divide by its standard deviation), which makes the decomposition operate on
        the correlation matrix. Zero-variance columns are left unscaled.


    OUTPUTS:

    LOADINGS - pandas `DataFrame` representing term-component matrix
        (index `term_str`, columns `PC0`, `PC1`, ... named `pc_id`)

    DCM - pandas `DataFrame` representing document-component matrix
        (index of `X`, columns `PC0`, `PC1`, ...)

    COMPINF - pandas `DataFrame` representing component information: for each component
        (index `comp_id`) the ten highest-loading terms (`pos`) and the ten lowest-loading
        terms (`neg`), each as a space-separated string


    RAISES:

    SyntaxError - if both `center_by_mean` and `center_by_variance` are set

    ValueError - if `k` is smaller than 1
    '''

    # Input handling to ensure only one column transformation is used.
    # NOTE(review): SyntaxError is an unusual type for an argument error (ValueError would be
    # conventional); it is kept so existing callers that catch it keep working.
    if center_by_mean and center_by_variance:
        raise SyntaxError('Cannot center by both mean and variance')

    if k < 1:
        raise ValueError('k must be a positive integer')

    # Normalize docs to unit L2 length
    if norm_docs:
        doc_norms = norm(X, 2, axis=1)
        # An empty document has norm 0; dividing by 1 keeps it a zero vector (no NaN rows).
        doc_norms[doc_norms == 0] = 1.0
        M = X.div(pd.Series(doc_norms, index=X.index), axis=0)
    else:
        M = X

    # Center by mean
    if center_by_mean:
        M = M - M.mean()

    # Scale by variance: divide by the column standard deviation. Subtracting the variance
    # would only shift each column by a constant and leave the covariance untouched.
    if center_by_variance:
        col_std = M.std()
        M = M / col_std.where(col_std > 0, 1.0)

    # Compute variance-covariance matrix
    COV = M.cov()

    # Eigendecomposition; eigenvectors are the columns of `eig_vecs`
    eig_vals, eig_vecs = eigh(COV)

    # Select the top k components by eigenvalue. Ranking on the raw eigenvalues (rather than
    # on a rounded explained-variance percentage) keeps near-tied components in order.
    top = np.argsort(eig_vals)[::-1][:k]
    pc_ids = [f"PC{i}" for i in range(len(top))]

    # Create LOADINGS. `rename` returns a new Index, so the caller's column index is not renamed.
    LOADINGS = pd.DataFrame(
        eig_vecs[:, top],
        index=COV.index.rename('term_str'),
        columns=pd.Index(pc_ids, name='pc_id'),
    )

    # Create DCM by projecting the (transformed) documents onto the components
    DCM = M.dot(LOADINGS)

    # Create COMPINF. Rows are built in component order, so PC10 follows PC9 rather than PC1.
    top_terms = []
    for pc in pc_ids:
        pos_terms = LOADINGS[pc].sort_values(ascending=False).head(10).index
        neg_terms = LOADINGS[pc].sort_values(ascending=True).head(10).index
        top_terms.append((' '.join(map(str, pos_terms)), ' '.join(map(str, neg_terms))))
    COMPINF = pd.DataFrame(
        top_terms,
        index=pd.Index(pc_ids, name='comp_id'),
        columns=['pos', 'neg'],
    )

    return (LOADINGS, DCM, COMPINF)

In [3]:
def vis_pcs(M, a, b, label='artist', hover_name='title', symbol=None, size=None):
    '''
    Scatter-plot documents on two principal components.

    PARAMETERS:

    M - table handed to `px.scatter`; it must provide the columns `PC{a}` and `PC{b}`, the
        hover columns 'artist', 'title', 'album' and 'track_number', and any column named
        by `label`, `hover_name`, `symbol` or `size`

    a - component number plotted on the x axis

    b - component number plotted on the y axis

    label - column used to color the points (defaults to 'artist')

    hover_name - column shown as the tooltip title (defaults to 'title')

    symbol - optional column used to choose marker symbols

    size - optional column used to scale marker sizes

    OUTPUTS:

    The figure returned by `px.scatter`, with a box-plot marginal on the x axis.
    '''
    return px.scatter(M, x=f"PC{a}", y=f"PC{b}", color=label,
                      # Previously accepted but never forwarded to the plot.
                      hover_name=hover_name,
                      hover_data=['artist', 'title', 'album', 'track_number'],
                      symbol=symbol, size=size,
                      marginal_x='box', height=800)

In [4]:
def vis_loadings(a=0, b=1, hover_name='term_str', loadings=None, vocab=None):
    '''
    Scatter-plot terms on two principal components.

    PARAMETERS:

    a - component number plotted on the x axis (defaults to 0)

    b - component number plotted on the y axis (defaults to 1)

    hover_name - column shown as the tooltip title (defaults to 'term_str')

    loadings - term-component table indexed by `term_str`; defaults to the notebook-level
        `LOADINGS`

    vocab - term table joined onto the loadings; it must supply the columns `i` (marker size)
        and `max_pos` (marker color); defaults to the notebook-level `VOCAB`

    OUTPUTS:

    The figure returned by `px.scatter`, with a box-plot marginal on the x axis.
    '''
    # Fall back to the notebook globals so existing calls keep working unchanged.
    if loadings is None:
        loadings = LOADINGS
    if vocab is None:
        vocab = VOCAB

    # reset_index exposes the term index as a regular `term_str` column for plotting.
    X = loadings.join(vocab).reset_index()
    return px.scatter(X, x=f"PC{a}", y=f"PC{b}",
                      text='term_str', size='i', color='max_pos',
                      # Previously accepted but never forwarded to the plot.
                      hover_name=hover_name,
                      marginal_x='box', height=800)

## Read Data

In [5]:
# All tables are pipe-delimited files under ../tables; each is indexed on load.

# Album-level and song-level library tables.
LIB = (
    pd.read_csv('../tables/LIB.csv', sep='|')
    .set_index('album_id')
)
# NOTE(review): this path has no `.csv` extension, unlike every other table — confirm the file name.
SONG_LIB = (
    pd.read_csv('../tables/SONG_LIB', sep='|')
    .set_index(OHCO[:2])
)

# Token-level corpus, indexed by the full OHCO hierarchy.
CORPUS = (
    pd.read_csv('../tables/CORPUS.csv', sep='|')
    .set_index(OHCO)
)

# Vocabulary, one row per term string.
VOCAB = (
    pd.read_csv('../tables/VOCAB.csv', sep='|')
    .set_index('term_str')
)

# Song-level bag of words, keyed by (album, song, term).
BOW = (
    pd.read_csv('../tables/BOW_SONG.csv', sep='|')
    .set_index(OHCO[:2] + ['term_str'])
)

# Song-level matrices, keyed by (album, song).
DTCM = (
    pd.read_csv('../tables/DTCM_SONG.csv', sep='|')
    .set_index(OHCO[:2])
)
TFIDF = (
    pd.read_csv('../tables/TFIDF_SONG.csv', sep='|')
    .set_index(OHCO[:2])
)
TFIDF_L2 = (
    pd.read_csv('../tables/TFIDF_L2_SONG.csv', sep='|')
    .set_index(OHCO[:2])
)

### Name the column index of the term matrices

In [6]:
# The columns of these three matrices are terms; label the column index accordingly.
for term_matrix in (DTCM, TFIDF, TFIDF_L2):
    term_matrix.columns.name = 'term_str'

In [7]:
# Display the loaded TFIDF_L2 matrix as a sanity check.
TFIDF_L2

Unnamed: 0_level_0,term_str,admit,afraid,again,ahead,aint,alien,alive,almost,alone,already,...,year,years,yellow,youd,youll,young,youre,yours,youth,youve
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.080607,0.116604,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.059938,...,0.0,0.057292,0.0,0.000000,0.000000,0.0,0.087363,0.0,0.000000,0.000000
0,1,0.000000,0.000000,0.549158,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1,0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1,1,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1,2,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.395009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,5,0.000000,0.000000,0.034126,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.038554,0.0,0.172312,0.0,0.069075,0.038753
91,6,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.052607,0.0,0.000000,0.000000,0.0,0.060164,0.0,0.000000,0.040593
91,7,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.105323,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.076757,0.0,0.000000,0.000000
91,8,0.000000,0.000000,0.000000,0.0,0.143172,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.068694,0.000000,0.0,0.049581,0.0,0.000000,0.100358


### Remove NULLs from `TFIDF_L2`

In [8]:
# Row count before dropping incomplete rows.
a = TFIDF_L2.shape[0]

# Discard every row that contains at least one null value.
TFIDF_L2 = TFIDF_L2.dropna(axis=0, how='any')

# Row count afterwards; the difference is the number of bags (songs) lost.
b = TFIDF_L2.shape[0]
bag_loss = a - b
bag_loss

8

## Generate `LOADINGS`, `DCM`, and `COMPINF`

In [9]:
# Ten components, with no extra normalization or centering applied inside get_PCA.
# NOTE(review): presumably TFIDF_L2 is already L2-normalized (per its name), hence norm_docs=False — confirm.
LOADINGS, DCM, COMPINF = get_PCA(
    TFIDF_L2,
    k=10,
    norm_docs=False,
    center_by_mean=False,
    center_by_variance=False,
)

In [10]:
# Show the first ten term loadings, shaded with the `colors` colormap.
LOADINGS.head(10).style.background_gradient(cmap=colors)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
admit,0.002451,-0.000686,-0.002238,0.00436,0.000151,0.002594,0.000301,0.004247,-0.003283,-0.000771
afraid,0.007607,-0.007014,-0.001308,0.041637,-0.04331,0.027658,0.013895,0.020932,-0.015388,0.004629
again,-0.028124,-0.032547,-0.030773,-0.022023,-0.017743,-0.124288,-0.046008,-0.081565,0.018272,-0.037316
ahead,0.005677,0.013225,-0.004436,-0.005088,0.007706,-0.000516,0.005099,-0.004896,0.019412,0.000356
aint,0.002635,0.001359,-0.006889,0.015119,0.00378,0.014053,-0.034688,-0.039753,-0.05449,-0.000657
alien,-0.00621,-0.014133,0.000295,-0.004746,0.008999,-0.006129,-0.02708,-0.001546,0.00776,-0.002238
alive,-0.008496,0.031024,0.021922,-0.002677,-0.005897,0.021213,-0.007178,-0.032587,-0.093107,0.021352
almost,0.003419,-0.00733,-0.020146,0.002769,-0.003448,0.013014,-0.009655,0.045796,0.03902,-0.004275
alone,-0.040164,-0.029219,0.021885,0.066907,0.003694,-0.015398,0.040454,-0.011203,-0.008539,0.054039
already,-0.003187,-0.022285,-0.016907,0.008515,-0.015219,0.015505,-0.024813,-0.014756,0.03065,-0.003399


In [11]:
# First rows of the document-component matrix (one row per album/song pair).
DCM.head()

Unnamed: 0_level_0,pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,-0.156579,-0.108964,-0.148501,0.061985,-0.110232,0.015552,-0.018331,-0.026064,0.049509,0.051488
0,1,-0.149733,-0.057213,-0.032084,0.028575,-0.050405,-0.119869,-0.133881,-0.209176,-0.144738,-0.148798
1,0,-0.009232,-0.032631,0.021213,0.064602,0.025031,-0.014246,0.036429,0.028644,0.025909,0.092896
1,1,-0.017147,0.001126,-0.020606,0.064183,-0.057723,-0.014021,0.031238,-0.05577,0.049639,0.000454
1,2,-0.043525,0.004738,-0.054372,0.035539,-0.006521,0.021488,0.036492,-0.051879,0.003264,-0.051876


In [12]:
# Last rows of the document-component matrix.
DCM.tail()

Unnamed: 0_level_0,pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
91,5,-0.116757,-0.171783,-0.136821,0.071665,-0.113748,-0.014061,-0.110583,0.060438,0.02736,0.010152
91,6,-0.06845,-0.148711,0.059569,-0.02694,-0.222486,0.0158,0.056047,-0.001361,-0.029336,0.012611
91,7,-0.126655,-0.145567,-0.107183,0.077105,-0.054665,0.006685,0.024628,0.00978,0.039644,0.059879
91,8,-0.064518,-0.094885,-0.011958,0.04527,-0.092617,0.016906,-0.003498,-0.00239,-0.038139,-0.038723
91,9,0.013646,-0.020556,-0.021711,0.02324,-0.005267,0.01713,0.038854,-0.033734,-0.00129,-0.034217


In [13]:
# Top positive- and negative-loading terms for the first five components.
COMPINF.head()

1,pos,neg
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PC0,light blue black blood leaves girls round noth...,want wanna dont love really youre need know ma...
PC1,want really stars moon sing free room girls ev...,love wanna youre know dont gonna live just wer...
PC2,yeah wanna live feel gonna dead everybody figh...,youre never time away know need come tell noth...
PC3,know live never away take cant need yeah wanna...,love want release stop come mother gonna wont ...
PC4,wanna dont live leave more wish little right f...,yeah gonna were youre know love come time away...


## Join song metadata (`SONG_LIB`) onto the document-component matrix

In [14]:
# Attach song-level metadata to each song's component scores.
# Any SONG_LIB columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not fail with an overlapping-columns error.
DCM = DCM.drop(columns=SONG_LIB.columns, errors='ignore').join(SONG_LIB)
DCM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,album_title
album_id,song_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,-0.156579,-0.108964,-0.148501,0.061985,-0.110232,0.015552,-0.018331,-0.026064,0.049509,0.051488,...,-6.554,1,0.0471,0.138,0.657,0.1,0.186,113.018,4,45:33
0,1,-0.149733,-0.057213,-0.032084,0.028575,-0.050405,-0.119869,-0.133881,-0.209176,-0.144738,-0.148798,...,-5.066,0,0.101,0.216,0.688,0.11,0.726,94.999,4,45:33
1,0,-0.009232,-0.032631,0.021213,0.064602,0.025031,-0.014246,0.036429,0.028644,0.025909,0.092896,...,-6.52,1,0.0297,0.303,0.272,0.109,0.62,148.936,4,A Moon Shaped Pool
1,1,-0.017147,0.001126,-0.020606,0.064183,-0.057723,-0.014021,0.031238,-0.05577,0.049639,0.000454,...,-13.207,0,0.0336,0.968,0.853,0.126,0.113,137.561,3,A Moon Shaped Pool
1,2,-0.043525,0.004738,-0.054372,0.035539,-0.006521,0.021488,0.036492,-0.051879,0.003264,-0.051876,...,-10.827,0,0.0269,0.666,0.837,0.117,0.271,139.149,4,A Moon Shaped Pool


In [15]:
# Plot songs with PC1 on the x axis and PC0 on the y axis, colored by artist.
vis_pcs(DCM, 1, 0, label='artist')

In [16]:
# Plot term loadings with PC0 on the x axis and PC1 on the y axis.
vis_loadings(0, 1)