# PCA Notebook
Note: we're using the album as the bag (the document unit) for these tables.

## Setup

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

# Index levels of the corpus hierarchy, ordered from coarsest to finest.
# Prefixes of this list (e.g. OHCO[:1], OHCO[:2]) are used as index keys below.
OHCO = [
    'album_id',
    'song_num',
    'stanza_num',
    'line_num',
    'token_num',
]

# Colormap name passed to the pandas Styler background gradient.
colors = 'YlGnBu'

## Functions

In [2]:
def get_PCA(X:pd.DataFrame, k:int, norm_docs=True, center_by_mean=False, center_by_variance=False) -> tuple:
    '''
    Compute a PCA of a document-term matrix by eigendecomposition of its
    term-term covariance matrix.

    PARAMETERS:

    X - pandas `DataFrame` with one row per document and one column per term

    k - int number of principal components to keep (must be >= 1). If `k` is larger than the number of terms, every available component is returned

    norm_docs - bool (defaults to True); if True, each document row is divided by its L2 norm

    center_by_mean - bool (defaults to False); if True, the column mean is subtracted from every term column

    center_by_variance - bool (defaults to False); if True, the column variance is subtracted from every term column


    OUTPUTS:

    LOADINGS - pandas `DataFrame` representing term-component matrix (index `term_str`, columns `PC0`, `PC1`, ...)

    DCM - pandas `DataFrame` representing document-component matrix (rows of the prepared matrix projected onto the components)

    COMPINF - pandas `DataFrame` indexed by component with columns `pos` and `neg`: the (up to) 10 terms with the highest / lowest loadings, joined by spaces


    RAISES:

    SyntaxError - if both `center_by_mean` and `center_by_variance` are True

    ValueError - if `k` is smaller than 1
    '''

    # Input handling to ensure only one centering method is used.
    # SyntaxError (rather than the more conventional ValueError) is kept so
    # existing callers that catch it keep working.
    if (center_by_mean and center_by_variance):
        raise SyntaxError('Cannot center by both mean and variance')

    # Without at least one component COMPINF cannot be built
    if k < 1:
        raise ValueError('k must be a positive integer')

    # Normalize docs (row-wise L2 norm)
    if (norm_docs):
        tfidf = (X.T/norm(X, 2, axis=1)).T
    else:
        tfidf = X

    # Center by mean
    if (center_by_mean):
        tfidf = tfidf - tfidf.mean()

    # Center by variance
    if (center_by_variance):
        tfidf = tfidf - tfidf.var()

    # Compute variance-covariance matrix
    COV = tfidf.cov()

    # Eigendecomposition (eigenvectors are the columns of `eig_vecs`)
    eig_vals, eig_vecs = eigh(COV)

    # Convert eigenvalues and eigenvectors into DataFrames. The term labels on
    # the eigenvector columns / eigenvalue rows are only positional placeholders
    # that let the join pair eigenvalue i with eigenvector i.
    EIG_VECS = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
    EIG_VALS = pd.DataFrame(eig_vals, index=COV.index, columns=['eig_val'])
    EIG_PAIRS = EIG_VALS.join(EIG_VECS.T)\
        .sort_values('eig_val', ascending=False)

    EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)

    # Select top k components. EIG_PAIRS is already ordered by eigenvalue, so
    # it is NOT re-sorted on the rounded `exp_var`, where ties could shuffle
    # the component order.
    COMPS = EIG_PAIRS.head(k).reset_index(drop=True)
    COMPS.index = [f"PC{i}" for i in range(len(COMPS))]
    COMPS.index.name = 'pc_id'

    # Create LOADINGS (terms x components)
    LOADINGS = COMPS[COV.index].T
    LOADINGS.index.name = 'term_str'

    # Create DCM (documents x components)
    DCM = tfidf.dot(LOADINGS)

    # Create COMPINF. Iterate over the components that actually exist rather
    # than range(k), which fails when k exceeds the number of terms.
    pc_ids = LOADINGS.columns.tolist()
    top_terms = []
    for pc in pc_ids:
        # j == 0 -> descending sort (highest loadings), j == 1 -> ascending (lowest)
        for j in [0, 1]:
            ranked = LOADINGS.sort_values(pc, ascending=bool(j)).head(10)
            comp_str = ' '.join(map(str, ranked.index.to_list()))
            top_terms.append((pc, j, comp_str))
    COMPINF = pd.DataFrame(top_terms).set_index([0,1]).unstack()
    COMPINF.columns = COMPINF.columns.droplevel(0)
    COMPINF = COMPINF.rename(columns={0:'pos', 1:'neg'})
    # Restore component order (guards against PC10 sorting before PC2)
    COMPINF = COMPINF.reindex(pc_ids)
    COMPINF.index.name = 'comp_id'

    return (LOADINGS, DCM, COMPINF)

In [3]:
def vis_pcs(M, a, b, label='artist', hover_name='album_title', symbol=None, size=None):
    '''
    Scatter-plot two principal components of a document-component table.

    PARAMETERS:

    M - pandas `DataFrame` holding `PC{a}` and `PC{b}` columns plus the `artist`, `album_title` and `genre` columns shown on hover

    a - int id of the component plotted on the x axis

    b - int id of the component plotted on the y axis

    label - column name used to color the points (defaults to 'artist')

    hover_name - column name shown as the hover title (defaults to 'album_title')

    symbol - optional column name used to pick marker symbols

    size - optional column name used to scale marker sizes


    OUTPUTS:

    The figure returned by `px.scatter`, with box plots in both margins
    '''
    # hover_name was previously accepted but never forwarded to plotly
    return px.scatter(M, f"PC{a}", f"PC{b}", color=label, hover_name=hover_name,
                     hover_data=['artist', 'album_title', 'genre'],
                     symbol=symbol, size=size,
                     marginal_x='box', marginal_y ='box', height=800)

In [4]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    '''
    Scatter-plot the term loadings of two principal components.

    Reads the module-level `LOADINGS` and `VOCAB` tables; `VOCAB` must supply
    the `i` column (marker size) and the `max_pos` column (marker color).

    PARAMETERS:

    a - int id of the component plotted on the x axis (defaults to 0)

    b - int id of the component plotted on the y axis (defaults to 1)

    hover_name - column name shown as the hover title (defaults to 'term_str')


    OUTPUTS:

    The figure returned by `px.scatter`, with box plots in both margins
    '''
    X = LOADINGS.join(VOCAB)
    # reset_index() turns the `term_str` index into a column so it can be used
    # for the text labels and the hover title.
    # hover_name was previously accepted but never forwarded to plotly.
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}",
                      text='term_str', size='i', color='max_pos',
                      hover_name=hover_name,
                      marginal_x='box', marginal_y ='box', height=800)

## Read Data

In [5]:
# All tables are pipe-delimited files under ../tables/; each is indexed as it is loaded.

# --- Library, corpus and vocabulary tables ---
LIB = (
    pd.read_csv('../tables/LIB.csv', sep='|')
    .set_index('album_id')
)
# NOTE(review): this path has no `.csv` extension, unlike every other table — confirm the file name.
SONG_LIB = (
    pd.read_csv('../tables/SONG_LIB', sep='|')
    .set_index(['album_id', 'song_num'])
)
CORPUS = (
    pd.read_csv('../tables/CORPUS.csv', sep='|')
    .set_index(OHCO)
)
VOCAB = (
    pd.read_csv('../tables/VOCAB.csv', sep='|')
    .set_index('term_str')
)

# --- Song-level bags ---
BOW_SONG = (
    pd.read_csv('../tables/BOW_SONG.csv', sep='|')
    .set_index(['album_id', 'song_num', 'term_str'])
)
DTCM_SONG = (
    pd.read_csv('../tables/DTCM_SONG.csv', sep='|')
    .set_index(OHCO[:2])
)
TFIDF_SONG = (
    pd.read_csv('../tables/TFIDF_SONG.csv', sep='|')
    .set_index(OHCO[:2])
)
TFIDF_L2_SONG = (
    pd.read_csv('../tables/TFIDF_L2_SONG.csv', sep='|')
    .set_index(['album_id', 'song_num'])
)

# --- Album-level bags ---
BOW_ALBUM = (
    pd.read_csv('../tables/BOW_ALBUM.csv', sep='|')
    .set_index(['album_id', 'term_str'])
)
DTCM_ALBUM = (
    pd.read_csv('../tables/DTCM_ALBUM.csv', sep='|')
    .set_index(OHCO[:1])
)
TFIDF_ALBUM = (
    pd.read_csv('../tables/TFIDF_ALBUM.csv', sep='|')
    .set_index(OHCO[:1])
)
TFIDF_L2_ALBUM = (
    pd.read_csv('../tables/TFIDF_L2_ALBUM.csv', sep='|')
    .set_index(['album_id'])
)

### Set column indices where relevant

In [6]:
# Label the column axis of every document-term table as `term_str`.
for _table in (
    DTCM_SONG,
    TFIDF_SONG,
    TFIDF_L2_SONG,
    DTCM_ALBUM,
    TFIDF_ALBUM,
    TFIDF_L2_ALBUM,
):
    _table.columns.name = 'term_str'
del _table  # do not leave the loop variable behind in the notebook namespace

### Remove NULLs from `TFIDF_L2_ALBUM`

In [7]:
# Drop every album row that contains a NULL and report how many rows were lost.
a = len(TFIDF_L2_ALBUM)  # row count before dropping
TFIDF_L2_ALBUM = TFIDF_L2_ALBUM.dropna()
b = len(TFIDF_L2_ALBUM)  # row count after dropping
bag_loss = a - b  # number of albums removed
bag_loss

0

## Generate `LOADINGS`, `DCM`, and `COMPINF`

In [8]:
# Run PCA on the album-level table, keeping the top 10 components.
# norm_docs=False: presumably the table is already L2-normalized (per its name) — confirm upstream.
LOADINGS, DCM, COMPINF = get_PCA(TFIDF_L2_ALBUM, k=10, norm_docs=False, center_by_mean=False, center_by_variance=False)

In [9]:
# Preview the first 10 rows of LOADINGS, shaded with the `colors` colormap.
LOADINGS.head(10).style.background_gradient(cmap=colors)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60s,0.001161,0.00135,-0.001924,0.00378,-0.013955,0.004874,-0.010603,0.006898,-0.001502,0.004925
aah,-0.006225,0.001044,-0.002063,-0.015783,-0.007398,0.005174,0.009828,-0.004204,-0.008727,-0.007666
aback,0.000792,-0.022508,0.006452,0.016714,0.006291,0.016884,0.0068,0.002915,-0.004847,-0.007295
abilities,0.010436,-0.007346,0.01031,0.014582,-0.004165,-0.016953,-0.002229,-0.038231,-0.010688,0.018124
ability,-0.001217,-0.00806,-0.001272,0.003197,0.001544,0.001552,0.000925,-0.003343,0.00374,-0.00324
aboard,0.002737,-0.003798,-0.001693,-0.000548,-0.001427,-0.000159,-0.003018,-8.5e-05,-0.000784,0.003462
absence,0.004602,0.010851,0.00238,0.015988,0.013918,-0.011807,0.023858,-0.017032,0.011258,0.01348
absentee,0.005301,-0.004833,0.016498,0.004272,-0.006411,-0.02127,-0.007195,-0.014666,0.017599,0.015589
abundance,-0.001979,0.002327,-0.001318,0.001002,0.000855,0.000366,-0.001302,0.002902,0.000702,-0.000886
abuse,0.000228,-0.00341,-0.01437,-0.011454,-0.004496,-0.005223,-0.003106,-0.006624,0.008319,0.008884


In [10]:
# The 10 terms with the highest PC0 loadings.
LOADINGS.sort_values('PC0', ascending=False).head(10)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mystery,0.212383,0.25162,0.074624,0.029863,0.063679,0.15067,-0.153024,-0.101521,0.038977,-0.026099
step,0.189536,0.030983,-0.138239,-0.03467,0.214149,0.222779,-0.153678,0.018788,0.033889,0.106309
hes,0.17021,-0.296107,-0.176265,-0.195835,-0.079508,0.13131,-0.002314,0.111191,0.00566,-0.033078
lie,0.161953,0.209978,0.193034,0.029838,0.044935,0.10483,0.274256,0.124084,0.257525,-0.348062
everything,0.137539,0.007595,0.06128,0.178636,-0.095048,0.077513,0.042253,-0.035101,-0.083146,0.078901
body,0.118054,0.070059,-0.053328,0.001041,0.05689,0.027937,0.020363,0.064475,-0.039161,0.058557
cry,0.116408,0.061203,0.081829,-0.051656,-0.051733,-0.088338,0.039831,0.076309,0.006443,-0.016588
touch,0.111292,0.002326,-0.018526,-0.068139,-0.018224,0.012962,-0.000792,0.01104,0.008373,0.024564
phone,0.107106,-0.083106,-0.051459,-0.075402,0.085081,-0.006152,-0.130728,-0.000468,-0.021086,0.006894
side,0.102851,-0.003321,0.045866,-0.025523,0.025945,-0.010491,-0.026386,0.039161,0.023473,-0.030606


In [11]:
# The 10 terms with the lowest PC0 loadings (tail of the descending sort).
LOADINGS.sort_values('PC0', ascending=False).tail(10)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
raindrops,-0.085434,-0.010348,-0.009228,0.032423,-0.017937,-0.024238,-0.028918,0.005162,0.061599,-0.013204
parallels,-0.08557,0.023075,-0.05237,0.013112,0.015598,-0.030297,-0.001047,0.057175,0.024949,-0.015581
afternoon,-0.085626,0.080315,-0.027049,-0.034869,0.026357,-0.035132,-0.023869,-0.029921,-0.029412,-0.179206
depression,-0.090029,-0.015129,-0.0583,0.004757,-0.026404,0.012881,-0.022559,-0.047071,0.051123,-0.032591
spinnin,-0.10105,0.022568,-0.044268,0.024699,-0.069196,0.019347,-0.040148,-0.005553,0.100516,-0.070569
pow,-0.106238,0.054639,0.000857,-0.000392,-0.018099,0.054101,-0.01496,-0.024068,0.032423,0.001375
bam,-0.117028,0.002871,-0.073714,0.014792,-0.021907,0.028203,-0.00011,-0.012301,0.07144,-0.033464
lovers,-0.117217,-0.04772,0.179191,-0.038904,0.229525,-0.220664,-0.148871,0.237243,-0.050838,0.066961
boys,-0.120326,0.044924,0.208416,-0.208413,0.042215,0.112461,0.047077,-0.005252,-0.081273,0.120106
pills,-0.235144,0.100888,0.264997,-0.328988,-0.033903,0.31874,0.090923,-0.062393,-0.122554,0.254395


In [12]:
# Preview the first rows of the document-component matrix.
DCM.head()

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.327237,0.155055,-0.075503,0.153114,0.146368,0.099176,-0.144137,-0.302251,-0.092195,0.175566
1,-0.067423,-0.048358,0.048669,0.048814,0.045173,-0.039395,0.080481,0.014807,0.054727,-0.024895
2,0.189308,-0.007182,-0.139214,-0.08867,0.185391,0.204796,-0.245296,-0.044979,0.017949,0.18235
3,0.064594,0.064481,-0.05631,0.150398,-0.056433,-0.023444,0.159831,-0.029175,-0.410531,0.18087
4,-0.013808,-0.091058,-0.067843,-0.046471,0.085919,0.00654,-0.04953,-0.099912,0.131244,0.13926


In [13]:
# Preview the last rows of the document-component matrix.
DCM.tail()

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
84,0.24074,0.133332,0.024988,-0.020805,-0.100225,-0.062403,0.120406,-0.02419,-0.109846,0.055612
85,0.400553,0.263824,0.054011,-0.039612,0.008537,0.120739,-0.035191,-0.008386,-0.03472,0.120538
86,0.243886,0.121508,-0.01238,-0.125637,-0.047565,-0.178509,0.014471,0.190492,-0.081725,0.077298
87,0.192364,-0.108041,0.036009,-0.105215,0.086006,-0.148821,-0.173184,0.093399,-0.036535,0.057289
88,0.395193,-0.026462,-0.038079,-0.152631,0.028902,-0.146622,-0.013803,-0.064694,0.036675,0.24299


In [14]:
# Preview the top positive / negative terms of the first components.
COMPINF.head()

1,pos,neg
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PC0,mystery step hes lie everything body cry touch...,pills boys lovers bam pow spinnin depression a...
PC1,mystery lie girls moment sea pills arms space ...,someones denial hes round hope work matter wor...
PC2,pills boys mine lie girls lovers days gun hope...,daddys hes step baby guns moment babe arms bro...
PC3,walkin denial everything moment round discover...,pills daddys boys hes lives cartilage brother ...
PC4,lovers step moment stone lives denial cards le...,walkin soul beast fits nobody men daddys crime...


## Join album metadata (`LIB`) onto the document-component matrix

In [15]:
# Attach the LIB columns to each row of DCM (joined on the shared index).
# NOTE: DCM is overwritten, so re-running this cell joins LIB onto an already-joined table.
DCM = DCM.join(LIB)
DCM.head()

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,mean_loudness,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,album_term_count,album_character_count,genre
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.327237,0.155055,-0.075503,0.153114,0.146368,0.099176,-0.144137,-0.302251,-0.092195,0.175566,...,-5.81,0.07405,0.177,0.6725,0.105,0.456,104.0085,634,2533,alternative dance
1,-0.067423,-0.048358,0.048669,0.048814,0.045173,-0.039395,0.080481,0.014807,0.054727,-0.024895,...,-11.596818,0.034264,0.673291,0.564455,0.158773,0.203018,125.244,1661,6626,alternative rock
2,0.189308,-0.007182,-0.139214,-0.08867,0.185391,0.204796,-0.245296,-0.044979,0.017949,0.18235,...,-10.276273,0.039882,0.324571,0.433091,0.214909,0.329773,119.461091,1646,6795,art pop
3,0.064594,0.064481,-0.05631,0.150398,-0.056433,-0.023444,0.159831,-0.029175,-0.410531,0.18087,...,-6.816615,0.035585,0.284209,0.23967,0.125769,0.535831,125.053308,3682,14693,chamber pop
4,-0.013808,-0.091058,-0.067843,-0.046471,0.085919,0.00654,-0.04953,-0.099912,0.131244,0.13926,...,-9.2808,0.04187,0.520905,0.45169,0.12299,0.22138,113.4376,1242,4976,alternative rock


In [16]:
# Albums on PC0 (x) vs PC1 (y), colored by artist.
vis_pcs(DCM, 0, 1, label='artist')

In [17]:
# Term loadings on PC0 (x) vs PC1 (y).
vis_loadings(0, 1)

In [18]:
# Albums on PC1 (x) vs PC2 (y), colored by artist.
vis_pcs(DCM, 1, 2, label='artist')

In [19]:
# Term loadings on PC1 (x) vs PC2 (y).
vis_loadings(1, 2)

## Save Tables

In [20]:
# Write the three PCA tables as pipe-delimited files under ../tables/.
LOADINGS.to_csv('../tables/LOADINGS_ALBUM.csv', sep='|')
# DCM was joined with LIB above, so the saved file also carries the LIB columns.
DCM.to_csv('../tables/DCM_ALBUM.csv', sep='|')
COMPINF.to_csv('../tables/COMPINF_ALBUM.csv', sep='|')