# Principal Component Analysis

## 0.0 Import Relevant Libraries

In [73]:
# the two must have libraries in any data science project 
import pandas as pd 
import numpy as np


from sklearn.decomposition import PCA
from scipy.linalg import norm
from scipy.linalg import eigh
import plotly_express as px
import seaborn as sns

## 0.1 Import data and Define Global Variables

In [97]:
# relative path to where data will be stored. 
data_dir = "datasets/"
# relative path to our visualization directory
vis_dir = "visualizations/"

OHCO = ['track.id','section_num','line_num','token_num'] # define our object heiarchy 

## Load our tables

In [75]:
VOCAB_english = pd.read_csv(data_dir+'VOCAB_english.csv',index_col='term_str')
VOCAB_english

Unnamed: 0_level_0,n,p,i,n_chars,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,122130,5.288865e-02,4.240898,3,DT
of,61190,2.649846e-02,5.237948,2,IN
and,61158,2.648460e-02,5.238702,3,CC
to,60007,2.598616e-02,5.266113,2,TO
a,43364,1.877887e-02,5.734746,1,DT
...,...,...,...,...,...
tumbado,1,4.330521e-07,21.138956,7,NNP
natanael,1,4.330521e-07,21.138956,8,NNP
tracklist,1,4.330521e-07,21.138956,9,NNP
moonbyul,1,4.330521e-07,21.138956,8,NNP


In [76]:
TOKENS_english = pd.read_csv(data_dir+'TOKENS_english.csv')
TOKENS_english

Unnamed: 0,track.id,section_num,line_num,token_num,pos_tuple,pos,token_str,term_str
0,00Mb3DuaIH1kjrwOku9CGU,1,1,0,"('He', 'PRP')",PRP,He,he
1,00Mb3DuaIH1kjrwOku9CGU,1,1,1,"('was', 'VBD')",VBD,was,was
2,00Mb3DuaIH1kjrwOku9CGU,1,1,2,"('a', 'DT')",DT,a,a
3,00Mb3DuaIH1kjrwOku9CGU,1,1,3,"('boy', 'NN')",NN,boy,boy
4,00Mb3DuaIH1kjrwOku9CGU,1,1,4,"('she', 'PRP')",PRP,she,she
...,...,...,...,...,...,...,...,...
2309186,7zb1n2Rdmh5arJ3WOp2Z5c,2,4,4,"('of', 'IN')",IN,of,of
2309187,7zb1n2Rdmh5arJ3WOp2Z5c,2,4,5,"('salvation', 'NN')",NN,salvation,salvation
2309188,7zb1n2Rdmh5arJ3WOp2Z5c,2,4,6,"('take', 'VB')",VB,take,take
2309189,7zb1n2Rdmh5arJ3WOp2Z5c,2,4,7,"('me', 'PRP')",PRP,me,me


In [77]:
LIB_english = pd.read_csv(data_dir+'LIB_english.csv').set_index('track.id')
LIB_english

Unnamed: 0_level_0,added_at,track.album.album_group,track.album.album_type,track.album.artists,track.album.available_markets,track.album.external_urls.spotify,track.album.href,track.album.id,track.album.images,track.album.is_playable,...,track.popularity,track.preview_url,track.track_number,track.type,track.uri,genre,artist,song,language,language_name
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7HdXRMw14roDx2a0COWk3M,2023-04-11T15:06:37Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/6tG8sCK4htJOLjl...,https://api.spotify.com/v1/albums/6tG8sCK4htJO...,6tG8sCK4htJOLjlWwb7gZB,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,74,https://p.scdn.co/mp3-preview/41728b2155b6603b...,1,track,spotify:track:7HdXRMw14roDx2a0COWk3M,"['candy pop', 'modern rock', 'pixie', 'pop', '...",Paramore,This Is Why,en,English
4OmFmE0fzcMG6g0Y8p4eSD,2023-04-10T01:13:38Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/6kZ42qRrzov54Lc...,https://api.spotify.com/v1/albums/6kZ42qRrzov5...,6kZ42qRrzov54LcAk4onW9,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,70,https://p.scdn.co/mp3-preview/63cfaaef1a487995...,22,track,spotify:track:4OmFmE0fzcMG6g0Y8p4eSD,['pop'],Taylor Swift,Better Man (Taylor's Version) (From The Vault),en,English
5ZjFa8NE9MHKBPNefxIh88,2023-04-09T07:18:17Z,single,single,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/2OZO8I4Z79M8VN2...,https://api.spotify.com/v1/albums/2OZO8I4Z79M8...,2OZO8I4Z79M8VN2H0wgjEp,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,66,https://p.scdn.co/mp3-preview/eaf69b89d382a2e1...,2,track,spotify:track:5ZjFa8NE9MHKBPNefxIh88,[],FYLOW,Give It To Me - Sped Up Version,en,English
171SFtWNviZ7Tp10zxNdpN,2023-04-09T02:07:04Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/4aJQ9c9XNSJ9eiC...,https://api.spotify.com/v1/albums/4aJQ9c9XNSJ9...,4aJQ9c9XNSJ9eiCrmqH3S3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,9,https://p.scdn.co/mp3-preview/81fa22cf7f759d9c...,24,track,spotify:track:171SFtWNviZ7Tp10zxNdpN,"['american folk revival', 'folk', 'protest fol...",Woody Guthrie,Union Maid,en,English
0bCCMwWTaYOcQ4v8EeEYmd,2023-04-04T23:47:45Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/1kTlYbs28MXw7hw...,https://api.spotify.com/v1/albums/1kTlYbs28MXw...,1kTlYbs28MXw7hwO0NLYif,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,57,https://p.scdn.co/mp3-preview/260584b4f44684fa...,17,track,spotify:track:0bCCMwWTaYOcQ4v8EeEYmd,"['detroit hip hop', 'hip hop', 'pop', 'rap']",Eminem,Crazy In Love,en,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5Z01UMMf7V1o0MzF86s6WJ,2016-05-24T09:41:01Z,compilation,compilation,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/5qENHeCSlwWpEzb...,https://api.spotify.com/v1/albums/5qENHeCSlwWp...,5qENHeCSlwWpEzb25peRmQ,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,78,https://p.scdn.co/mp3-preview/f943c5b5954918a2...,6,track,spotify:track:5Z01UMMf7V1o0MzF86s6WJ,"['detroit hip hop', 'hip hop', 'pop', 'rap']",Eminem,"Lose Yourself - From ""8 Mile"" Soundtrack",en,English
4xkOaSrkexMciUUogZKVTS,2016-05-24T09:40:58Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",https://open.spotify.com/album/2cWBwpqMsDJC1ZU...,https://api.spotify.com/v1/albums/2cWBwpqMsDJC...,2cWBwpqMsDJC1ZUwz813lo,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,86,https://p.scdn.co/mp3-preview/646f0a143151af01...,18,track,spotify:track:4xkOaSrkexMciUUogZKVTS,"['detroit hip hop', 'hip hop', 'pop', 'rap']",Eminem,Till I Collapse,en,English
0pN1Ld0Fr9DTUaCGbKRvlF,2016-05-24T09:40:11Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,['MX'],https://open.spotify.com/album/63WdJvk8G9hxJn8...,https://api.spotify.com/v1/albums/63WdJvk8G9hx...,63WdJvk8G9hxJn8u5rswNh,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,29,https://p.scdn.co/mp3-preview/206a9910606c7df9...,10,track,spotify:track:0pN1Ld0Fr9DTUaCGbKRvlF,"['canadian hip hop', 'canadian pop', 'hip hop'...",Drake,Make Me Proud,en,English
7kfTqGMzIHFWeBeOJALzRf,2016-05-24T09:39:42Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,['MX'],https://open.spotify.com/album/63WdJvk8G9hxJn8...,https://api.spotify.com/v1/albums/63WdJvk8G9hx...,63WdJvk8G9hxJn8u5rswNh,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,38,https://p.scdn.co/mp3-preview/570ddea0adb2c0bf...,3,track,spotify:track:7kfTqGMzIHFWeBeOJALzRf,"['canadian hip hop', 'canadian pop', 'hip hop'...",Drake,Headlines,en,English


In [78]:
CORPUS_english = pd.read_csv(data_dir+'CORPUS_english.csv')
CORPUS_english

Unnamed: 0,artist,song,lyrics,language,language_name,song_id,track.id
0,Paramore,This Is Why,[Verse 1]\nIf you have an opinion\nMaybe you s...,en,English,7HdXRMw14roDx2a0COWk3M,7HdXRMw14roDx2a0COWk3M
1,Taylor Swift,Better Man (Taylor's Version) (From The Vault),[Verse 1]\nI know I'm probably better off on m...,en,English,4OmFmE0fzcMG6g0Y8p4eSD,4OmFmE0fzcMG6g0Y8p4eSD
2,FYLOW,Give It To Me - Sped Up Version,[Verse 1: ReyTheStinger]\nI was scrolling down...,en,English,5ZjFa8NE9MHKBPNefxIh88,5ZjFa8NE9MHKBPNefxIh88
3,Woody Guthrie,Union Maid,"There once was a union maid, she never was afr...",en,English,171SFtWNviZ7Tp10zxNdpN,171SFtWNviZ7Tp10zxNdpN
4,Eminem,Crazy In Love,[Intro: Heart (Sampled)]\nTell myself that I w...,en,English,0bCCMwWTaYOcQ4v8EeEYmd,0bCCMwWTaYOcQ4v8EeEYmd
...,...,...,...,...,...,...,...
1442,Eminem,"Lose Yourself - From ""8 Mile"" Soundtrack",1. Eminem- Higher\n2. Eminem- Gnat\n3. Eminem-...,en,English,5Z01UMMf7V1o0MzF86s6WJ,5Z01UMMf7V1o0MzF86s6WJ
1443,Eminem,Till I Collapse,[Intro: Eminem]\n'Cause sometimes you just fee...,en,English,4xkOaSrkexMciUUogZKVTS,4xkOaSrkexMciUUogZKVTS
1444,Drake,Make Me Proud,[Verse 1: Drake]\nI like a woman with a future...,en,English,0pN1Ld0Fr9DTUaCGbKRvlF,0pN1Ld0Fr9DTUaCGbKRvlF
1445,Drake,Headlines,[Verse 1]\nI might be too strung out on compli...,en,English,7kfTqGMzIHFWeBeOJALzRf,7kfTqGMzIHFWeBeOJALzRf


## 1.0 Define some helper functions

In [79]:
def makeBOW(CORPUS,bag_lvl) -> pd.DataFrame:
    """
    Takes a DataFrame that has been tokenized and has a vocabulary
    column named 'term_str'. Returns a dataframe of the word count 
    for each word in 'bag_lvl'. 

    Args:
        CORPUS (pd.DataFrame): DataFrame with a 'term_str' column
            and columns for different parsed levels of the corpus. 
        bag (int): Level of the desired BOW. Based on the 
            index of CORPUS. 

    Returns:
        pd.DataFrame: A DataFrame with a count of 
        how many times each term appeared in the specified 
        level. 
    """
    bag = CORPUS.columns[:bag_lvl].tolist()
    bag.append('term_str')
    BOW = CORPUS.groupby(bag).term_str.count().to_frame('n')
    
    return BOW

In [80]:
def makeTFIDF(BOW,tf_method='max',idf_method='standard'):
    
    
    
    # Create Document-Term Count Matrix from BOW
    DTCM = BOW.n.unstack().fillna(0).astype('int')
    
    N = DTCM.shape[0] # Number of documents/bags in the BOW
    
    # Compute TF
    print('TF method:', tf_method)
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log':
        TF = np.log2(1 + DTCM.T)
    elif tf_method == 'raw':
        TF = DTCM.T
    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    TF = TF.T
    
    # Compute DF
    DF = DTCM.astype('bool').sum()
    
    # Compute IDF
    print('IDF method:', idf_method)
    if idf_method == 'standard':
        IDF = np.log2(N / DF)
    elif idf_method == 'max':
        IDF = np.log2(DF.max() / DF) 
    elif idf_method == 'smooth':
        IDF = np.log2((1 + N) / (1 + DF)) + 1

    TFIDF = TF * IDF
    
    # Added a calculation for the DFIDF
    DFIDF = DF*IDF
    
    return TF,DF, IDF, TFIDF,DFIDF

In [81]:
def vis_pcs(M, a, b, label='author', hover_name='doc', symbol=None, size=None):
    fig = px.scatter(M, f"PC{a}", f"PC{b}", color=label, hover_name=hover_name, 
                     symbol=symbol, size=size,
                     marginal_x='box', height=800)
    fig.show()

In [82]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    X = LOADINGS.join(VOCAB)
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}", 
                      text='term_str', size='i', color='max_pos', 
                      marginal_x='box', height=800)

In [83]:
def computePCA(X,k,norm_docs,center_by_mean,center_by_variance,LIB):
    if norm_docs == True:
        X = (X.T / norm(X, 2, axis=1)).T
    if center_by_mean == True:
        X = X - X.mean()
    if center_by_variance == True:
        X = (X - X.mean()) / X.std()

    COV = X.T.dot(X) / (X.shape[0] - 1)
    

    
    eig_vals, eig_vecs = eigh(COV)
    EIG_VEC = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
    EIG_VAL = pd.DataFrame(eig_vals, index=COV.index, columns=['eig_val'])
    EIG_VAL.index.name = 'term_str'
    EIG_VEC_PAIRS = EIG_VEC.stack().sort_values(ascending=False).to_frame('covariance')
    EIG_VEC_PAIRS.index.names = ['term1', 'term2']
    EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)
    EIG_PAIRS.sort_values('eig_val', ascending=False).head(10)
    EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)
    COMPS = EIG_PAIRS.sort_values('exp_var', ascending=False).head(k).reset_index(drop=True)
    COMPS.index.name = 'comp_id'
    COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
    COMPS.index.name = 'pc_id'
    LOADINGS = COMPS[COV.index].T
    LOADINGS.index.name = 'term_str'
    DCM = X.dot(COMPS[COV.index].T)
    DCM = DCM.join(LIB[LIB_COLS], on='track.id')
    DCM['doc'] = DCM.apply(lambda x: f"{x.song} {str(x.artist[1]).zfill(2)}", 1)
    
    
    return LOADINGS, DCM, COMPS
    

## 2.0

## 2.1 Generate a Bag of Words

In [84]:
BOW = makeBOW(TOKENS_english,1)
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n
track.id,term_str,Unnamed: 2_level_1
00Mb3DuaIH1kjrwOku9CGU,a,14
00Mb3DuaIH1kjrwOku9CGU,about,2
00Mb3DuaIH1kjrwOku9CGU,after,2
00Mb3DuaIH1kjrwOku9CGU,all,3
00Mb3DuaIH1kjrwOku9CGU,alone,1
...,...,...
7zb1n2Rdmh5arJ3WOp2Z5c,to,1
7zb1n2Rdmh5arJ3WOp2Z5c,when,1
7zb1n2Rdmh5arJ3WOp2Z5c,which,1
7zb1n2Rdmh5arJ3WOp2Z5c,witness,1


## 2.2 Compute TFIDF and VOCAB['dfidf'] for the CORPUS using the following parameters:

In [85]:
# tf_method = 'max'
# idf_method = 'standard'
TF, DF, IDF, TFIDF, DFIDF = makeTFIDF(BOW,tf_method='max',idf_method='standard')

TF method: max
IDF method: standard


## 2.3 Create a DOC table from the TFIDF index in which each row represents a bag, i.e. a chapter

In [86]:
LIB_COLS = ['artist','song','genre','language_name']
LIB_english[LIB_COLS]

Unnamed: 0_level_0,artist,song,genre,language_name
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7HdXRMw14roDx2a0COWk3M,Paramore,This Is Why,"['candy pop', 'modern rock', 'pixie', 'pop', '...",English
4OmFmE0fzcMG6g0Y8p4eSD,Taylor Swift,Better Man (Taylor's Version) (From The Vault),['pop'],English
5ZjFa8NE9MHKBPNefxIh88,FYLOW,Give It To Me - Sped Up Version,[],English
171SFtWNviZ7Tp10zxNdpN,Woody Guthrie,Union Maid,"['american folk revival', 'folk', 'protest fol...",English
0bCCMwWTaYOcQ4v8EeEYmd,Eminem,Crazy In Love,"['detroit hip hop', 'hip hop', 'pop', 'rap']",English
...,...,...,...,...
5Z01UMMf7V1o0MzF86s6WJ,Eminem,"Lose Yourself - From ""8 Mile"" Soundtrack","['detroit hip hop', 'hip hop', 'pop', 'rap']",English
4xkOaSrkexMciUUogZKVTS,Eminem,Till I Collapse,"['detroit hip hop', 'hip hop', 'pop', 'rap']",English
0pN1Ld0Fr9DTUaCGbKRvlF,Drake,Make Me Proud,"['canadian hip hop', 'canadian pop', 'hip hop'...",English
7kfTqGMzIHFWeBeOJALzRf,Drake,Headlines,"['canadian hip hop', 'canadian pop', 'hip hop'...",English


In [87]:
DOC = pd.merge(pd.DataFrame(index=TFIDF.index),LIB_english[LIB_COLS],how='outer',on=['track.id'])
DOC = DOC.set_index(TFIDF.index)
DOC

Unnamed: 0_level_0,artist,song,genre,language_name
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00Mb3DuaIH1kjrwOku9CGU,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English
017PF4Q3l4DBUiWoXk4OWT,Dua Lipa,Break My Heart,"['dance pop', 'pop', 'uk pop']",English
01JszWYuKqRjddPQ4creVF,Metallica,Hardwired,"['hard rock', 'metal', 'old school thrash', 'r...",English
01oPNCtTniFT3YM4K3ksTf,Chiddy Bang,Opposite of Adults,"['indie pop rap', 'philly rap', 'pop rap']",English
01pOtDU5YHWbxuNBzlRUem,Linkin Park,Waiting for the End,"['alternative metal', 'modern rock', 'nu metal...",English
...,...,...,...,...
7xuhVUJmBpHR276Yc7AsgW,Sum 41,Walking Disaster,"['alternative metal', 'canadian pop punk', 'ca...",English
7yBEKiv1LmO7GM4rOejhcW,ThxSoMch,SPIT IN MY FACE!,[],English
7yoBjKO6cCnK3zV8gr1k4e,Highly Suspect,Lydia,"['modern alternative rock', 'modern rock', 'po...",English
7zQ5nqAKKfk0gtBgV70gyq,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English


## 1.5 Create a reduced version of the TFIDF table with only the top 1000 nouns (i.e. NN and NNS) in descending order of DFIDF.
***Do not "collapse" table -- keep the index as (book_id, chap_id).***

In [88]:
smallTFIDF = TFIDF
smallTFIDF

term_str,0,00,000,00000219,0001,001,002,003,004,006,...,zurp,zuu,zw1tch,zwaan,zwieback,zy,zyrius,zz,zzzzrrrrrr,zzzzzzs
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00Mb3DuaIH1kjrwOku9CGU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
017PF4Q3l4DBUiWoXk4OWT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01JszWYuKqRjddPQ4creVF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01oPNCtTniFT3YM4K3ksTf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01pOtDU5YHWbxuNBzlRUem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7xuhVUJmBpHR276Yc7AsgW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7yBEKiv1LmO7GM4rOejhcW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7yoBjKO6cCnK3zV8gr1k4e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7zQ5nqAKKfk0gtBgV70gyq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
relevant_pos = ['NN', 'NNS']
smallVOCAB = VOCAB_english[VOCAB_english['max_pos'].isin(relevant_pos)]
relevant_terms = list(smallVOCAB.index)
DFIDF_df = DFIDF.loc[relevant_terms].sort_values(ascending=False).to_frame("DFIDF")
TFIDF_cols = DFIDF_df.index[:1000]
smallTFIDF = TFIDF[TFIDF_cols]
smallTFIDF

term_str,cause,love,youre,yeah,way,time,life,night,day,gon,...,performance,twist,argument,concern,midst,salt,families,stress,flower,shelter
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00Mb3DuaIH1kjrwOku9CGU,0.000000,0.105265,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
017PF4Q3l4DBUiWoXk4OWT,0.398475,0.294742,0.000000,0.038399,0.000000,0.092580,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01JszWYuKqRjddPQ4creVF,0.000000,0.000000,0.000000,0.000000,0.384712,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01oPNCtTniFT3YM4K3ksTf,0.043909,0.044658,0.000000,0.139632,0.046632,0.000000,0.222675,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01pOtDU5YHWbxuNBzlRUem,0.115920,0.000000,0.000000,0.368629,0.000000,0.000000,0.220449,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7xuhVUJmBpHR276Yc7AsgW,0.025421,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7yBEKiv1LmO7GM4rOejhcW,0.000000,0.751303,0.081938,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7yoBjKO6cCnK3zV8gr1k4e,0.063000,0.192223,0.060563,0.000000,0.000000,0.000000,0.000000,0.085204,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7zQ5nqAKKfk0gtBgV70gyq,0.000000,0.000000,0.087059,0.000000,0.000000,0.385752,0.114817,0.122481,0.124689,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# check for NaNs

In [92]:
LOADINGS, DCM, COMPINF = computePCA(X=smallTFIDF,
                                    k=10,
                                    norm_docs=False,
                                    center_by_mean=False,
                                    center_by_variance=False,
                                    LIB=LIB_english)

In [93]:
LOADINGS

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cause,0.008903,-0.127864,0.016620,0.014607,-0.016566,-0.007346,-0.018684,-0.016594,0.017033,-0.038236
love,0.053381,-0.167584,0.039805,0.038407,-0.035441,0.007399,-0.087655,-0.006407,0.113499,-0.061329
youre,0.013890,-0.164386,0.037392,0.040879,-0.110301,0.023182,-0.122552,0.027999,0.015563,0.019335
yeah,0.014280,-0.255027,0.088667,-0.002659,-0.000005,0.019686,0.024461,0.216003,-0.229230,0.111833
way,0.014866,-0.115191,0.036113,0.015918,-0.049909,0.002281,-0.034567,0.028702,0.023542,-0.047460
...,...,...,...,...,...,...,...,...,...,...
salt,0.000231,-0.002198,-0.004572,-0.001536,-0.001936,0.000450,-0.000676,-0.000138,0.001536,-0.000417
families,0.000061,-0.000794,0.000258,-0.000322,0.000178,0.000114,-0.000345,-0.000646,0.000521,-0.000593
stress,0.000557,-0.008812,0.003255,-0.004491,0.001381,0.001412,-0.002908,-0.006417,-0.003439,-0.006724
flower,0.003395,-0.000769,0.000476,0.000560,-0.000748,-0.000102,0.000245,-0.000427,-0.000279,-0.000721


In [94]:
DCM

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,artist,song,genre,language_name,doc
track.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00Mb3DuaIH1kjrwOku9CGU,0.139046,-0.830505,0.299521,0.052544,-0.262367,-0.185968,0.388695,-0.373190,0.419629,0.041321,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
017PF4Q3l4DBUiWoXk4OWT,0.062760,-0.337810,0.086060,0.088183,-0.139878,0.011870,-0.111229,0.008455,0.045361,-0.161789,Dua Lipa,Break My Heart,"['dance pop', 'pop', 'uk pop']",English,Break My Heart 0u
01JszWYuKqRjddPQ4creVF,0.045318,-0.358160,0.097800,-0.111183,0.050826,0.025064,-0.085622,-0.126995,0.046928,-0.218442,Metallica,Hardwired,"['hard rock', 'metal', 'old school thrash', 'r...",English,Hardwired 0e
01oPNCtTniFT3YM4K3ksTf,0.082006,-0.367462,0.109061,-0.049127,0.068354,-0.007209,-0.050798,0.042133,0.029812,-0.119457,Chiddy Bang,Opposite of Adults,"['indie pop rap', 'philly rap', 'pop rap']",English,Opposite of Adults 0h
01pOtDU5YHWbxuNBzlRUem,0.054686,-0.414664,0.097152,0.028517,-0.111669,0.101154,-0.164463,0.062862,-0.034848,-0.122460,Linkin Park,Waiting for the End,"['alternative metal', 'modern rock', 'nu metal...",English,Waiting for the End 0i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7xuhVUJmBpHR276Yc7AsgW,0.016951,-0.105835,0.023531,0.022544,-0.044228,0.006460,-0.048527,-0.002909,-0.004504,-0.061619,Sum 41,Walking Disaster,"['alternative metal', 'canadian pop punk', 'ca...",English,Walking Disaster 0u
7yBEKiv1LmO7GM4rOejhcW,0.080325,-0.358106,0.104537,0.065126,-0.070622,0.007421,-0.144270,-0.018937,0.138021,-0.184490,ThxSoMch,SPIT IN MY FACE!,[],English,SPIT IN MY FACE! 0h
7yoBjKO6cCnK3zV8gr1k4e,0.062025,-0.297236,0.058548,0.061883,-0.116221,0.025090,-0.163584,-0.009258,0.055769,-0.141286,Highly Suspect,Lydia,"['modern alternative rock', 'modern rock', 'po...",English,Lydia 0i
7zQ5nqAKKfk0gtBgV70gyq,0.085256,-0.394695,0.064269,0.093907,-0.186615,0.035360,-0.196849,-0.014984,0.025775,-0.213562,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English,Forever 0a


In [95]:
COMPINF

Unnamed: 0_level_0,eig_val,cause,love,youre,yeah,way,time,life,night,day,...,twist,argument,concern,midst,salt,families,stress,flower,shelter,exp_var
pc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PC0,0.436115,0.008903,0.053381,0.01389,0.01428,0.014866,0.017286,0.022,0.014191,0.011143,...,0.003030637,0.000102,0.000248,4e-05,0.000231,6.1e-05,0.000557,0.003395,0.000927,6.36
PC1,0.19881,-0.127864,-0.167584,-0.164386,-0.255027,-0.115191,-0.134305,-0.098861,-0.099893,-0.086908,...,-0.00581807,-0.001799,-0.001911,-0.000459,-0.002198,-0.000794,-0.008812,-0.000769,-0.006126,2.9
PC2,0.132872,0.01662,0.039805,0.037392,0.088667,0.036113,0.028512,0.032272,0.032099,0.031524,...,0.002428524,0.000999,0.000672,0.000173,-0.004572,0.000258,0.003255,0.000476,-0.008974,1.94
PC3,0.093577,0.014607,0.038407,0.040879,-0.002659,0.015918,0.034174,0.001993,0.02132,-0.002397,...,3.746e-07,0.002923,0.000389,3.6e-05,-0.001536,-0.000322,-0.004491,0.00056,-0.002629,1.37
PC4,0.092356,-0.016566,-0.035441,-0.110301,-5e-06,-0.049909,-0.065248,-0.025986,-0.045037,-0.015174,...,0.0003716917,-0.004526,-0.001281,-0.0001,-0.001936,0.000178,0.001381,-0.000748,-0.004658,1.35
PC5,0.080935,-0.007346,0.007399,0.023182,0.019686,0.002281,0.018666,0.042946,0.005187,-0.004756,...,0.003855987,-0.001994,0.0004,8.2e-05,0.00045,0.000114,0.001412,-0.000102,0.002209,1.18
PC6,0.077004,-0.018684,-0.087655,-0.122552,0.024461,-0.034567,-0.068674,-0.073711,-0.066711,-0.014373,...,-0.002214136,0.007107,-0.002172,-0.000342,-0.000676,-0.000345,-0.002908,0.000245,-0.004109,1.12
PC7,0.067123,-0.016594,-0.006407,0.027999,0.216003,0.028702,0.001125,-0.021439,0.019925,-0.028596,...,8.029141e-06,-0.001168,-0.000616,-0.000147,-0.000138,-0.000646,-0.006417,-0.000427,0.001458,0.98
PC8,0.065823,0.017033,0.113499,0.015563,-0.22923,0.023542,0.003573,-0.008556,-0.007941,-0.0042,...,0.002289259,-8.2e-05,-0.00148,0.000299,0.001536,0.000521,-0.003439,-0.000279,0.000836,0.96
PC9,0.060754,-0.038236,-0.061329,0.019335,0.111833,-0.04746,-0.058696,-0.084814,-0.040939,-0.051958,...,-0.002691196,-0.000339,-0.001037,-0.000287,-0.000417,-0.000593,-0.006724,-0.000721,-0.005683,0.89


In [96]:
vis_pcs(DCM, 0, 1, label='artist')

In [67]:
DCM

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,artist,song,genre,language_name,doc
track.id,section_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
00Mb3DuaIH1kjrwOku9CGU,1,0.131923,-0.326016,-0.103259,-0.169635,-0.201329,0.000644,-0.220899,-0.226312,0.060950,0.035501,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
00Mb3DuaIH1kjrwOku9CGU,2,0.205433,-0.420741,-0.221027,-0.214332,-0.162459,-0.113252,-0.075658,-0.094694,-0.013252,-0.086434,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
00Mb3DuaIH1kjrwOku9CGU,3,0.153254,-0.487005,-0.110485,-0.305695,-0.201450,0.082401,-0.727555,-0.171860,0.123177,-0.087353,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
00Mb3DuaIH1kjrwOku9CGU,4,0.291758,-0.688119,-0.162958,-0.344112,-0.236291,-0.231256,-0.520870,-0.442523,0.013263,-0.356374,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
00Mb3DuaIH1kjrwOku9CGU,5,0.286072,-0.630110,-0.265492,-0.237562,-0.240705,-0.025237,-0.159399,-0.223477,0.084824,-0.008270,Avril Lavigne,Sk8er Boi,"['canadian pop', 'candy pop', 'dance pop', 'po...",English,Sk8er Boi 0v
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zQ5nqAKKfk0gtBgV70gyq,5,0.076499,-0.393445,-0.250056,-0.064176,0.110872,-0.282586,0.005937,0.777991,0.706711,0.294073,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English,Forever 0a
7zQ5nqAKKfk0gtBgV70gyq,6,0.070511,-0.240989,-0.074396,-0.154754,-0.096933,-0.082898,-0.044280,0.258712,0.071679,-0.087844,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English,Forever 0a
7zQ5nqAKKfk0gtBgV70gyq,7,0.068487,-0.347974,-0.223082,-0.067416,0.084851,-0.243624,0.007217,0.675676,0.589440,0.247581,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English,Forever 0a
7zQ5nqAKKfk0gtBgV70gyq,8,0.088516,-0.461652,-0.290517,-0.059318,0.149905,-0.341030,0.004016,0.931463,0.882618,0.363812,Papa Roach,Forever,"['alternative metal', 'nu metal', 'rap metal',...",English,Forever 0a
