# HW 07

```yaml
Course:   DS 5001
Module:   07 HW
Topic:    PCA from Scratch
Author:   Ryan Lipps
Date:     5 March 2023
```

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import plotly_express as px
import seaborn as sns
import configparser
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

sns.set(style='ticks')

### Config

In [2]:
colors = "YlGnBu" 

In [3]:
# Load machine-specific paths from the shared env.ini file.
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the path is wrong. Fail here
# with a clear message instead of a KeyError on the first lookup below.
if not config.read("../../../env.ini"):
    raise FileNotFoundError("Could not read configuration file '../../../env.ini'")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [4]:
OHCO = ['book_id','chap_id','para_num','sent_num','token_num']
data_prefix = 'novels/novels'

### Read files

In [5]:
LIB = pd.read_csv(f'{data_home}/{data_prefix}-LIB.csv').set_index('book_id')
CORPUS = pd.read_csv(f'{data_home}/{data_prefix}-CORPUS.csv').set_index(OHCO)

In [6]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


In [7]:
LIB.head()

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
secretadversary,d,christie
styles,d,christie
moonstone,d,collins
adventures,d,doyle
baskervilles,d,doyle


## Extract VOCAB from CORPUS

In [8]:
# Build the vocabulary table: one row per distinct term, ordered by term.
VOCAB = CORPUS\
    .term_str\
    .value_counts()\
    .to_frame('n')\
    .sort_index()
# Name the index itself. Assigning to `VOCAB.index_name` only attaches a
# stray attribute to the frame and leaves the index unnamed.
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
# Relative frequency of each term and its self-information in bits.
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
# For each term, the POS tag it is most often assigned in CORPUS.
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,28533,1,0.019017,5.716586,DT
aback,9,5,6e-06,17.347005,NN
abaft,2,5,1e-06,19.51693,IN
abandon,44,7,2.9e-05,15.057499,VB
abandoned,68,9,4.5e-05,14.429467,VBN


## Functions

In [9]:
def create_bow(corpus, bag, item_type='term_str'):
    """
    Collapse a token table into a bag-of-words table.

    PARAMETERS:

    corpus - pandas `DataFrame` of tokens; the names in `bag` and `item_type`
    must be resolvable by `groupby` (index levels or columns)

    bag - list of names that together identify one bag (document)

    item_type - name of the column holding the item to count (defaults to 'term_str')

    OUTPUTS:

    pandas `DataFrame` indexed by `bag + [item_type]` with a single column `n`
    holding the number of occurrences of each item in each bag
    """
    group_keys = bag + [item_type]
    occurrences = corpus.groupby(group_keys)[item_type].count()
    return occurrences.to_frame('n')

In [10]:
def get_tfidf(bow, tf_method='max', df_method='standard', item_type='term_str'):
    """
    Compute TFIDF and DFIDF from a bag-of-words table.

    PARAMETERS:

    bow - pandas `DataFrame` with a column `n` of counts; its last index level
    is unstacked into the columns of the document-term count matrix

    tf_method - one of 'sum', 'max', 'log', 'raw', 'bool' (defaults to 'max')

    df_method - one of 'standard', 'textbook', 'sklearn', 'sklearn_smooth'
    (defaults to 'standard')

    item_type - accepted for interface compatibility; not used by the computation

    OUTPUTS:

    tfidf - pandas `DataFrame` (documents x terms) of term frequency times IDF

    dfidf - pandas `Series` (one value per term) of document frequency times IDF

    RAISES:

    ValueError - if `tf_method` or `df_method` is not one of the listed names
    """
    # Document-term count matrix: one row per document, one column per term
    counts = bow.n.unstack(fill_value=0)

    # Each rule maps the count matrix to a term-frequency matrix
    tf_rules = {
        'sum': lambda m: (m.T / m.T.sum()).T,
        'max': lambda m: (m.T / m.T.max()).T,
        'log': lambda m: (np.log2(1 + m.T)).T,
        'raw': lambda m: m,
        'bool': lambda m: m.astype('bool').astype('int'),
    }
    try:
        weigh_terms = tf_rules[tf_method]
    except (KeyError, TypeError):
        raise ValueError(f"tf method {tf_method} not found.") from None
    tf = weigh_terms(counts)

    # Document frequency: number of documents each term appears in
    doc_freq = counts.astype('bool').sum()
    n_docs = len(counts)

    # Each rule produces the inverse document frequency for every term
    idf_rules = {
        'standard': lambda: np.log2(n_docs / doc_freq),
        'textbook': lambda: np.log2(n_docs / (doc_freq + 1)),
        'sklearn': lambda: np.log2(n_docs / doc_freq) + 1,
        'sklearn_smooth': lambda: np.log2((n_docs + 1) / (doc_freq + 1)) + 1,
    }
    try:
        invert_freq = idf_rules[df_method]
    except (KeyError, TypeError):
        raise ValueError(f"df method {df_method} not found.") from None
    idf = invert_freq()

    return tf * idf, doc_freq * idf

## Compute TFIDF and DFIDF

In [11]:
bag = ['book_id', 'chap_id']
tf_method = 'max'
idf_method = 'standard'
pos_list = ['NN', 'NNS']

In [12]:
TFIDF, DFIDF = get_tfidf(create_bow(CORPUS, bag), tf_method=tf_method, df_method=idf_method)
TFIDF.head()

Unnamed: 0_level_0,term_str,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,0.0,0.0,0.0,0.0,0.006493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
VOCAB['dfidf'] = DFIDF
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,28533,1,0.019017,5.716586,DT,0.0
aback,9,5,6e-06,17.347005,NN,46.368028
abaft,2,5,1e-06,19.51693,IN,8.321928
abandon,44,7,2.9e-05,15.057499,VB,98.408049
abandoned,68,9,4.5e-05,14.429467,VBN,124.513524


## Create DOC table from TFIDF Index

In [14]:
DOC = TFIDF.index.to_frame().drop(['book_id', 'chap_id'], axis=1)
DOC.head()

book_id,chap_id
adventures,1
adventures,2
adventures,3
adventures,4
adventures,5


In [15]:
DOC = DOC.join(LIB, on='book_id')
DOC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,genre_id,author_id
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1
adventures,1,d,doyle
adventures,2,d,doyle
adventures,3,d,doyle
adventures,4,d,doyle
adventures,5,d,doyle


## Create reduced TFIDF

### Get top 1000 nouns by DFIDF

In [16]:
sig_terms = list(VOCAB.query(f'max_pos in {pos_list}')\
                 .sort_values('dfidf', ascending=False)[:1000]\
                 .index)
sig_terms[:10]

['yours',
 'reply',
 'order',
 'curiosity',
 'memory',
 'company',
 'feelings',
 'opportunity',
 'book',
 'spirit']

In [17]:
VSHORT = VOCAB[VOCAB.max_pos.isin(pos_list)].sort_values('dfidf', ascending=False).head(1000)
VSHORT.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
yours,198,5,0.000132,12.887574,NN,169.835635
reply,184,5,0.000123,12.993368,NN,169.835635
order,227,5,0.000151,12.690382,NN,169.835635
curiosity,208,9,0.000139,12.81649,NN,169.835635
memory,208,6,0.000139,12.81649,NN,169.826129


### Create reduced TFIDF

In [18]:
TFIDF_RED = TFIDF[sig_terms]
TFIDF_RED.head()

Unnamed: 0_level_0,term_str,yours,reply,order,curiosity,memory,company,feelings,opportunity,book,spirit,...,humanity,rank,contempt,apprehensions,owner,lad,enquiry,bag,investigation,inclination
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,0.006454,0.0,0.003227,0.003227,0.0064,0.0064,0.0,0.0,0.003282,0.003282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007158,0.007158,0.0
adventures,2,0.009346,0.0,0.009346,0.0,0.006178,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00691,0.0,0.0,0.0,0.0
adventures,3,0.004089,0.008178,0.0,0.0,0.004054,0.008109,0.0,0.0,0.004159,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00907,0.0
adventures,4,0.002721,0.002721,0.005442,0.0,0.002698,0.0,0.0,0.002767,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.030176,0.0,0.0,0.006035,0.0
adventures,5,0.003043,0.0,0.003043,0.003043,0.003017,0.0,0.0,0.0,0.00619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00675,0.0,0.0


## PCA Function

In [19]:
def get_PCA(X:pd.DataFrame, k:int, norm_docs=True, center_by_mean=False, center_by_variance=False) -> tuple:
    '''
    Function to compute PCA on a given document-term matrix (e.g. TFIDF) by
    eigendecomposition of the term covariance matrix.

    PARAMETERS:

    X - pandas `DataFrame` (documents x terms) to perform PCA on

    k - int number of principal components to return; must be at least 1.
    If `k` exceeds the number of terms, all available components are returned

    norm_docs - bool (defaults to True) of whether to normalize document length. Uses L2 norm

    center_by_mean - bool (defaults to False) of whether to subtract each term's column mean.
    The covariance matrix is mean-centered by construction, so this does not change the
    components themselves; it changes the document coordinates in DCM

    center_by_variance - bool (defaults to False) of whether to scale each term column by
    its standard deviation (pandas `.std()`), giving every term unit variance

    
    OUTPUTS:

    LOADINGS - pandas `DataFrame` representing term-component matrix
    (index `term_str`, columns `PC0`, `PC1`, ... ordered by decreasing eigenvalue)

    DCM - pandas `DataFrame` representing document-component matrix

    COMPINF - pandas `DataFrame` representing component information: for each component,
    the 10 terms with the highest (`pos`) and lowest (`neg`) loadings, space-separated


    RAISES:

    SyntaxError - if both `center_by_mean` and `center_by_variance` are set
    (exception type kept for backward compatibility)

    ValueError - if `k` is less than 1
    '''

    # Input handling to ensure only one centering method is used
    if (center_by_mean and center_by_variance):
        raise SyntaxError('Cannot center by both mean and variance')

    if k < 1:
        raise ValueError('k must be at least 1')
    
    # Normalize docs
    if (norm_docs):
        tfidf = (X.T/norm(X, 2, axis=1)).T
    else:
        tfidf = X
    
    # Center by mean
    if (center_by_mean):
        tfidf = tfidf - tfidf.mean()

    # Scale by standard deviation
    if (center_by_variance):
        tfidf = tfidf / tfidf.std()
    
    # Compute variance-covariance matrix
    COV = tfidf.cov()

    # Eigendecomposition. `eigh` returns eigenvalues in ascending order with the
    # matching eigenvectors as columns, so reversing both gives decreasing order.
    eig_vals, eig_vecs = eigh(COV)

    # Select the top k components directly by eigenvalue. Ranking on a rounded
    # explained-variance percentage creates ties that an unstable sort can
    # reorder, which would mislabel the components.
    top_vecs = eig_vecs[:, ::-1][:, :k]
    pc_ids = ["PC{}".format(i) for i in range(top_vecs.shape[1])]

    # Create LOADINGS
    LOADINGS = pd.DataFrame(top_vecs, index=COV.index, columns=pc_ids)
    LOADINGS.index.name = 'term_str'
    LOADINGS.columns.name = 'pc_id'

    # Create DCM
    DCM = tfidf.dot(LOADINGS)

    # Create COMPINF. Iterating over the components actually produced keeps this
    # in step with LOADINGS when k exceeds the number of terms, and building the
    # frame directly keeps PC10+ in numeric rather than lexicographic order.
    COMPINF = pd.DataFrame(
        {
            'pos': [' '.join(LOADINGS[pc].sort_values(ascending=False).head(10).index.to_list())
                    for pc in pc_ids],
            'neg': [' '.join(LOADINGS[pc].sort_values(ascending=True).head(10).index.to_list())
                    for pc in pc_ids],
        },
        index=pc_ids,
    )
    COMPINF.index.name = 'comp_id'

    return (LOADINGS, DCM, COMPINF)

## Compute PCA on reduced TFIDF

In [20]:
LOADINGS, DCM, COMPINF = get_PCA(TFIDF_RED, k=10, norm_docs=True, center_by_mean=False, center_by_variance=False)

In [21]:
LOADINGS.head(10).style.background_gradient(cmap=colors)

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
yours,0.019484,0.016559,-0.007586,-0.003938,-0.017489,0.025233,0.003145,-0.008342,-0.007351,0.005823
reply,-0.009054,0.007702,-0.010562,-0.019532,-0.012423,0.016606,-0.011956,0.013342,-0.02603,0.002001
order,-0.001659,0.001893,-0.033437,0.018477,-0.008462,0.032114,0.010416,-0.015883,0.018002,-0.000714
curiosity,-0.018668,0.012928,-0.005848,-0.010643,-0.002937,0.02711,0.005271,-0.002335,-0.007111,-0.030269
memory,-0.001513,-0.001008,0.018553,-0.013331,-0.008593,0.014673,-0.033355,0.016084,0.01658,0.017187
company,-0.010051,0.05359,-0.013281,-0.003445,0.008685,-0.023951,0.023795,-0.027801,-0.000176,0.021655
feelings,-0.029131,0.052076,0.078224,0.055563,-0.024234,-0.044121,-0.00789,0.023293,-0.039397,-0.053027
opportunity,-0.010072,0.035271,-0.022545,0.008012,0.002072,0.016822,-0.033068,-0.002615,0.000291,-0.005002
book,0.028254,0.027507,-0.022275,-0.020534,0.031183,-0.005973,-0.05509,-0.027695,0.009454,-0.006336
spirit,-0.018543,0.000952,0.014793,0.048656,-0.032448,-0.008732,-0.003245,-0.049302,-0.024987,-0.040819


In [22]:
DCM.head()

Unnamed: 0_level_0,pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
adventures,1,0.212571,-0.045779,-0.162873,0.049097,0.227527,0.156082,0.097577,-0.035834,0.012959,-0.043538
adventures,2,0.270641,-0.081908,-0.106468,0.021494,0.240485,0.129025,0.089229,0.085405,0.013677,0.016776
adventures,3,0.267175,0.067158,-0.105874,0.028059,0.185466,0.174991,0.081713,0.017269,0.035685,0.026514
adventures,4,0.138235,0.018818,-0.097508,0.10066,0.178027,0.119705,0.182594,0.047739,0.193241,0.073266
adventures,5,0.161811,-0.056081,-0.083252,0.085868,0.181365,0.160296,0.022165,0.119963,0.042116,0.134668


In [23]:
COMPINF.head()

1,pos,neg
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PC0,thats youre shes cab lawyer girl doctor police...,chateau castle chamber woods convent apartment...
PC1,brother engagement father sister son letter da...,mountains woods sea rocks whilst castle road m...
PC2,chateau cottage woods sea mountains feelings s...,castle aunt chamber apartment lamp corridor se...
PC3,blood whilst child sleep monster misery soul t...,chateau thats aunt youre shes lawyer woods lad...
PC4,chateau cab dog inquiry letter police case mat...,thats youre shes guess youve girl oh castle th...


## Project Docs onto components

In [24]:
DCM = DCM.join(LIB, on='book_id')
DCM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,genre_id,author_id
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
adventures,1,0.212571,-0.045779,-0.162873,0.049097,0.227527,0.156082,0.097577,-0.035834,0.012959,-0.043538,d,doyle
adventures,2,0.270641,-0.081908,-0.106468,0.021494,0.240485,0.129025,0.089229,0.085405,0.013677,0.016776,d,doyle
adventures,3,0.267175,0.067158,-0.105874,0.028059,0.185466,0.174991,0.081713,0.017269,0.035685,0.026514,d,doyle
adventures,4,0.138235,0.018818,-0.097508,0.10066,0.178027,0.119705,0.182594,0.047739,0.193241,0.073266,d,doyle
adventures,5,0.161811,-0.056081,-0.083252,0.085868,0.181365,0.160296,0.022165,0.119963,0.042116,0.134668,d,doyle


### Add 'doc' column to DCM

In [25]:
DCM['doc'] = DCM.apply(lambda x: f"{x.author_id}: {str(x.name[0])} {str(x.name[1]).zfill(2)}", 1)

In [26]:
DCM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,genre_id,author_id,doc
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
adventures,1,0.212571,-0.045779,-0.162873,0.049097,0.227527,0.156082,0.097577,-0.035834,0.012959,-0.043538,d,doyle,doyle: adventures 01
adventures,2,0.270641,-0.081908,-0.106468,0.021494,0.240485,0.129025,0.089229,0.085405,0.013677,0.016776,d,doyle,doyle: adventures 02
adventures,3,0.267175,0.067158,-0.105874,0.028059,0.185466,0.174991,0.081713,0.017269,0.035685,0.026514,d,doyle,doyle: adventures 03
adventures,4,0.138235,0.018818,-0.097508,0.10066,0.178027,0.119705,0.182594,0.047739,0.193241,0.073266,d,doyle,doyle: adventures 04
adventures,5,0.161811,-0.056081,-0.083252,0.085868,0.181365,0.160296,0.022165,0.119963,0.042116,0.134668,d,doyle,doyle: adventures 05


## Graphing functions

In [27]:
def vis_pcs(M, a, b, label='author_id', hover_name='doc', symbol=None, size=None):
    """
    Scatter-plot documents on two principal components.

    PARAMETERS:

    M - pandas `DataFrame` containing columns `PC{a}` and `PC{b}` plus any
    columns named by `label`, `hover_name`, `symbol` and `size`

    a - component number plotted on the x axis

    b - component number plotted on the y axis

    label - column used to color the points (defaults to 'author_id')

    hover_name - column shown as the hover title (defaults to 'doc')

    symbol - optional column mapped to marker symbol

    size - optional column mapped to marker size

    OUTPUTS:

    The figure returned by `px.scatter`, 800 pixels high, with a box plot as
    the marginal on the x axis
    """
    x_axis = f"PC{a}"
    y_axis = f"PC{b}"
    scatter_options = dict(
        color=label,
        hover_name=hover_name,
        symbol=symbol,
        size=size,
        marginal_x='box',
        height=800,
    )
    return px.scatter(M, x=x_axis, y=y_axis, **scatter_options)

In [28]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    """
    Scatter-plot term loadings on two principal components.

    Reads the module-level `LOADINGS` and `VSHORT` tables at call time, so the
    plot reflects whatever `LOADINGS` currently holds (for example after PCA is
    recomputed with different settings).

    PARAMETERS:

    a - component number plotted on the x axis (defaults to 0)

    b - component number plotted on the y axis (defaults to 1)

    hover_name - column shown as the hover title (defaults to 'term_str')

    OUTPUTS:

    The figure returned by `px.scatter`, 800 pixels high, with terms labelled by
    `term_str`, sized by `i`, colored by `max_pos`, and a box plot as the
    marginal on the x axis
    """
    # Attach the vocabulary statistics (i, max_pos, ...) to each term's loadings
    X = LOADINGS.join(VSHORT)
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}", 
                      text='term_str', size='i', color='max_pos', 
                      hover_name=hover_name,
                      marginal_x='box', height=800)

# HW Questions

## Question 1:
Looking at the documents plotted against the first principal component (PC), which genre has the narrower range, i.e. the smaller distance between the minimum and maximum values? This can be seen using a box plot.

### Answer 1:
**Detective has a more narrow range**

In [29]:
fig = vis_pcs(DCM, 0, 1, label='genre_id')
fig

In [30]:
fig.write_image('q1_vis.pdf')

## Question 2:
Looking at the documents plotted against the first PC, which author has the highest absolute value, in terms of both mean and range? In other words, which author is farthest from 0? Again, the box plots of each author are useful here.

### Answer 2:
**Radcliffe has the highest absolute value for the first PC in terms of both mean and range**

In [31]:
fig = vis_pcs(DCM, 0, 1)
fig

In [32]:
fig.write_image('q2_vis.pdf')

## Question 3:
In the third PC, which author has, by far, the maximum range?

### Answer 3:
**Radcliffe has the largest range along the third PC**

In [33]:
fig = vis_pcs(DCM, 2, 1)
fig

In [34]:
fig.write_image('q3_vis.pdf')

## Question 4:
Looking at the loadings for the second PC, how would you characterize the opposition, based on the top three words at each pole?

### Answer 4:
**The opposition appears to be humans and human interaction vs. nature and setting**

In [35]:
fig = vis_loadings(1, 0)
fig

In [36]:
fig.write_image('q4_vis.pdf')

## Question 5:
Recompute the principal components with `center_by_variance` set to `True`. This will change the words that appear at the extremes of the first PC. Does this change your interpretation in the previous question?

### Answer 5:
**It does not change my interpretation of the previous question**

#### Recompute PCs

In [37]:
LOADINGS, _, _ = get_PCA(TFIDF_RED, k=10, norm_docs=True, center_by_mean=False, center_by_variance=True)

In [38]:
fig = vis_loadings(1, 0)
fig

In [39]:
fig.write_image('q5_vis.pdf')