# HW 07

```yaml
Course:   DS 5001
Module:   07 HW
Topic:    PCA from Scratch
Author:   Ryan Lipps
Date:     5 March 2023
```

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import plotly_express as px
import seaborn as sns
import configparser
from sklearn.decomposition import PCA
from scipy.linalg import norm, eigh

sns.set(style='ticks')

### Config

In [2]:
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [3]:
OHCO = ['book_id','chap_id','para_num','sent_num','token_num']
data_prefix = 'novels/novels'

### Read files

In [4]:
LIB = pd.read_csv(f'{data_home}/{data_prefix}-LIB.csv').set_index('book_id')
CORPUS = pd.read_csv(f'{data_home}/{data_prefix}-CORPUS.csv').set_index(OHCO)

In [5]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


In [6]:
LIB.head()

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
secretadversary,d,christie
styles,d,christie
moonstone,d,collins
adventures,d,doyle
baskervilles,d,doyle


## Extract VOCAB from CORPUS

In [7]:
VOCAB = CORPUS\
    .term_str\
    .value_counts()\
    .to_frame('n')\
    .sort_index()
VOCAB.index_name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,28533,1,0.019017,5.716586,DT
aback,9,5,6e-06,17.347005,NN
abaft,2,5,1e-06,19.51693,IN
abandon,44,7,2.9e-05,15.057499,VB
abandoned,68,9,4.5e-05,14.429467,VBN


## Functions

In [8]:
def create_bow(corpus, bag, item_type='term_str'):
    bow = corpus.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return bow

In [9]:
def get_tfidf(bow, tf_method='max', df_method='standard', item_type='term_str'):
            
    dtcm = bow.n.unstack(fill_value=0) # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        tf = (dtcm.T / dtcm.T.sum()).T
    elif tf_method == 'max':
        tf = (dtcm.T / dtcm.T.max()).T
    elif tf_method == 'log':
        tf = (np.log2(1 + dtcm.T)).T
    elif tf_method == 'raw':
        tf = dtcm
    elif tf_method == 'bool':
        tf = dtcm.astype('bool').astype('int')
    else:
        raise ValueError(f"tf method {tf_method} not found.")

    df = dtcm.astype('bool').sum()
    N_docs = len(dtcm)
    
    if df_method == 'standard':
        idf = np.log2(N_docs/df) # This what the students were asked to use
    elif df_method == 'textbook':
        idf = np.log2(N_docs/(df + 1))
    elif df_method == 'sklearn':
        idf = np.log2(N_docs/df) + 1
    elif df_method == 'sklearn_smooth':
        idf = np.log2((N_docs + 1)/(df + 1)) + 1
    else:
        raise ValueError(f"df method {df_method} not found.")
    
    tfidf = tf * idf
    dfidf = df * idf

    return tfidf, dfidf

## Compute TFIDF and DFIDF

In [10]:
bag = ['book_id', 'chap_id']
tf_method = 'max'
idf_method = 'standard'
pos_list = ['NN', 'NNS']

In [11]:
TFIDF, DFIDF = get_tfidf(create_bow(CORPUS, bag), tf_method=tf_method, df_method=idf_method)
TFIDF.head()

Unnamed: 0_level_0,term_str,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,0.0,0.0,0.0,0.0,0.006493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
VOCAB['dfidf'] = DFIDF
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,28533,1,0.019017,5.716586,DT,0.0
aback,9,5,6e-06,17.347005,NN,46.368028
abaft,2,5,1e-06,19.51693,IN,8.321928
abandon,44,7,2.9e-05,15.057499,VB,98.408049
abandoned,68,9,4.5e-05,14.429467,VBN,124.513524


## Create DOC table from TFIDF Index

In [13]:
DOC = TFIDF.index.to_frame().drop(['book_id', 'chap_id'], axis=1)
DOC.head()

book_id,chap_id
adventures,1
adventures,2
adventures,3
adventures,4
adventures,5


In [14]:
DOC = DOC.join(LIB, on='book_id')
DOC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,genre_id,author_id
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1
adventures,1,d,doyle
adventures,2,d,doyle
adventures,3,d,doyle
adventures,4,d,doyle
adventures,5,d,doyle


## Create reduced TFIDF

### Get top 1000 nouns by DFIDF

In [15]:
sig_terms = list(VOCAB.query(f'max_pos in {pos_list}')\
                 .sort_values('dfidf', ascending=False)[:1000]\
                 .index)
sig_terms[:10]

['yours',
 'reply',
 'order',
 'curiosity',
 'memory',
 'company',
 'feelings',
 'opportunity',
 'book',
 'spirit']

### Create reduced TFIDF

In [16]:
TFIDF_RED = TFIDF[sig_terms]
TFIDF_RED.head()

Unnamed: 0_level_0,term_str,yours,reply,order,curiosity,memory,company,feelings,opportunity,book,spirit,...,humanity,rank,contempt,apprehensions,owner,lad,enquiry,bag,investigation,inclination
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,0.006454,0.0,0.003227,0.003227,0.0064,0.0064,0.0,0.0,0.003282,0.003282,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007158,0.007158,0.0
adventures,2,0.009346,0.0,0.009346,0.0,0.006178,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00691,0.0,0.0,0.0,0.0
adventures,3,0.004089,0.008178,0.0,0.0,0.004054,0.008109,0.0,0.0,0.004159,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00907,0.0
adventures,4,0.002721,0.002721,0.005442,0.0,0.002698,0.0,0.0,0.002767,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.030176,0.0,0.0,0.006035,0.0
adventures,5,0.003043,0.0,0.003043,0.003043,0.003017,0.0,0.0,0.0,0.00619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00675,0.0,0.0


## PCA Function

In [17]:
def get_PCA(X:pd.DataFrame, k:int, norm_docs=True, center_by_mean=False, center_by_variance=False) -> tuple:
    '''
    Function to compute PCA on a given document-term count matrix.

    PARAMETERS:

    X - pandas `DataFrame` representing document-term count matrix to perform PCA on

    k - int number of principal components to return

    norm_docs - bool (defaults to True) of whether to normalize document length. Uses L2 norm

    center_by_mean - bool (defaults to False) of whether to center term vectors by column mean. Uses pandas `.cov()` method

    center_by_variance - bool (defaults to False) of whether to center term vectors by column variance. Uses pandas `.cov()` method

    
    OUTPUTS:

    LOADINGS - pandas `DataFrame` representing term-component matrix

    DCM - pandas `DataFrame` representing document-component matrix

    COMPINF - pandas `DataFrame` representing component information
    '''

    # Input handling to ensure only one centering method is used
    if (center_by_mean and center_by_mean):
        raise SyntaxError('Cannot center by both mean and variance')
    
    # Normalize docs
    if (norm_docs):
        tfidf = (X.T/norm(X, 2, axis=1)).T
    
    # Center by mean
    if (center_by_mean):
        tfidf = tfidf - tfidf.mean()

    # Center by variance
    if (center_by_variance):
        tfidf = tfidf - tfidf.var()
    
    # Compute variance-covariange matrix
    COV = tfidf.cov()

    # Eigendecomposition
    eig_vals, eig_vecs = eigh(COV)

    # Convernt eigenvalues and eigenvectors into DataFrames
    EIG_VALS = pd.DataFrame(eig_vals, index=COV.index, columns=COV.index)
    EIG_VECS = pd.DataFrame(eig_vecs, index=COV.index, columns=['eig_val'])
    EIG_PAIRS = EIG_VALS.join(EIG_VECS.T)\
        .sort_values('eig_val', ascending=False)
    
    EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)

    # Select top k components
    COMPS = EIG_PAIRS.sort_values('exp_var', ascending=False).head(k).reset_index(drop=True)
    COMPS.index.name = 'comp_id'
    COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
    COMPS.index.name = 'pc_id'

    # Create LOADINGS
    LOADINGS = COMPS[COV.index].T
    LOADINGS.index.name = 'term_str'

    # Create DCM
    DCM = tfidf.dot(COMPS[COV.index].T)

    # Create COMPINF
    top_terms = []
    for i in range(k):
        for j in [0, 1]:
            comp_str = ' '.join(LOADINGS.sort_values(f'PC{i}', ascending=bool(j)).head(10).index.to_list())
            top_terms.append((f"PC{i}", j, comp_str))
    COMPINF = pd.DataFrame(top_terms).set_index([0,1]).unstack()
    COMPINF.index.name = 'comp_id'
    COMPINF.columns = COMPINF.columns.droplevel(0) 
    COMPINF = COMPINF.rename(columns={0:'pos', 1:'neg'})

    return (LOADINGS, DCM, COMPINF)

## Compute PCA on reduced TFIDF

In [18]:
LOADINGS, DCM, COMPINF = get_PCA(TFIDF_RED, k=10, norm_docs=True, center_by_mean=False, center_by_variance=False)

ValueError: Shape of passed values is (1000, 1), indices imply (1000, 1000)

In [23]:
TFIDF_RED.cov().head()

term_str,yours,reply,order,curiosity,memory,company,feelings,opportunity,book,spirit,...,humanity,rank,contempt,apprehensions,owner,lad,enquiry,bag,investigation,inclination
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yours,0.0001000132,-1.764918e-07,-1.25772e-07,3.682638e-06,7.134164e-06,5.274207e-06,4e-06,-3.176987e-06,-2e-06,-3.062721e-06,...,-1.489538e-06,-1.748837e-06,-5.25527e-06,-3.052507e-07,-6.198219e-07,4.504165e-06,-1.443007e-06,2.636389e-06,-1.942881e-06,4.639567e-06
reply,-1.764918e-07,3.573979e-05,-1.408651e-06,3.668584e-06,1.389333e-06,2.156236e-06,1e-06,8.169608e-06,-4e-06,4.053249e-07,...,3.456159e-06,-1.388338e-06,-1.03109e-06,1.53802e-06,-7.348376e-07,-3.954859e-06,1.282695e-06,-3.029666e-06,-1.751783e-06,6.021762e-06
order,-1.25772e-07,-1.408651e-06,2.8518e-05,1.081703e-06,-1.688135e-06,3.393614e-06,-7e-06,7.504948e-07,-2e-06,-1.876337e-06,...,1.038866e-06,1.940294e-06,1.755319e-06,1.565295e-06,4.625993e-07,-1.277227e-06,1.033311e-06,6.139087e-07,1.60279e-06,3.658486e-07
curiosity,3.682638e-06,3.668584e-06,1.081703e-06,4.428164e-05,3.965357e-07,2.904558e-06,4e-06,2.872716e-06,-3e-06,6.496669e-06,...,5.15076e-07,-4.224221e-07,3.539361e-07,1.447964e-06,-9.112504e-07,-4.492766e-07,2.322165e-06,-3.445903e-06,-3.484834e-06,6.965228e-06
memory,7.134164e-06,1.389333e-06,-1.688135e-06,3.965357e-07,6.881639e-05,-3.468441e-07,-2e-06,-3.437811e-06,-2e-06,8.243869e-07,...,6.70131e-07,-1.431989e-07,-2.494438e-06,-2.269626e-06,-4.816166e-06,-1.180614e-06,6.22875e-07,-1.808317e-06,-8.262123e-07,-4.268784e-06


In [None]:
eigvals, eigvecs = eigh(TFIDF_RED.cov())