# HW 07

```yaml
Course:   DS 5001
Module:   07 HW
Topic:    PCA from Scratch
Author:   Ryan Lipps
Date:     5 March 2023
```

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import plotly_express as px
import seaborn as sns
import configparser
from sklearn.decomposition import PCA
from scipy.linalg import norm

sns.set(style='ticks')

### Config

In [2]:
# Load machine-specific locations from the shared env.ini file.
config = configparser.ConfigParser()
# ConfigParser.read() ignores files it cannot open and returns the list of
# files it actually parsed, so an empty result means the settings below would
# otherwise fail with an unhelpful KeyError.
if not config.read("../../../env.ini"):
    raise FileNotFoundError("Config file ../../../env.ini was not found or could not be opened.")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [5]:
# Column names that CORPUS is indexed by when it is read in.
OHCO = [
    'book_id',
    'chap_id',
    'para_num',
    'sent_num',
    'token_num',
]
# Prefix (relative to data_home) shared by the -LIB.csv and -CORPUS.csv files.
data_prefix = 'novels/novels'

### Read files

In [6]:
# Library table, indexed by book_id.
LIB = (
    pd.read_csv(f'{data_home}/{data_prefix}-LIB.csv')
    .set_index('book_id')
)
# Token table, indexed by the OHCO columns.
CORPUS = (
    pd.read_csv(f'{data_home}/{data_prefix}-CORPUS.csv')
    .set_index(OHCO)
)

In [7]:
# Preview the first rows of the token table.
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


In [8]:
# Preview the first rows of the library table.
LIB.head()

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
secretadversary,d,christie
styles,d,christie
moonstone,d,collins
adventures,d,doyle
baskervilles,d,doyle


## Extract VOCAB from CORPUS

In [9]:
# Vocabulary table: one row per distinct term with its corpus-wide count `n`,
# ordered alphabetically by term.
VOCAB = CORPUS\
    .term_str\
    .value_counts()\
    .to_frame('n')\
    .sort_index()
# The index label lives on the Index object itself; assigning an arbitrary
# attribute on the DataFrame would leave the index unnamed.
VOCAB.index.name = 'term_str'
# Term length in characters.
VOCAB['n_chars'] = VOCAB.index.str.len()
# Relative frequency of each term and its self-information in bits.
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
# Most frequent POS tag per term: count (term, pos) pairs, pivot the tags into
# columns, and take the column holding the largest count in each row.
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,28533,1,0.019017,5.716586,DT
aback,9,5,6e-06,17.347005,NN
abaft,2,5,1e-06,19.51693,IN
abandon,44,7,2.9e-05,15.057499,VB
abandoned,68,9,4.5e-05,14.429467,VBN


## Functions

In [13]:
def create_bow(corpus, bag, item_type='term_str'):
    """Count how often each item occurs within each bag.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Token table; the names in `bag` and `item_type` must be usable as
        group-by keys (columns or index levels).
    bag : list of str
        Keys that together identify one bag (document).
    item_type : str
        Column holding the items to count.

    Returns
    -------
    pandas.DataFrame
        Single column `n` of non-null item counts, indexed by the bag keys
        followed by `item_type`.
    """
    group_keys = bag + [item_type]
    item_counts = corpus.groupby(group_keys)[item_type].count()
    return item_counts.to_frame('n')

In [19]:
def get_tfidf(bow, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF and DF-IDF from a bag-of-words count table.

    Parameters
    ----------
    bow : pandas.DataFrame
        Table with a count column `n`. Its innermost index level is pivoted
        into columns, so every remaining index entry becomes one document row.
    tf_method : str
        Term-frequency scheme: 'sum' (divide by the document's total count),
        'max' (divide by the document's largest count), 'log'
        (log2(1 + count)), 'raw' (counts unchanged) or 'bool' (1 if present).
    df_method : str
        Inverse-document-frequency scheme, with N documents and document
        frequency df: 'standard' log2(N/df), 'textbook' log2(N/(df+1)),
        'sklearn' log2(N/df) + 1, 'sklearn_smooth' log2((N+1)/(df+1)) + 1.
    item_type : str
        Accepted for signature compatibility; the body does not use it.

    Returns
    -------
    tuple
        `(tfidf, dfidf)`: the weighted document-by-term matrix and the
        per-term product of document frequency and idf.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the names listed above.
    """
    # Document-term count matrix; absent (document, term) pairs become 0.
    counts = bow.n.unstack(fill_value=0)

    # Each rule maps the count matrix to a term-frequency matrix. The
    # transposes let the per-document totals/maxima align on the columns.
    tf_rules = {
        'sum': lambda m: (m.T / m.T.sum()).T,
        'max': lambda m: (m.T / m.T.max()).T,
        'log': lambda m: (np.log2(1 + m.T)).T,
        'raw': lambda m: m,
        'bool': lambda m: m.astype('bool').astype('int'),
    }
    if tf_method not in tf_rules:
        raise ValueError(f"tf method {tf_method} not found.")
    tf = tf_rules[tf_method](counts)

    # Number of documents each term appears in, and the number of documents.
    doc_freq = counts.astype('bool').sum()
    n_docs = len(counts)

    idf_rules = {
        'standard': lambda: np.log2(n_docs / doc_freq),
        'textbook': lambda: np.log2(n_docs / (doc_freq + 1)),
        'sklearn': lambda: np.log2(n_docs / doc_freq) + 1,
        'sklearn_smooth': lambda: np.log2((n_docs + 1) / (doc_freq + 1)) + 1,
    }
    if df_method not in idf_rules:
        raise ValueError(f"df method {df_method} not found.")
    idf = idf_rules[df_method]()

    return tf * idf, doc_freq * idf

## Compute TFIDF and DFIDF

In [30]:
# Keys that define one document when building the bag of words.
bag = ['book_id', 'chap_id']
# Weighting schemes handed to get_tfidf (term frequency, inverse document frequency).
tf_method, idf_method = 'max', 'standard'
# POS tags used to filter VOCAB.max_pos when selecting significant terms.
pos_list = ['NN', 'NNS']

In [22]:
# Count terms per bag, then weight those counts with the configured schemes.
TFIDF, DFIDF = get_tfidf(
    create_bow(CORPUS, bag),
    tf_method=tf_method,
    df_method=idf_method,
)
TFIDF.head()

Unnamed: 0_level_0,term_str,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,1,0.0,0.0,0.0,0.0,0.006493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adventures,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Attach each term's DF-IDF score to the vocabulary table; the assignment
# aligns DFIDF with VOCAB on the term index.
VOCAB['dfidf'] = DFIDF
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,28533,1,0.019017,5.716586,DT,0.0
aback,9,5,6e-06,17.347005,NN,46.368028
abaft,2,5,1e-06,19.51693,IN,8.321928
abandon,44,7,2.9e-05,15.057499,VB,98.408049
abandoned,68,9,4.5e-05,14.429467,VBN,124.513524


## Create DOC table from TFIDF Index

In [42]:
# Start DOC as a column-less table that shares TFIDF's document index.
# to_frame() copies every index level into a column; dropping by the index's
# own level names (instead of a hard-coded list) keeps this cell working for
# whatever bag TFIDF was built with.
DOC = TFIDF.index.to_frame().drop(list(TFIDF.index.names), axis=1)
DOC.head()

book_id,chap_id
adventures,1
adventures,2
adventures,3
adventures,4
adventures,5


In [54]:
# Add the library metadata to every document row, matching on book_id.
# Any LIB columns already present are dropped first so the cell can be re-run
# without join() failing on overlapping column names.
DOC = DOC.drop(columns=LIB.columns, errors='ignore').join(LIB, on='book_id')
DOC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,genre_id,author_id
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1
adventures,1,d,doyle
adventures,2,d,doyle
adventures,3,d,doyle
adventures,4,d,doyle
adventures,5,d,doyle


## Create reduced TFIDF

### Get top 1000 nouns by DFIDF

In [32]:
# Restrict VOCAB to terms whose most frequent POS tag is in pos_list, order
# them from highest to lowest DF-IDF, and keep the labels of the first 1000.
sig_terms = list(
    VOCAB
    .query(f'max_pos in {pos_list}')
    .sort_values('dfidf', ascending=False)
    [:1000]
    .index
)
sig_terms[:10]

['yours',
 'reply',
 'order',
 'curiosity',
 'memory',
 'company',
 'feelings',
 'opportunity',
 'book',
 'spirit']