# M07 Homework
- Name: Sam Remmey
- Net ID: sqr8ap
- URL of this file in GitHub: 

In [138]:
import pandas as pd
import numpy as np
from scipy.linalg import norm
from scipy.linalg import eigh
import plotly_express as px
import seaborn as sns
sns.set(style='ticks')

In [4]:
import configparser

# Read the machine-specific directory settings from the shared env.ini file
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home, output_dir, local_lib = (
    config['DEFAULT'][setting]
    for setting in ('data_home', 'output_dir', 'local_lib')
)

In [16]:
# Load the library and token tables from data_home

OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

# Library table, keyed by book
LIB = pd.read_csv(f'{data_home}/novels-LIB.csv')
LIB = LIB.set_index('book_id')

# Token table, keyed by the full OHCO hierarchy
TOKEN = pd.read_csv(f'{data_home}/novels-CORPUS.csv')
TOKEN = TOKEN.set_index(OHCO)

In [20]:
# Show three randomly chosen rows of the token table as a sanity check
TOKEN.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
moonstone,77,7,2,7,PRP,himself
signoffour,8,73,0,1,PRP,i
udolpho,13,50,0,31,TO,to


In [22]:
# Show three randomly chosen rows of the library table as a sanity check
LIB.sample(3)

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
scarlet,d,doyle
secretadversary,d,christie
styles,d,christie


In [26]:
# Build the VOCAB table: one row per term with its count, length,
# relative frequency, self-information, and most frequent POS tag

VOCAB = (
    TOKEN.term_str.value_counts()
    .to_frame('n')
    .sort_index()
    .rename_axis('term_str')
)
VOCAB = VOCAB.assign(
    n_chars=VOCAB.index.str.len(),
    p=lambda df: df.n / df.n.sum(),
    i=lambda df: -np.log2(df.p),
)
# For every term, pick the POS tag it carries most often
VOCAB['max_pos'] = (
    TOKEN.value_counts(['term_str', 'pos'])
    .unstack(fill_value=0)
    .idxmax(axis=1)
)

In [34]:
# Show three randomly chosen rows of the vocabulary table as a sanity check
VOCAB.sample(3)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pealing,1,7,6.664823e-07,20.51693,VBG
gilt,1,4,6.664823e-07,20.51693,NN
cheer,35,5,2.332688e-05,15.387647,VB


In [129]:
# Compute TFIDF and VOCAB['dfidf'] for the CORPUS

def gen_bow(TOKENS, OHCO_LEVEL='CHAPS'):
    '''
    Build a bag-of-words representation of a tokens table as a document-term count matrix.

    Parameters
    TOKENS: tokens table; a dataframe that exposes the OHCO keys (as index levels or columns) and a 'term_str' column
    OHCO_LEVEL: choice of bag; a string (either 'BOOKS', 'CHAPS', 'PARAS' or 'SENTS'); defaults to 'CHAPS'

    Returns
    DTCM: document-term count matrix; one row per bag, one column per term, 0 where a term is absent

    Raises
    KeyError: if OHCO_LEVEL is not one of the four supported bag names
    '''
    ohco_keys = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

    # Number of leading OHCO keys that identify a bag at each level
    depth_by_level = {'BOOKS': 1, 'CHAPS': 2, 'PARAS': 3, 'SENTS': 4}
    group_keys = ohco_keys[:depth_by_level[OHCO_LEVEL]] + ['term_str']

    # Count each term inside each bag, then pivot the terms out into columns
    term_counts = TOKENS.groupby(group_keys)['term_str'].count().rename('n')
    return term_counts.unstack(fill_value=0)

def gen_tfidf(DTCM, TF_METHOD='sum', tf_norm_k=0.5):
    '''
    This function takes a BOW table (DTCM) and type of tf metric and returns the TFIDF/DFIDF values for the BOW.

    Parameters
    DTCM: BOW table; a dataframe with one row per document and one column per term
    TF_METHOD: a string; either 'sum', 'max', 'log', 'raw', 'double_norm' or 'binary'; defaults to 'sum'
    tf_norm_k: smoothing constant K used only by 'double_norm' (tf = K + (1 - K) * count / max count); defaults to 0.5

    Returns
    TFIDF: a dataframe shaped like DTCM
    DFIDF: a series indexed by term

    Raises
    KeyError: if TF_METHOD is not one of the supported names
    '''

    def _double_norm():
        # K + (1 - K) * f / max(f) for terms that occur in the document;
        # absent terms stay at 0 so they never receive weight.
        scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        return (tf_norm_k + (1 - tf_norm_k) * scaled).where(DTCM > 0, 0.0)

    # Dispatch table of thunks: only the requested weighting is computed,
    # instead of materializing every variant of a potentially large matrix.
    tf_methods = {
        'sum': lambda: DTCM.div(DTCM.sum(axis=1), axis=0),
        'max': lambda: DTCM.div(DTCM.max(axis=1), axis=0),
        'log': lambda: np.log2(1 + DTCM),
        'raw': lambda: DTCM,
        'double_norm': _double_norm,
        'binary': lambda: DTCM.astype('bool').astype('int'),
    }

    if TF_METHOD not in tf_methods:
        raise KeyError(
            f"Unknown TF_METHOD {TF_METHOD!r}; expected one of {sorted(tf_methods)}"
        )

    TF = tf_methods[TF_METHOD]()

    # Document frequency: number of documents in which each term occurs
    DF = DTCM.astype('bool').sum()

    # Number of documents
    N = DTCM.shape[0]

    IDF = np.log2(N / DF)

    # TF is (docs x terms) and IDF is indexed by term, so this aligns on columns
    TFIDF = TF * IDF
    DFIDF = DF * IDF

    return TFIDF, DFIDF

In [44]:
# Chapter-level bags (the gen_bow default), weighted with max-normalized term frequency
my_dtcm = gen_bow(TOKEN, OHCO_LEVEL='CHAPS')
idfs = gen_tfidf(my_dtcm, TF_METHOD='max')
TFIDF, DFIDF = idfs

In [64]:
# Assign by term label rather than by position, so every term receives its
# own score even if VOCAB and the DTCM columns are ever ordered differently
VOCAB['dfidf'] = DFIDF

In [68]:
# Show the first rows of VOCAB to confirm the new dfidf column
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,28533,1,0.019017,5.716586,DT,0.0
aback,9,5,6e-06,17.347005,NN,46.368028
abaft,2,5,1e-06,19.51693,IN,8.321928
abandon,44,7,2.9e-05,15.057499,VB,98.408049
abandoned,68,9,4.5e-05,14.429467,VBN,124.513524


In [112]:
# Build the DOC table: the chapter-level TFIDF rows joined to each book's library metadata

TFIDF_reset = TFIDF.reset_index()
DOC = (
    TFIDF_reset
    .merge(LIB, on='book_id', how='inner')
    .set_index(['book_id', 'chap_id'])
)
DOC.sample(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes,genre_id,author_id
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adventures,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,d,doyle
monk,6,0.0,0.0,0.0,0.012246,0.010089,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,g,lewis
scarlet,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,d,doyle


In [136]:
# Keep only the TFIDF columns for the 1000 nouns (NN / NNS) with the highest DFIDF

top_nouns = (
    VOCAB.loc[VOCAB.max_pos.isin(['NN', 'NNS'])]
    .sort_values('dfidf', ascending=False)
    .iloc[:1000]
)

TFIDF_reduced = TFIDF.loc[:, top_nouns.index]
TFIDF_reduced.sample(3)

Unnamed: 0_level_0,term_str,yours,reply,order,curiosity,memory,company,feelings,opportunity,book,spirit,...,humanity,rank,contempt,apprehensions,owner,lad,enquiry,bag,investigation,inclination
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
udolpho,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019954,0.0,0.0,0.0,0.0,0.0
adventures,3,0.004089,0.008178,0.0,0.0,0.004054,0.008109,0.0,0.0,0.004159,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00907,0.0
scarlet,1,0.012197,0.0,0.012197,0.0,0.0,0.012094,0.0,0.0,0.0,0.012406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write a function that computes PCA from a given dtcm (TFIDF dataframe)

def gen_PCA(X, k, norm_docs=True, center_by_mean=False, center_by_variance=False, n_top_terms=None):
    '''
    This function takes a document-term matrix (e.g. a TFIDF table) as input and returns three dataframes: the term-component matrix, the document-component matrix, and the component information table.

    PARAMETERS
    X: the input matrix; a dataframe with one row per document and one column per term
    k: number of components to generate (capped at the number of terms)
    norm_docs: normalize doc vector lengths (L2); True or False
    center_by_mean: subtract each term's mean; True or False
    center_by_variance: divide each term by its standard deviation; True or False
    n_top_terms: how many terms to list per side in the component gloss; defaults to k

    RETURNS
    LOADINGS: term-component matrix (terms x PC0..PCk-1)
    DCM: document-component matrix (documents x PC0..PCk-1)
    COMPINF: component information table indexed by pc_id with columns
        eig_val, exp_var (percent of the eigenvalue total), pos and neg
        (space-joined terms with the largest positive / negative loadings)

    RAISES
    ValueError: if fewer than two usable documents remain
    '''

    X = X.dropna()  # remove nulls

    if norm_docs:
        # Scale each document to unit L2 length. All-zero documents divide
        # 0 by 0 and become NaN rows, which are dropped right after.
        doc_lengths = norm(X.to_numpy(dtype=float), 2, axis=1)
        X = (X.T / doc_lengths).T
        X = X.dropna()

    n_docs = X.shape[0]
    if n_docs < 2:
        raise ValueError("gen_PCA needs at least two non-empty documents")

    if center_by_mean:
        X = X - X.mean(axis=0)

    if center_by_variance:
        # Constant columns have zero spread; divide them by 1 so they stay
        # finite instead of turning into NaN/inf.
        col_std = X.std(axis=0)
        X = X / col_std.where(col_std > 0, 1.0)

    # Compute covariance or correlation matrix, always as a labelled dataframe.
    # X.cov() centers internally, so after scaling by the standard deviation
    # it equals the correlation matrix; with mean-centering only it is the
    # ordinary covariance matrix.
    if center_by_mean or center_by_variance:
        COV = X.cov()
    else:
        COV = X.T.dot(X) / (n_docs - 1)  # Uncentered covariance

    # Decompose the matrix (eigh handles symmetric input)
    eig_vals, eig_vecs = eigh(COV.to_numpy())

    # Pick top k components by the exact eigenvalue, not the rounded
    # percentage, so near-equal components are never ordered by a tie.
    n_comps = max(0, min(int(k), len(eig_vals)))
    top = np.argsort(eig_vals)[::-1][:n_comps]
    pc_ids = pd.Index([f"PC{i}" for i in range(n_comps)], name='pc_id')

    # See projected components onto vocabulary (LOADINGS)
    LOADINGS = pd.DataFrame(
        eig_vecs[:, top],
        index=COV.index.rename('term_str'),
        columns=pc_ids,
    )

    # Project the (transformed) documents onto the components
    DCM = X.dot(LOADINGS)

    # Component information table
    COMPINF = pd.DataFrame(
        {
            'eig_val': eig_vals[top],
            'exp_var': np.round(eig_vals[top] / eig_vals.sum() * 100, 2),
        },
        index=pc_ids,
    )

    gloss_size = k if n_top_terms is None else n_top_terms

    def _gloss(pc, ascending):
        # Terms at one extreme of a component, joined into a readable label
        ranked = LOADINGS[pc].sort_values(ascending=ascending)
        return ' '.join(map(str, ranked.head(gloss_size).index))

    COMPINF['pos'] = [_gloss(pc, False) for pc in pc_ids]
    COMPINF['neg'] = [_gloss(pc, True) for pc in pc_ids]

    return LOADINGS, DCM, COMPINF