# British Literature SVD & NMF in Excel

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import decomposition
from glob import glob
import os

  return f(*args, **kwds)


In [3]:
np.set_printoptions(suppress=True)

In [6]:
filenames = []
for folder in ["british-fiction-corpus"]:
    filenames.extend(glob("data/" + folder + "/*.txt"))

In [7]:
len(filenames)

27

In [8]:
vectorizer = TfidfVectorizer(input='filename', stop_words='english')
dtm = vectorizer.fit_transform(filenames).toarray()
vocab = np.array(vectorizer.get_feature_names())
dtm.shape, len(vocab)

((27, 55035), 55035)

In [13]:
[f.split("/")[-1] for f in filenames]

['Dickens_Hard.txt',
 'Dickens_David.txt',
 'Austen_Sense.txt',
 'Austen_Emma.txt',
 'Trollope_Barchester.txt',
 'Eliot_Mill.txt',
 'Thackeray_Barry.txt',
 'Thackeray_Vanity.txt',
 'CBronte_Jane.txt',
 'Fielding_Tom.txt',
 'Thackeray_Pendennis.txt',
 'Sterne_Tristram.txt',
 'Fielding_Joseph.txt',
 'Richardson_Pamela.txt',
 'Dickens_Bleak.txt',
 'Trollope_Prime.txt',
 'Sterne_Sentimental.txt',
 'ABronte_Agnes.txt',
 'Austen_Pride.txt',
 'Eliot_Middlemarch.txt',
 'Eliot_Adam.txt',
 'CBronte_Villette.txt',
 'CBronte_Professor.txt',
 'EBronte_Wuthering.txt',
 'Trollope_Phineas.txt',
 'ABronte_Tenant.txt',
 'Richardson_Clarissa.txt']

# NMF 

In [14]:
clf = decomposition.NMF(n_components=10, random_state=10)
W1 = clf.fit_transform(dtm)
H1 = clf.components_

In [15]:
num_top_words = 8

In [16]:
def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [36]:
def get_all_topic_words(H):
    top_indices = lambda t: {i for i in np.argsort(t)[:-num_top_words-1:-1]}
    topic_indices = [top_indices(t) for t in H]
    return sorted(set.union(*topic_indices))

In [37]:
ind = get_all_topic_words(H1)

In [38]:
vocab[ind]

array(['adams', 'allworthy', 'bounderby', 'brandon', 'catherine', 'cathy',
       'crawley', 'darcy', 'dashwood', 'did', 'dorothea', 'earnshaw',
       'edgar', 'elinor', 'elizabeth', 'elton', 'emma', 'ferrars', 'finn',
       'glegg', 'good', 'gradgrind', 'hareton', 'heathcliff', 'jennings',
       'jones', 'joseph', 'knightley', 'know', 'lady', 'laura', 'like',
       'linton', 'little', 'll', 'lopez', 'louisa', 'lydgate', 'lyndon',
       'maggie', 'major', 'man', 'marianne', 'micawber', 'miss', 'mr',
       'mrs', 'old', 'osborne', 'pendennis', 'philip', 'phineas', 'said',
       'sissy', 'sophia', 'sparsit', 'stephen', 'time', 'toby', 'tom',
       'tulliver', 'wakem', 'weston', 'wharton', 'willoughby'],
      dtype='<U31')

In [40]:
show_topics(H1)

['mr said lydgate mrs dorothea micawber little know',
 'said little like did time good toby know',
 'adams said jones lady allworthy sophia mr joseph',
 'mr darcy emma weston miss knightley elton elizabeth',
 'elinor marianne dashwood jennings willoughby mrs brandon ferrars',
 'heathcliff linton hareton catherine earnshaw cathy edgar ll',
 'bounderby gradgrind sparsit said mr sissy louisa stephen',
 'phineas said lopez mr finn man wharton laura',
 'maggie tulliver said tom glegg philip mr wakem',
 'crawley said pendennis osborne old little lyndon major']

In [41]:
W1.shape, H1[:, ind].shape

((27, 10), (10, 65))

# SVD

In [43]:
import fbpca

In [44]:
U, s, V = fbpca.pca(dtm, 10)

In [45]:
ind = get_all_topic_words(V)

In [46]:
len(ind)

47

In [47]:
vocab[ind]

array(['adams', 'allworthy', 'arabin', 'bingley', 'bounderby', 'crawley',
       'darcy', 'dashwood', 'eleanor', 'elinor', 'elizabeth', 'elton',
       'emma', 'finn', 'fleur', 'harding', 'hath', 'heathcliff',
       'hunsden', 'jennings', 'jones', 'joseph', 'knightley', 'lady',
       'laura', 'linton', 'lopez', 'lovelace', 'lyndon', 'maggie',
       'marianne', 'micawber', 'monsieur', 'mr', 'peggotty', 'pelet',
       'pendennis', 'phineas', 'proudie', 'said', 'slope', 'sophia',
       'toby', 'tulliver', 'uncle', 'weston', 'willoughby'], dtype='<U31')

In [48]:
show_topics(H1)

['mr said lydgate mrs dorothea micawber little know',
 'said little like did time good toby know',
 'adams said jones lady allworthy sophia mr joseph',
 'mr darcy emma weston miss knightley elton elizabeth',
 'elinor marianne dashwood jennings willoughby mrs brandon ferrars',
 'heathcliff linton hareton catherine earnshaw cathy edgar ll',
 'bounderby gradgrind sparsit said mr sissy louisa stephen',
 'phineas said lopez mr finn man wharton laura',
 'maggie tulliver said tom glegg philip mr wakem',
 'crawley said pendennis osborne old little lyndon major']