In [None]:
# Topic Modelling

In [None]:
# Fetch and unpack the corpus archive. -nc (no-clobber) skips the download when
# the archive is already present, so re-running the cell does not pile up
# duplicate copies (nips12raw_str602.tgz.1, ...).
!wget -nc https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
!tar -xzf nips12raw_str602.tgz

--2023-05-07 14:53:01--  https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
Resolving cs.nyu.edu (cs.nyu.edu)... 216.165.22.203
Connecting to cs.nyu.edu (cs.nyu.edu)|216.165.22.203|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12851423 (12M) [application/x-gzip]
Saving to: ‘nips12raw_str602.tgz’


2023-05-07 14:53:02 (22.8 MB/s) - ‘nips12raw_str602.tgz’ saved [12851423/12851423]



In [None]:
!pip install tqdm
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import os
import numpy as np
import pandas as pd

# Root folder of the corpus; list its contents as a sanity check that the data is in place.
DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

['RAW_DATA_NOTES', 'MATLAB_NOTES', 'idx', 'nips07', 'nips04', 'nips10', 'nips11', 'orig', 'nips02', 'nips12', 'nips05', 'nips03', 'nips01', 'nips08', 'nips06', 'nips00', 'nips09', 'README_yann']


In [None]:
# Read every paper found in the nips00 ... nips12 sub-folders into memory.
folders = ["nips{0:02}".format(i) for i in range(0,13)]
papers = []
for folder in folders:
  folder_path = os.path.join(DATA_PATH, folder)
  for file_name in os.listdir(folder_path):
    # Plain read-only mode is enough here ('r+' would also require write
    # permission on the files). Bytes that are not valid UTF-8 are dropped.
    with open(os.path.join(folder_path, file_name), encoding='utf-8', mode='r', errors='ignore') as file:
      papers.append(file.read())





In [None]:
# Quick sanity check: how many papers were loaded, and the first 10 characters of one of them.
print(len(papers))
print(papers[100][:10])

1740
671 
PROGR


# Let us do basic text pre-processing

In [None]:
%%time
import nltk
import tqdm

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_text(papers):
  """Turn raw paper texts into lists of cleaned, lemmatized tokens.

  Each paper is lower-cased and split into runs of word characters. Purely
  numeric tokens and stop words are removed, the remaining tokens are
  lemmatized, and finally one-character tokens and lemmas that are themselves
  stop words are removed.

  Stop words are filtered *before* lemmatization as well as after it:
  lemmatizing first mangles some stop words (e.g. 'has' -> 'ha') so that they
  no longer match the stop list and leak into the vocabulary.

  Args:
    papers: sequence of raw document strings.

  Returns:
    A list with one token list per paper. Papers that end up with no tokens
    are omitted, so positions in the result only line up with ``papers`` when
    nothing was dropped.
  """
  # Set membership is O(1); testing against the original list is O(len(list)).
  stop_set = set(stop_words)
  normalized_papers = []
  for paper in tqdm.tqdm(papers):
    paper_tokens = wtk.tokenize(paper.lower())
    paper_tokens = [token for token in paper_tokens
                    if not token.isnumeric() and token not in stop_set]
    paper_tokens = [wnl.lemmatize(token) for token in paper_tokens]
    paper_tokens = [token for token in paper_tokens
                    if len(token) > 1 and token not in stop_set]
    if paper_tokens:
      normalized_papers.append(paper_tokens)

  return normalized_papers


norm_papers = normalize_text(papers)

print(norm_papers[0])


100%|██████████| 1740/1740 [00:35<00:00, 48.41it/s]

['patyep', 'class', 'degeneracy', 'unrestricted', 'storage', 'density', 'memory', 'christopher', 'scofield', 'douglas', 'reilly', 'charles', 'elbaum', 'leon', 'cooper', 'nestor', 'inc', 'richmond', 'square', 'providence', 'rhode', 'island', 'abstract', 'study', 'distributed', 'memory', 'system', 'ha', 'produced', 'number', 'model', 'work', 'well', 'limited', 'domain', 'however', 'recently', 'application', 'system', 'real', 'world', 'problem', 'ha', 'difficult', 'storage', 'limitation', 'inherent', 'architectural', 'serial', 'simulation', 'computational', 'complexity', 'recent', 'development', 'memory', 'unrestricted', 'storage', 'capacity', 'economical', 'feedforward', 'architecture', 'ha', 'opened', 'way', 'application', 'system', 'complex', 'pattern', 'recognition', 'problem', 'however', 'problem', 'sometimes', 'underspecified', 'feature', 'describe', 'environment', 'thus', 'significant', 'portion', 'pattern', 'environment', 'often', 'non', 'separable', 'review', 'current', 'work', '




# Create the bi-gram model

Before doing feature engineering and vectorization, it is time to extract some useful bi-gram based phrases from the text data and remove some unnecessary terms. We leverage the very useful gensim.models.Phrases class for this. This capability helps us automatically detect common phrases from a stream of sentences, which are typically multi-word expressions/word n-grams.

We set the min_count parameter to 20, which means the model ignores all words and bi-grams whose total collected count across the corpus (the input papers, given as lists of tokens) is lower than 20. We also use a threshold of 20: a phrase made of word a followed by word b is accepted only if the score of the phrase is greater than 20, so a higher threshold yields fewer phrases. How that score is computed depends on the scoring parameter, which determines how phrases are scored and therefore how much influence the threshold has.

In [None]:
import gensim
# Learn bi-gram statistics from the tokenized papers; a higher threshold yields fewer phrases.
# NOTE: the delimiter is left at its default because passing delimiter=b'_' raised an error here.
bigram = gensim.models.Phrases(norm_papers, min_count=20, threshold=20)
bigram_model = gensim.models.phrases.Phraser(bigram)


# Spot check: apply the phrase model to the first 100 tokens of the first paper.
print(bigram_model[norm_papers[0][:100]])


['patyep', 'class', 'degeneracy', 'unrestricted', 'storage', 'density', 'memory', 'christopher', 'scofield', 'douglas', 'reilly', 'charles', 'elbaum', 'leon', 'cooper', 'nestor', 'inc', 'richmond', 'square', 'providence', 'rhode', 'island', 'abstract', 'study', 'distributed', 'memory', 'system', 'ha', 'produced', 'number', 'model', 'work', 'well', 'limited', 'domain', 'however', 'recently', 'application', 'system', 'real_world', 'problem', 'ha', 'difficult', 'storage', 'limitation', 'inherent', 'architectural', 'serial', 'simulation', 'computational_complexity', 'recent', 'development', 'memory', 'unrestricted', 'storage_capacity', 'economical', 'feedforward', 'architecture', 'ha', 'opened', 'way', 'application', 'system', 'complex', 'pattern_recognition', 'problem', 'however', 'problem', 'sometimes', 'underspecified', 'feature', 'describe', 'environment', 'thus', 'significant', 'portion', 'pattern', 'environment', 'often', 'non', 'separable', 'review', 'current', 'work', 'high', 'dens

So, we can now build the normalized bi-gram corpus

In [None]:
# Apply the phrase model to every paper, then build the token <-> integer id mapping.
norm_corpus_bigram = [bigram_model[doc] for doc in norm_papers]
dictionary = gensim.corpora.Dictionary(norm_corpus_bigram)

# Peek at a few (id, token) pairs and the vocabulary size before any filtering.
print('Sample word to number mappings:', list(dictionary.items())[:15])
print('Total Vocabulary Size:', len(dictionary))

Sample word to number mappings: [(0, '0o'), (1, '15n'), (2, '2b'), (3, '2n'), (4, '6introduced'), (5, 'ability'), (6, 'able'), (7, 'abstract'), (8, 'acad_sci'), (9, 'academic_press'), (10, 'according'), (11, 'accuracy'), (12, 'accurately'), (13, 'acknowi'), (14, 'acquire')]
Total Vocabulary Size: 78892


Now, starting from the unique words found in the documents, filter out words that appear in too few documents as well as commonly used words that appear in most of them.

In [None]:
# Filter out words that occur in fewer than 20 documents, or in more than 60% of the documents.
# Note: filter_extremes modifies `dictionary` in place.
dictionary.filter_extremes(no_below=20, no_above=0.6)
print('Total Vocabulary Size:', len(dictionary))

Total Vocabulary Size: 7756


In [None]:
# Transform the corpus into bag-of-words vectors: each paper becomes a list of (token id, count) pairs.
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigram]
# Show the first 50 entries of the second paper.
print(bow_corpus[1][:50])

[(5, 1), (6, 2), (16, 1), (17, 2), (20, 1), (22, 2), (23, 1), (27, 2), (29, 4), (30, 1), (35, 3), (36, 4), (39, 4), (56, 3), (62, 1), (73, 2), (74, 1), (84, 1), (88, 1), (89, 1), (91, 40), (94, 3), (95, 5), (97, 3), (102, 5), (106, 2), (109, 14), (117, 1), (118, 4), (120, 1), (124, 1), (125, 3), (126, 3), (130, 1), (132, 1), (145, 1), (162, 1), (163, 1), (165, 7), (171, 1), (176, 1), (178, 1), (192, 8), (193, 2), (199, 2), (201, 2), (216, 2), (219, 1), (227, 3), (228, 6)]


In [None]:
# Translate token ids back to tokens for the first 50 bag-of-words entries of the second paper.
print([(dictionary[idx] , freq) for idx, freq in bow_corpus[1][:50]])

[('abstract', 1), ('acad_sci', 2), ('addition', 1), ('additional', 2), ('allows', 1), ('always', 2), ('american_institute', 1), ('another', 2), ('application', 4), ('applied', 1), ('architecture', 3), ('area', 4), ('associative_memory', 4), ('cell', 3), ('choice', 1), ('cm', 2), ('collective', 1), ('computational', 1), ('computing', 1), ('condition', 1), ('connection', 40), ('consists', 3), ('constant', 5), ('contains', 3), ('convergence', 5), ('could', 2), ('current', 14), ('degree', 1), ('density', 4), ('describe', 1), ('development', 1), ('difficult', 3), ('dimension', 3), ('directly', 1), ('discus', 1), ('effect', 1), ('fact', 1), ('feature', 1), ('feedback', 7), ('finally', 1), ('flow', 1), ('found', 1), ('high', 8), ('hopfield', 2), ('implementation', 2), ('important', 2), ('interest', 2), ('investigation', 1), ('larger', 3), ('layer', 6)]


In [None]:
# bow_corpus holds one bag-of-words vector per normalized paper, so its length is the corpus size.
print('Total number of papers:', len(bow_corpus))

Total number of papers: 1740


# Find out topics

In [None]:
%%time

# Number of latent topics to extract; reused by the cells that follow.
TOTAL_TOPICS = 10
# Fit an LSI model on the bag-of-words corpus. chunksize=1740 equals the number
# of papers, so presumably the whole corpus is processed as a single chunk.
# NOTE(review): power_iters=1000 looks far higher than needed and is the likely
# cause of the multi-minute run time recorded for this cell — confirm whether a
# much smaller value yields equivalent topics.
lsi_bow = gensim.models.LsiModel(bow_corpus, id2word=dictionary, num_topics=TOTAL_TOPICS,
                                 onepass=True, chunksize=1740, power_iters=1000)


CPU times: user 6min 33s, sys: 1min 46s, total: 8min 20s
Wall time: 5min 44s


In [None]:
# Print the 20 most significant terms of every topic (topics are numbered from 1).
# The topic count comes from TOTAL_TOPICS so it cannot drift from the fitted model.
for topic_id, topic in lsi_bow.print_topics(num_topics=TOTAL_TOPICS, num_words=20):
    print(f'Topic #{topic_id + 1}:')
    print(topic)
    print()

Topic #1:
0.215*"unit" + 0.212*"state" + 0.187*"training" + 0.177*"neuron" + 0.162*"pattern" + 0.145*"image" + 0.140*"vector" + 0.125*"feature" + 0.122*"cell" + 0.110*"layer" + 0.101*"task" + 0.097*"class" + 0.091*"probability" + 0.089*"signal" + 0.087*"step" + 0.086*"response" + 0.085*"representation" + 0.083*"noise" + 0.082*"rule" + 0.081*"distribution"

Topic #2:
0.487*"neuron" + 0.396*"cell" + -0.257*"state" + 0.191*"response" + -0.187*"training" + 0.170*"stimulus" + 0.117*"activity" + -0.109*"class" + 0.099*"spike" + 0.097*"pattern" + 0.096*"circuit" + 0.096*"synaptic" + -0.095*"vector" + 0.090*"signal" + 0.090*"firing" + 0.088*"visual" + -0.084*"classifier" + -0.083*"action" + -0.078*"word" + 0.078*"cortical"

Topic #3:
-0.627*"state" + 0.395*"image" + -0.219*"neuron" + 0.209*"feature" + -0.188*"action" + 0.137*"unit" + 0.131*"object" + -0.130*"control" + 0.129*"training" + -0.109*"policy" + 0.103*"classifier" + 0.090*"class" + -0.081*"step" + -0.081*"dynamic" + 0.080*"classifica

In [None]:
# For every topic, split its 20 strongest terms by the sign of their weight:
# non-negative weights form "Direction 1", negative weights form "Direction 2".
for topic_idx in range(TOTAL_TOPICS):
    print(f'Topic #{topic_idx + 1}:')
    print('=' * 50)
    positive_terms, negative_terms = [], []
    for term, weight in lsi_bow.show_topic(topic_idx, topn=20):
        # The sign is tested on the unrounded weight; only the stored value is rounded.
        target = positive_terms if weight >= 0 else negative_terms
        target.append((term, round(weight, 3)))

    print('Direction 1:', positive_terms)
    print('-' * 50)
    print('Direction 2:', negative_terms)
    print('-' * 50)
    print()

Topic #1:
Direction 1: [('unit', 0.215), ('state', 0.212), ('training', 0.187), ('neuron', 0.177), ('pattern', 0.162), ('image', 0.145), ('vector', 0.14), ('feature', 0.125), ('cell', 0.122), ('layer', 0.11), ('task', 0.101), ('class', 0.097), ('probability', 0.091), ('signal', 0.089), ('step', 0.087), ('response', 0.086), ('representation', 0.085), ('noise', 0.083), ('rule', 0.082), ('distribution', 0.081)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('neuron', 0.487), ('cell', 0.396), ('response', 0.191), ('stimulus', 0.17), ('activity', 0.117), ('spike', 0.099), ('pattern', 0.097), ('circuit', 0.096), ('synaptic', 0.096), ('signal', 0.09), ('firing', 0.09), ('visual', 0.088), ('cortical', 0.078)]
--------------------------------------------------
Direction 2: [('state', -0.257), ('training', -0.187), ('class', -0.109), ('vector', -0.095), ('classifier', -0.084), ('action', -0.083), ('w

LSI works on the principle of SVD (Singular Value Decomposition), which factorizes a matrix as $M = U S V^T$. M is typically known as the term-document matrix and is usually obtained after feature engineering on the preprocessed text data, where each row of the matrix represents a term and each column represents a text document.

U is known as the term-topic matrix where each row of the matrix represents a term and each column represents a topic. It’s useful for getting the influential terms for each topic when we multiply this by the singular values.

S is the matrix or array that consists of the list of singular values obtained after low-rank SVD; the number of singular values is typically equal to the number of topics we decide prior to this operation.

VT is the topic-document matrix, which if you transpose, you get the document-topic matrix, which is useful in knowing how much influence each topic has on each document.

In [None]:
# Pull the SVD factors out of the fitted LSI model.
term_topic = lsi_bow.projection.u
singular_values = lsi_bow.projection.s
# Densify the LSI-transformed corpus (topics x documents), then divide each
# topic row by its singular value.
projected_docs = gensim.matutils.corpus2dense(lsi_bow[bow_corpus], len(singular_values))
topic_document = projected_docs / singular_values[:, np.newaxis]
term_topic.shape, singular_values.shape, topic_document.shape


((7756, 10), (10,), (10, 1740))

We now have a term-topic matrix, singular values, and a topic-document matrix. We can transpose the topic-document matrix to form a document-topic matrix, which helps us see the weight of each topic in each document. The weights can be negative, so a larger absolute weight means the topic is more dominant in the document.

In [None]:
# One row per document, one column (T1 ... Tn) per topic, weights rounded to 3 decimals.
topic_labels = [f'T{k}' for k in range(1, TOTAL_TOPICS + 1)]
document_topics = pd.DataFrame(np.round(topic_document.T, 3), columns=topic_labels)
document_topics.head(15)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
0,0.021,0.005,-0.006,-0.002,0.002,0.006,-0.014,0.04,0.002,0.002
1,0.028,0.05,-0.031,-0.016,-0.028,-0.067,-0.01,-0.006,0.058,-0.026
2,0.03,0.046,-0.011,-0.026,-0.057,-0.046,-0.017,0.009,-0.048,0.017
3,0.017,-0.005,-0.004,0.0,-0.015,-0.006,0.004,0.011,0.001,-0.038
4,0.02,-0.005,0.014,0.008,-0.018,-0.008,-0.047,-0.011,-0.014,-0.017
5,0.028,0.038,0.011,-0.01,0.016,0.071,-0.001,0.008,0.004,-0.01
6,0.017,0.02,-0.007,-0.013,-0.003,0.03,0.009,-0.009,0.035,-0.017
7,0.022,0.049,-0.011,-0.004,-0.013,0.031,-0.011,0.013,0.001,-0.013
8,0.029,0.052,0.001,0.021,-0.003,0.055,-0.041,0.052,0.019,0.013
9,0.016,0.004,-0.012,-0.001,-0.007,-0.015,0.012,0.007,0.003,-0.002


In [None]:
# Title fragments identifying the papers to inspect.
sample_paper_patterns = [
    'Feudal Reinforcement Learning \nPeter',
    'Illumination-Invariant Face Recognition with a',
    'Improved Hidden Markov Model Speech Recognition',
]
# Collect, pattern by pattern, the index of every paper whose raw text contains the fragment.
sample_paper_idxs = []
for pattern in sample_paper_patterns:
    sample_paper_idxs.extend(
        idx for idx, content in enumerate(papers) if pattern in content
    )
sample_paper_idxs

[670, 728, 558]

In [None]:
document_numbers = sample_paper_idxs

# For each sample paper, report the three topics with the largest absolute
# weight, followed by the first 500 characters of the raw text.
for document_number in document_numbers:
    topic_weights = document_topics.iloc[document_number].values
    strongest_first = np.argsort(-np.absolute(topic_weights))
    top_topics = list(document_topics.columns[strongest_first[:3]])
    print(f'Document #{document_number}:')
    print('Dominant Topics (top 3):', top_topics)
    print('Paper Summary:')
    print(papers[document_number][:500])
    print()

Document #670:
Dominant Topics (top 3): ['T5', 'T3', 'T10']
Paper Summary:
Feudal Reinforcement Learning 
Peter Dayan 
CNL 
The Salk Institute 
PO Box 85800 
San Diego CA 92186-5800, USA 
dayanhelmholtz. sdsc. edu 
Geoffrey E Hinton 
Department of Computer Science 
University of Toronto 
6 Kings College Road, Toronto, 
Canada M5S 1A4 
hintonai. toronto. edu 
Abstract 
One way to speed up reinforcement learning is to enable learning to 
happen simultaneously at multiple resolutions in space and time. 
This paper shows how to create a Q-learning managerial hierarchy 
i

Document #728:
Dominant Topics (top 3): ['T4', 'T3', 'T5']
Paper Summary:
Illumination-Invariant Face Recognition with a 
Contrast Sensitive Silicon Retina 
Joachim M. Buhmann 
Rheinische Friedrich-Wilhelms-Universitfit 
Institut ftir Informatik II, R6merstrage 164 
D-53117 Bonn, Germany 
Martin Lades 
Ruhr-Universitfit Bochum 
Institut ftir Neuroinformatik 
D-44780 Bochum, Germany 
Frank Eeckman 
Lawrence Livermore Nat

# Let us now build the LSI model from scratch for our corpus

Based on what we mentioned earlier, the heart of LSI models involves Singular Value Decomposition (SVD). Here, we try to implement an LSI topic model from scratch using low-rank SVD. The first step in SVD is to get the source matrix, which is typically a term-document matrix. We can obtain it from Gensim by converting the sparse Bag of Words representation into a dense matrix.

In [None]:
# Dense term-document count matrix: one row per vocabulary term, one column per paper.
td_matrix = gensim.matutils.corpus2dense(corpus=bow_corpus, num_terms=len(dictionary))
print(td_matrix.shape)
td_matrix

(7756, 1740)


array([[2., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [None]:
# Build the vocabulary by explicit id lookup so that vocabulary[i] is guaranteed
# to be the term of row i of the term-document matrix, instead of relying on
# the iteration order of dictionary.values().
vocabulary = np.array([dictionary[term_id] for term_id in range(len(dictionary))])
print('Total vocabulary size:', len(vocabulary))
vocabulary

Total vocabulary size: 7756


array(['0o', '2b', '2n', ..., 'support_vector', 'mozer_jordan',
       'kearns_solla'], dtype='<U28')

In [None]:
# Spot check: the term stored at index 100 of the vocabulary array.
vocabulary[100]

'contrast'

Now perform the Singular Value Decomposition.

In [None]:
from scipy.sparse.linalg import svds

# Truncated SVD of the term-document matrix, keeping TOTAL_TOPICS components.
u, s, vt = svds(td_matrix, k=TOTAL_TOPICS, maxiter=10000)
# svds does not return the singular values in decreasing order (here they came
# back smallest-first). Reorder all three factors consistently so that topic #1
# is the strongest component, matching the ordering of the gensim LSI model.
order = np.argsort(s)[::-1]
u, s, vt = u[:, order], s[order], vt[order, :]
term_topic = u
singular_values = s
topic_document = vt
term_topic.shape, singular_values.shape, topic_document.shape


((7756, 10), (10,), (10, 1740))

In [None]:
# Singular values of the truncated SVD, one per topic.
print(singular_values)

[ 364.19394  366.74716  385.9672   418.6218   432.89566  489.67987
  498.0195   580.7572   628.6302  1215.5349 ]


In [None]:
# Scale every topic (one row per topic after the transpose) by its singular
# value to obtain the per-topic term weights.
tt_weights = singular_values[:, np.newaxis] * term_topic.T
tt_weights.shape

(10, 7756)

In [None]:
top_terms = 20
# Column indices of the top_terms largest-magnitude weights in every topic row.
topic_key_term_idxs = np.argsort(-np.absolute(tt_weights), axis=1)[:, :top_terms]
# Gather the matching weights row by row, then the matching vocabulary terms.
topic_keyterm_weights = np.array([
    tt_weights[row, columns]
    for row, columns in zip(range(TOTAL_TOPICS), topic_key_term_idxs)
])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))

# Print each topic's key terms split by weight sign, strongest magnitude first.
for topic_idx in range(TOTAL_TOPICS):
    print(f'Topic #{topic_idx + 1}:')
    print('=' * 50)
    terms, weights = topic_keyterms_weights[topic_idx]
    ranked_pairs = sorted(zip(terms, weights), key=lambda pair: -abs(pair[1]))
    positive_terms, negative_terms = [], []
    for term, weight in ranked_pairs:
        # The sign is tested on the unrounded weight; only the stored value is rounded.
        target = positive_terms if weight >= 0 else negative_terms
        target.append((term, round(weight, 3)))

    print('Direction 1:', positive_terms)
    print('-' * 50)
    print('Direction 2:', negative_terms)
    print('-' * 50)
    print()

Topic #1:
Direction 1: [('word', 188.488), ('vector', 85.974), ('node', 54.374), ('recognition', 53.232), ('sequence', 50.351), ('circuit', 45.393), ('cell', 44.81), ('hmm', 34.086), ('character', 34.022), ('chip', 32.159), ('matrix', 32.093), ('structure', 30.993)]
--------------------------------------------------
Direction 2: [('training', -92.618), ('task', -80.732), ('pattern', -70.618), ('classifier', -56.989), ('control', -50.677), ('rule', -45.927), ('action', -41.202), ('neuron', -38.193)]
--------------------------------------------------

Topic #2:
Direction 1: [('word', 78.345), ('neuron', 69.793), ('stimulus', 63.234), ('feature', 53.819), ('distribution', 53.119), ('response', 30.954), ('state', 29.343), ('probability', 29.099), ('estimate', 28.908)]
--------------------------------------------------
Direction 2: [('node', -173.277), ('circuit', -93.0), ('chip', -73.594), ('classifier', -58.716), ('current', -55.844), ('voltage', -53.49), ('control', -51.707), ('rule', -4