Python code experimenting with creating a corpus and applying TF-IDF, LSA and LDA modelling functions

In [45]:
# First steps: get the necessary packages, and load the text:
import itertools
import logging
import os
from collections import Counter
from collections import defaultdict

import nltk
import pandas as pd
from gensim import corpora, models, similarities
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from nltk import regexp_tokenize

# Read the tab-delimited file, which has no header row, into a single column named "Text"
texts_list = pd.read_csv(
    "NLPexample.txt",
    delimiter="\t",
    header=None,
    names=["Text"],
)

# Other ways to load the data:
#texts_list = pd.read_csv("NLPexample.txt", header = None, names = ["Text"], usecols = [0])
# OR, if the file doesn't contain commas:
# texts_list = pd.read_csv("NLPexample2.txt", header = None, names = ["Text"])


# Preview the first rows of the loaded frame
print(texts_list.head())

                                                Text
0  Human machine interface for lab abc computer a...
1  A survey of user opinion of computer system re...
2          The EPS user interface management system,
3  System and human system engineering testing of...
4  Relation of user perceived response time to er...


In [46]:
# Define functions for creating Ngrams

def sum_string(text, j, N):
    """
    Helper function for creating N-grams: joins the N consecutive tokens of
    ``text`` that start at position ``j`` into one string, using "_" as the
    separator.

    Parameters:
        text: indexable sequence of tokens (strings)
        j:    index of the first token of the N-gram
        N:    number of tokens in the N-gram; must be at least 1

    Returns:
        text[j] itself when N == 1, otherwise the string
        text[j] + "_" + text[j+1] + ... + "_" + text[j+N-1]

    Raises:
        ValueError: if N is smaller than 1, or if fewer than N tokens are
                    available from position j onwards
    """

    if N < 1:
        # Without this check a non-positive N ends in an IndexError or RecursionError
        raise ValueError("N must be at least 1")
    if N > len(text) - j:
        raise ValueError("N is too long for the text")
    if N == 1:
        # A 1-gram is the token itself, returned unchanged
        return text[j]
    # Iterative join instead of recursion, so a large N cannot hit the recursion limit
    return "_".join(text[k] for k in range(j, j + N))


def join_tokens(doc, N = 2):
    """
    Convert a list of tokens into N-gram tokens.

    Generator: yields one N-gram for every window of N consecutive tokens in
    ``doc``, in order. Each N-gram is built by sum_string. Nothing is yielded
    when ``doc`` contains fewer than N tokens.

    Parameters:
        doc: list of tokens
        N:   number of tokens per N-gram (default 2); must be at least 1

    Raises:
        ValueError: if N is smaller than 1. Because this is a generator, the
                    error is raised when iteration starts, not at call time.
    """
    if N < 1:
        # Fail with a clear message instead of an IndexError from the helper
        raise ValueError("N must be at least 1")
    # range() is empty when len(doc) < N, so no separate length check is needed
    for i in range(len(doc) - N + 1):
        yield sum_string(doc, i, N)

In [47]:
# Lower-case every document in the "Text" column
from nltk.corpus import stopwords

lower_tokens = [text.lower() for text in texts_list["Text"]]



In [48]:
# Display the lower-cased documents to check the result of the previous cell
lower_tokens

['human machine interface for lab abc computer applications,',
 'a survey of user opinion of computer system response time,',
 'the eps user interface management system,',
 'system and human system engineering testing of eps,',
 'relation of user perceived response time to error measurement,',
 'the generation of random binary unordered trees,',
 'the intersection graph of paths in trees,',
 'graph minors iv widths of trees and well quasi ordering,',
 'graph minors a survey']

In [50]:
# Tokenize the documents and remove stop words, as preparation for the Gensim corpus
# that is used to investigate word frequencies


# Raw string literal: "\w" in a plain string literal is an invalid escape sequence
tokenized_docs = [regexp_tokenize(doc, r"\w+") for doc in lower_tokens]


# English stop words, extended with some additional words
stop_words = stopwords.words("english")
stop_words_add = ["consolidated", "detail", "details", "summary", "summaries"]
stop_words.extend(stop_words_add)

# A set makes the membership test below independent of the number of stop words
stop_word_set = set(stop_words)

# tokenized_docs is a list of lists, so it needs a nested comprehension
cleaned_text = [[t for t in doc if t not in stop_word_set] for doc in tokenized_docs]


cleaned_text



[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

--------------------------------------------
Create a function that takes as input:
* the cleaned text
* N, the size of the n-grams
* the dimension of the reduced space (NumTopics)
* the number of words to include in topicWORDS (NumWords)

And produces as output:
a DataFrame with the frequency, topicID and topicWORDS of each topic, where the topics are built by LSA.

------------------------------------------------

In [60]:
N = 1
# Turn every cleaned document into its list of N-grams
Ngrams_text = [list(join_tokens(tokens, N)) for tokens in cleaned_text]

# Build the gensim dictionary and corpus
# dictionary: maps each token in Ngrams_text to an integer id
# corpus: per document, the (integer id, number of occurrences) pairs of its tokens
dictionary = Dictionary(Ngrams_text)

corpus = list(map(dictionary.doc2bow, Ngrams_text))
corpus

2019-04-12 16:18:58,097 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-12 16:18:58,098 : INFO : built Dictionary(35 unique tokens: ['abc', 'applications', 'computer', 'human', 'interface']...) from 9 documents (total 52 corpus positions)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(4, 1), (10, 1), (12, 1), (13, 1), (14, 1)],
 [(3, 1), (10, 2), (13, 1), (15, 1), (16, 1)],
 [(8, 1), (11, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(24, 1), (26, 1), (27, 1), (28, 1)],
 [(24, 1), (26, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)],
 [(9, 1), (26, 1), (30, 1)]]

In [62]:
# total_word_count starts out empty; a missing word id counts as 0
total_word_count = defaultdict(int)

# Add up the per-document counts of every word id
for word_id, word_count in itertools.chain(*corpus):
    total_word_count[word_id] += word_count

# (word id, total count) pairs over all documents, most frequent first
sorted_word_count = sorted(
    total_word_count.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# NOTE(review): logging is only configured at this point; consider moving this call
# to the first cell so that it takes effect before any gensim object is created.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sorted_word_count

[(10, 4),
 (12, 3),
 (24, 3),
 (26, 3),
 (2, 2),
 (3, 2),
 (4, 2),
 (8, 2),
 (9, 2),
 (11, 2),
 (13, 2),
 (30, 2),
 (0, 1),
 (1, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (25, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1)]

Create a TF-IDF model from the text:

In [74]:
# Fit the TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus)

# Apply the model to the corpus and print every weighted document
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

2019-04-12 16:53:45,699 : INFO : collecting document frequencies
2019-04-12 16:53:45,700 : INFO : PROGRESS: processing document #0
2019-04-12 16:53:45,700 : INFO : calculating IDF weights for 9 documents and 34 features (51 matrix non-zeros)


[(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.2944198962221451), (3, 0.2944198962221451), (4, 0.2944198962221451), (5, 0.4301019571350565), (6, 0.4301019571350565)]
[(2, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.3726494271826947), (9, 0.3726494271826947), (10, 0.27219160459794917), (11, 0.3726494271826947), (12, 0.27219160459794917)]
[(4, 0.438482464916089), (10, 0.32027755044706185), (12, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(3, 0.3449874408519962), (10, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(8, 0.30055933182961736), (11, 0.30055933182961736), (12, 0.21953536176370683), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.24253562503633297), (25, 0.48507125007266594)]
[(24, 0.31622776601683794), (26, 0.31622776601683794), (27

Create an LSA model from the text:

In [64]:
NumTopics = 100
# Fit an LSI transformation on the bag-of-words corpus and apply it to that corpus
# NOTE(review): the training log of this cell reports that only 9 factors are kept,
# so most of the 100 requested topics are not available for this corpus.
lsi = models.LsiModel(corpus, num_topics=NumTopics, id2word=dictionary)
corpus_lsi = lsi[corpus]

# File name under which the model can be persisted with the save() and load() functions:
ModelName = "".join(["Models/", str(N), "-gram_model_", str(NumTopics), ".lsi"])
print(ModelName)


2019-04-12 16:24:24,422 : INFO : using serial LSI version on this node
2019-04-12 16:24:24,423 : INFO : updating model with new documents
2019-04-12 16:24:24,423 : INFO : preparing a new chunk of documents
2019-04-12 16:24:24,424 : INFO : using 100 extra samples and 2 power iterations
2019-04-12 16:24:24,424 : INFO : 1st phase: constructing (35, 200) action matrix
2019-04-12 16:24:24,425 : INFO : orthonormalizing (35, 200) action matrix
2019-04-12 16:24:24,427 : INFO : 2nd phase: running dense svd on (35, 9) matrix
2019-04-12 16:24:24,428 : INFO : computing the final decomposition
2019-04-12 16:24:24,428 : INFO : keeping 9 factors (discarding 0.000% of energy spectrum)
2019-04-12 16:24:24,429 : INFO : processed documents up to #9
2019-04-12 16:24:24,430 : INFO : topic #0(3.594): 0.579*"system" + 0.376*"user" + 0.270*"eps" + 0.257*"time" + 0.257*"response" + 0.230*"computer" + 0.224*"human" + 0.191*"interface" + 0.176*"survey" + 0.157*"opinion"
2019-04-12 16:24:24,430 : INFO : topic #1(

Models/1-gram_model_100.lsi


In [66]:
# Create the target directory first, so that saving also works when it does not exist yet
os.makedirs(os.path.dirname(ModelName) or ".", exist_ok=True)
lsi.save(ModelName) # same for tfidf, lda, ...

# And, to load a previously saved model:
# lsi = models.LsiModel.load(ModelName)


2019-04-12 16:27:34,098 : INFO : saving Projection object under Models/1-gram_model_100.lsi.projection, separately None
2019-04-12 16:27:34,099 : INFO : saved Models/1-gram_model_100.lsi.projection
2019-04-12 16:27:34,100 : INFO : saving LsiModel object under Models/1-gram_model_100.lsi, separately None
2019-04-12 16:27:34,101 : INFO : not storing attribute projection
2019-04-12 16:27:34,101 : INFO : not storing attribute dispatcher
2019-04-12 16:27:34,102 : INFO : saved Models/1-gram_model_100.lsi


In [75]:

# Alternatively, fit the LSI transformation on the TFIDF weighted corpus instead:
lsi_t = models.LsiModel(corpus_tfidf, num_topics=NumTopics, id2word=dictionary)
corpus_lsi_t = lsi_t[corpus_tfidf]

# File name for persisting this model with the save() and load() functions.
# NOTE(review): the name is only built and printed here; this cell does not call lsi_t.save().
ModelName = "".join(["Models/", str(N), "-gram_tfidf_model_", str(NumTopics), ".lsi"])
print(ModelName)


2019-04-12 16:55:52,372 : INFO : using serial LSI version on this node
2019-04-12 16:55:52,373 : INFO : updating model with new documents
2019-04-12 16:55:52,374 : INFO : preparing a new chunk of documents
2019-04-12 16:55:52,374 : INFO : using 100 extra samples and 2 power iterations
2019-04-12 16:55:52,375 : INFO : 1st phase: constructing (35, 200) action matrix
2019-04-12 16:55:52,375 : INFO : orthonormalizing (35, 200) action matrix
2019-04-12 16:55:52,377 : INFO : 2nd phase: running dense svd on (35, 9) matrix
2019-04-12 16:55:52,378 : INFO : computing the final decomposition
2019-04-12 16:55:52,378 : INFO : keeping 9 factors (discarding 0.000% of energy spectrum)
2019-04-12 16:55:52,379 : INFO : processed documents up to #9
2019-04-12 16:55:52,379 : INFO : topic #0(1.266): 0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + 0.236*"management" + 0.236*"opinion" + 0.235*"response" + 0.235*"time" + 0.224*"interface" + 0.224*"computer"
2019-04-12 16:55:52,380 : INFO : topi

Models/1-gram_tfidf_model_100.lsi


In [68]:
NumWords = 5
# Number of highest-weighted topics that are kept for every document
NumTopTopics = 4

# Create a list with the NumTopTopics most important topics per document
# NOTE(review): the ranking uses the signed weight, so topics with a large negative
# weight are never selected; confirm that this is the intended notion of "important".
sorted_corpus_lsi = [
    sorted(doc, key = lambda x: x[1], reverse = True)[:NumTopTopics] for doc in corpus_lsi
]

# Count the frequency of occurrence of each LSA topic among these top topics
topics_count  = defaultdict(int)
for doc_topics in sorted_corpus_lsi:
    for topic_id, _weight in doc_topics:
        topics_count[topic_id] += 1

# To make computation easier, we will turn everything into a dataframe, and sort it on frequency
# (column 0 holds the frequency, the index holds the topic id):
df = pd.DataFrame.from_dict(topics_count, orient='index', dtype=None)
df_sorted = df.sort_values([0], ascending=[False])

df_sorted["topicID"] = df_sorted.index

# Add a column with the top NumWords words belonging to each topicID:
df_sorted["topicWORDS"] = df_sorted["topicID"].apply(lsi.show_topic, topn = NumWords)

# Lets save the Data Frame:
dfName = "Models/" + str(N) + "-gram_LSA_model_" + str(NumTopics) + ".pkl"
print(dfName)

# Create the target directory first, so that saving also works when it does not exist yet
os.makedirs(os.path.dirname(dfName) or ".", exist_ok=True)
df_sorted.to_pickle(dfName)    #to save the dataframe df_sorted to a pickle file, .pkl
# df_sorted = pd.read_pickle(dfName) #to load the .pkl file back to the dataframe df_sorted

df_sorted

Models/1-gram_LSA_model_100.pkl


Unnamed: 0,0,topicID,topicWORDS
0,8,0,"[(system, 0.578819723500462), (user, 0.3761191..."
3,5,3,"[(computer, 0.3707031576945169), (machine, 0.3..."
5,5,5,"[(survey, -0.5119821128613758), (opinion, -0.3..."
1,3,1,"[(graph, -0.48022933420716), (trees, -0.463778..."
2,3,2,"[(response, 0.35877026141443874), (time, 0.358..."
8,3,8,"[(minors, -0.47153189957653346), (survey, -0.3..."
6,3,6,"[(intersection, -0.4962393745482648), (paths, ..."
7,3,7,"[(management, 0.49502605579427394), (interface..."
4,3,4,"[(binary, -0.41214564245205937), (generation, ..."


Create an LDA model from the text:

In [70]:
NumTopics = 100
# Seed for the random number generator used by the LDA training
RandomSeed = 42
# initialize an LDA transformation
# (LDA training is stochastic; a fixed random_state makes the topics reproducible between runs)
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=NumTopics, random_state=RandomSeed)
corpus_lda = lda[corpus]

# Model persistency is achieved with the save() and load() functions:
ModelName = "Models/" + str(N) + "-gram_model_" + str(NumTopics) + ".lda"
print(ModelName)

# Create the target directory first, so that saving also works when it does not exist yet
os.makedirs(os.path.dirname(ModelName) or ".", exist_ok=True)
lda.save(ModelName)

2019-04-12 16:33:48,105 : INFO : using symmetric alpha at 0.01
2019-04-12 16:33:48,106 : INFO : using symmetric eta at 0.01
2019-04-12 16:33:48,107 : INFO : using serial LDA version on this node
2019-04-12 16:33:48,108 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity every 9 documents, iterating 50x with a convergence threshold of 0.001000
2019-04-12 16:33:48,113 : INFO : -214.317 per-word bound, 32800323523486367269542980686831505104316505682689996944314466304.0 perplexity estimate based on a held-out corpus of 9 documents with 52 words
2019-04-12 16:33:48,114 : INFO : PROGRESS: pass 0, at document #9/9
2019-04-12 16:33:48,118 : INFO : topic #77 (0.010): 0.029*"graph" + 0.029*"intersection" + 0.029*"relation" + 0.029*"binary" + 0.029*"generation" + 0.029*"random" + 0.029*"trees" + 0.029*"unordered" + 0.029*"measurement" + 0.029*"minors"
2019-04-12 16:33:48,118 

Models/1-gram_model_100.lda


In [72]:
NumWords = 5
# Number of highest-weighted topics that are kept for every document
NumTopTopics = 4

# Create a list with the NumTopTopics most important topics per document
sorted_corpus_lda = [
    sorted(doc, key = lambda x: x[1], reverse = True)[:NumTopTopics] for doc in corpus_lda
]

# Count the frequency of occurrence of each LDA topic among these top topics
topics_count  = defaultdict(int)
for doc_topics in sorted_corpus_lda:
    for topic_id, _weight in doc_topics:
        topics_count[topic_id] += 1

# To make computation easier, we will turn everything into a dataframe, and sort it on frequency
# (column 0 holds the frequency, the index holds the topic id):
df = pd.DataFrame.from_dict(topics_count, orient='index', dtype=None)
df_sorted = df.sort_values([0], ascending=[False])

df_sorted["topicID"] = df_sorted.index

# Add a column with the top NumWords words belonging to each topicID:
df_sorted["topicWORDS"] = df_sorted["topicID"].apply(lda.show_topic, topn = NumWords)

# Lets save the Data Frame:
dfName = "Models/" + str(N) + "-gram_LDA_model_" + str(NumTopics) + ".pkl"
print(dfName)

# Create the target directory first, so that saving also works when it does not exist yet
os.makedirs(os.path.dirname(dfName) or ".", exist_ok=True)
df_sorted.to_pickle(dfName)    #to save the dataframe df_sorted to a pickle file, .pkl
# df_sorted = pd.read_pickle(dfName) #to load the .pkl file back to the dataframe df_sorted

df_sorted

Models/1-gram_LDA_model_100.pkl


Unnamed: 0,0,topicID,topicWORDS
84,2,84,"[(time, 0.13741508), (opinion, 0.13741504), (c..."
4,2,4,"[(trees, 0.120958194), (ordering, 0.12095818),..."
17,1,17,"[(interface, 0.13741532), (human, 0.13741527),..."
23,1,23,"[(eps, 0.18878533), (system, 0.18878524), (man..."
7,1,7,"[(system, 0.31653577), (testing, 0.15905508), ..."
28,1,28,"[(time, 0.13741516), (relation, 0.13741508), (..."
27,1,27,"[(random, 0.18878518), (trees, 0.18878509), (b..."
96,1,96,"[(trees, 0.23218404), (graph, 0.23218378), (in..."
