In [1]:
import nltk
import numpy as np
import math
import pickle
import sys
from bs4 import BeautifulSoup as bsoup
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer

In [2]:
# Module-level Porter stemmer instance. None of the cells visible here read
# `ps` (stem_the_vocab creates its own stemmer) -- NOTE(review): confirm
# whether anything outside this view still needs it.
ps = PorterStemmer()

# Single-character tokens that are dropped after tokenisation. Kept as a list
# (one entry per character of the string below, in the same order).
punctuations = list(".,!'\"()[]{}?\\/~|<>")

In [3]:
# Raise NumPy's summarisation threshold to the largest int so that arrays are
# printed in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [4]:
def build_vocabulary_freqdist(vocabulary):
    """Count how often each token occurs and return the counts ordered by token.

    Args:
        vocabulary: iterable of tokens (repeats allowed).

    Returns:
        A plain ``dict`` mapping each distinct token to its number of
        occurrences, with keys inserted in ascending sorted order.
    """
    counts = nltk.FreqDist(vocabulary)
    return {token: counts[token] for token in sorted(counts)}

In [5]:
def stem_the_vocab(vocabulary_dict):
    """Pick one representative word per Porter stem and pickle the result.

    For every stem, the word with the highest frequency is kept. When several
    words sharing a stem have the same frequency, the one encountered first in
    ``vocabulary_dict``'s iteration order wins (the comparison is strict).

    Args:
        vocabulary_dict: mapping of word -> corpus frequency.

    Side effects:
        Writes ``stemmed_vocab.pkl`` to the current working directory. The
        pickled object is a dict mapping stem -> ``(word, frequency)``.
    """
    porter = nltk.PorterStemmer()
    stemmed_vocab = {}

    for word, frequency in vocabulary_dict.items():
        # Stem each word exactly once; the result is reused for lookup and store.
        stem = porter.stem(word)
        current = stemmed_vocab.get(stem)
        if current is None or current[1] < frequency:
            stemmed_vocab[stem] = (word, frequency)

    with open("stemmed_vocab.pkl", "wb") as f:
        pickle.dump(stemmed_vocab, f)

In [6]:
def _iter_doc_blocks(text):
    """Yield ``(title, body)`` for each ``<doc ...> body </do...>`` block in ``text``.

    * ``title`` is everything between the first ``title=`` inside the opening
      tag and that tag's closing ``>``, kept verbatim (any surrounding quotes
      are not stripped). It is ``""`` when the tag has no ``title=``.
    * ``body`` starts two characters after the opening tag's ``>`` and ends
      right before the next ``</do``.

    Raises:
        ValueError: if an opening tag, a body, or a closing tag is not
            terminated before the end of ``text``.
    """
    position = 0
    while True:
        tag_start = text.find('<doc', position)
        if tag_start == -1:
            return

        tag_end = text.find('>', tag_start)
        if tag_end == -1:
            raise ValueError(
                "unterminated <doc ...> tag at offset " + str(tag_start))

        title_at = text.find('title=', tag_start, tag_end)
        title = text[title_at + len('title='):tag_end] if title_at != -1 else ""

        # Skip the '>' plus the one character after it (presumably the line
        # break following the tag, which the caller has turned into a space).
        body_start = tag_end + 2
        body_end = text.find('</do', body_start)
        if body_end == -1:
            raise ValueError(
                "missing closing </doc> for the block at offset " + str(tag_start))

        close_end = text.find('>', body_end)
        if close_end == -1:
            raise ValueError(
                "unterminated closing tag at offset " + str(body_end))

        yield title, text[body_start:body_end]
        position = close_end + 1


def build_database_vocabulary(filename):
    """Read a ``<doc ...>``-delimited corpus file and tokenise every document.

    Each document body is stripped of markup with BeautifulSoup, lower-cased,
    tokenised with ``nltk.word_tokenize`` and filtered against the module-level
    ``punctuations`` list.

    Args:
        filename: path of the corpus file.

    Returns:
        ``(database, vocabulary)`` where ``database`` is a list with one token
        list per document (in file order) and ``vocabulary`` is the dict
        produced by ``build_vocabulary_freqdist`` from all tokens.

    Raises:
        ValueError: if a ``<doc`` block in the file is truncated.

    Side effects:
        Prints progress messages, writes ``vocabulary_dict.pkl`` and
        ``doc_titles.pkl`` to the current working directory, and calls
        ``stem_the_vocab(vocabulary)``.
    """
    # NOTE(review): the file is read with the platform's default text encoding;
    # confirm the corpus encoding before pinning one explicitly.
    with open(filename) as f:
        text = f.read().replace('\n', ' ')

    database = []
    vocabulary = []
    doc_titles = []
    doc_count = 0

    for title, document in _iter_doc_blocks(text):
        doc_titles.append(title)
        doc_count = doc_count + 1

        if doc_count % 200 == 0:
            print("Document " + str(doc_count) + " is being read...")

        # pre-processing: drop markup, lower-case, tokenise, remove punctuation
        clean_document = bsoup(document, 'html.parser').get_text().lower()
        tokens = [token for token in nltk.word_tokenize(clean_document)
                  if token not in punctuations]

        vocabulary.extend(tokens)
        # database is a list of token lists, one per document
        database.append(tokens)

    print("Done with the document reading...Database is ready \n")
    print("There are total " + str(doc_count) + " documents!")
    print("There are total " + str(len(doc_titles)) + " titles!")

    # vocabulary becomes a dictionary of tokens and their corpus frequencies
    vocabulary = build_vocabulary_freqdist(vocabulary)

    print("Done with the Vocabulary building...Vocab is ready and saved !\n")

    with open("vocabulary_dict.pkl", "wb") as f:
        pickle.dump(vocabulary, f)

    with open("doc_titles.pkl", "wb") as f:
        pickle.dump(doc_titles, f)

    stem_the_vocab(vocabulary)

    return database, vocabulary

In [7]:
def build_documents_vector(database, vocabulary_words, inverse_vocab_word_dict):
    """Build the raw term-count matrix for all documents and save it to disk.

    Args:
        database: list of token lists, one per document.
        vocabulary_words: list of vocabulary words; its length fixes the
            number of columns.
        inverse_vocab_word_dict: mapping of word -> column index.

    Returns:
        A float array of shape ``(len(database), len(vocabulary_words))`` whose
        entry ``[d, c]`` is how many times the word mapped to column ``c``
        occurs in document ``d``.

    Side effects:
        Prints a status line and writes ``documents_vector.npy`` to the
        current working directory.
    """
    n_docs, n_terms = len(database), len(vocabulary_words)
    documents_vector = np.zeros((n_docs, n_terms))

    # Each `row` is a view into the matrix, so incrementing it updates the matrix.
    for row, tokens in zip(documents_vector, database):
        for token in tokens:
            row[inverse_vocab_word_dict[token]] += 1

    print("Done with the Documents Vector build... Saving it as numpy file \n")
    np.save("documents_vector.npy", documents_vector)
    return documents_vector

In [8]:
def process_documents_vector(documents_vector):
    """Apply logarithmic term weighting followed by cosine (L2) row normalisation.

    Every positive entry ``x`` is replaced by ``1 + ln(x)``; other entries are
    left as they are. Each row is then divided by its Euclidean norm. Rows
    whose norm is zero (e.g. a document with no tokens) are returned as
    all-zero rows instead of NaN.

    Args:
        documents_vector: 2-D NumPy array of per-document term counts. The
            logarithmic weighting is written back into this array in place.

    Returns:
        A new array of the same shape holding the normalised weights.
    """
    # Log weighting on the positive entries only, in place.
    positive = documents_vector > 0
    documents_vector[positive] = 1 + np.log(documents_vector[positive])

    print("Done with the log calculation build... \n")

    # calculation of cosine normalisation
    row_norms = np.sqrt(np.square(documents_vector).sum(axis=1))
    # A zero norm would turn the row into NaN (0 / 0); dividing by 1 keeps it zero.
    row_norms[row_norms == 0] = 1
    documents_vector = documents_vector / row_norms[:, None]

    print("Done with the cosine normalization build... \n")
    return documents_vector

In [9]:
def index_construction(filename):
    """Run the full indexing pipeline for one corpus file.

    Reads and tokenises the corpus, builds the document/term count matrix,
    passes it through ``process_documents_vector`` and stores the result.

    Args:
        filename: path of the corpus file handed to
            ``build_database_vocabulary``.

    Side effects:
        Writes ``database_lnc.npy`` to the current working directory (in
        addition to whatever the called helpers write) and prints a
        confirmation line.
    """
    database, vocabulary = build_database_vocabulary(filename)

    # One matrix column per distinct word, in the vocabulary's key order.
    vocabulary_words = [word for word in vocabulary]
    word_to_column = dict(zip(vocabulary_words, range(len(vocabulary_words))))

    raw_counts = build_documents_vector(
        database, vocabulary_words, word_to_column)
    weighted = process_documents_vector(raw_counts)

    np.save("database_lnc.npy", weighted)
    print("Saved! database_lnc.npy")

In [10]:
# Build the index for the corpus file "mega_wiki_corpus" (path relative to the
# working directory); the resulting .pkl / .npy files are written there too.
index_construction("mega_wiki_corpus")

Document 200 is being read...
Document 400 is being read...
Document 600 is being read...
Document 800 is being read...
Done with the document reading...Database is ready 

There are total 963 documents!
There are total 963 titles!
Done with the Vocabulary building...Vocab is ready and saved !

Done with the Documents Vector build... Saving it as numpy file 

Done with the log calculation build... 

Done with the cosine normalization build... 

Saved! database_lnc.npy
