### Gensim is a Natural Language Processing package that does ‘Topic Modeling for Humans’, but in practice it is much more than that. It is a leading, state-of-the-art package for processing texts, working with word vector models (such as Word2Vec, FastText, etc.) and building topic models.

In [1]:
import os 
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras

from gensim import corpora,logger,models,parsing 


## Load Text Data
## Both files are looked up in the current working directory.
doc1_path = os.path.join(os.getcwd(), "GeneralText.txt")
doc2_path = os.path.join(os.getcwd(), "GeneralText2.txt")
## Explicit UTF-8: the text contains non-ASCII characters (e.g. Greek letters),
## which the platform default encoding may fail to decode.
## The handles are deliberately left open because they are read later via readline().
doc_1 = open(doc1_path, "r", encoding="utf-8")
doc_2 = open(doc2_path, "r", encoding="utf-8")

Using TensorFlow backend.


In [2]:
## Initial Preprocessing for the documents
## conversion to lower case 
## removal of special chars
## removal of unwanted identifiers

#### On Hold so far #########

In [3]:
## Read only the first line of each file and cut it into sentence-like
## fragments at every period (the trailing newline ends up in the last fragment).
doc1, doc2 = (handle.readline().split(".") for handle in (doc_1, doc_2))

In [4]:
doc1

["History (from Greek ἱστορία, historia, meaning 'inquiry; knowledge acquired by investigation')[2] is the past as it is described in written documents, and the study thereof",
 '[3][4] Events occurring before written records are considered prehistory',
 ' "History" is an umbrella term that relates to past events as well as the memory, discovery, collection, organization, presentation, and interpretation of information about these events',
 ' Scholars who write about history are called historians',
 '\n']

In [5]:
doc2

['Coming of age ceremonies have been celebrated in Japan since at least 714 CE, when a young prince donned new robes and a hairstyle to mark his passage into adulthood',
 '  The holiday was first established in 1948, to be held every year on January 15',
 '  In 2000, as a result of the Happy Monday System, Coming of Age Day was changed to the second Monday in January',
 ' \n']

In [6]:
## Converting docs to a list of sentences:
## a list of lists of words, one inner list per line of the document.
## str.split() without arguments splits on any whitespace run.

docList1 = list(map(str.split, doc1))
docList2 = list(map(str.split, doc2))

In [7]:
docList2

[['Coming',
  'of',
  'age',
  'ceremonies',
  'have',
  'been',
  'celebrated',
  'in',
  'Japan',
  'since',
  'at',
  'least',
  '714',
  'CE,',
  'when',
  'a',
  'young',
  'prince',
  'donned',
  'new',
  'robes',
  'and',
  'a',
  'hairstyle',
  'to',
  'mark',
  'his',
  'passage',
  'into',
  'adulthood'],
 ['The',
  'holiday',
  'was',
  'first',
  'established',
  'in',
  '1948,',
  'to',
  'be',
  'held',
  'every',
  'year',
  'on',
  'January',
  '15'],
 ['In',
  '2000,',
  'as',
  'a',
  'result',
  'of',
  'the',
  'Happy',
  'Monday',
  'System,',
  'Coming',
  'of',
  'Age',
  'Day',
  'was',
  'changed',
  'to',
  'the',
  'second',
  'Monday',
  'in',
  'January'],
 []]

In [8]:
## Assign every token in the text a unique id.
## Gensim is very smart at that task: if the text is huge and unable to fit in memory,
## you can still work with it by dynamic id assignment.
##### Let's convert the entire text to a dictionary which contains token as key and unique id as value.

# 1. Convert the text to a list which is tokenized.
# 2. Pass the list to gensim corpora.Dictionary() to create the dictionary of tokens with id.

## Some NLP Jargon::
## Token =  word
## A ‘document’ can typically refer to a ‘sentence’ or ‘paragraph’.
## A ‘corpus’ is typically a ‘collection of documents as a bag of words’.

# bow = []
# with open(file_Path) as fp:
#     line = fp.readline()
#     while line:
#         line = fp.readline()
#         text_list = list(line.split(" "))
#         bow.append(text_list)    

        
## Build the token -> id mapping from the first document's tokenized sentences.
dictionary = corpora.Dictionary(docList1)

In [9]:
## Print a short summary of the dictionary, then display the full token -> id mapping.
print(dictionary)
dictionary.token2id

Dictionary(56 unique tokens: ["'inquiry;", '(from', 'Greek', 'History', 'acquired']...)


{"'inquiry;": 0,
 '(from': 1,
 'Greek': 2,
 'History': 3,
 'acquired': 4,
 'and': 5,
 'as': 6,
 'by': 7,
 'described': 8,
 'documents,': 9,
 'historia,': 10,
 'in': 11,
 "investigation')[2]": 12,
 'is': 13,
 'it': 14,
 'knowledge': 15,
 'meaning': 16,
 'past': 17,
 'study': 18,
 'the': 19,
 'thereof': 20,
 'written': 21,
 'ἱστορία,': 22,
 'Events': 23,
 '[3][4]': 24,
 'are': 25,
 'before': 26,
 'considered': 27,
 'occurring': 28,
 'prehistory': 29,
 'records': 30,
 '"History"': 31,
 'about': 32,
 'an': 33,
 'collection,': 34,
 'discovery,': 35,
 'events': 36,
 'information': 37,
 'interpretation': 38,
 'memory,': 39,
 'of': 40,
 'organization,': 41,
 'presentation,': 42,
 'relates': 43,
 'term': 44,
 'that': 45,
 'these': 46,
 'to': 47,
 'umbrella': 48,
 'well': 49,
 'Scholars': 50,
 'called': 51,
 'historians': 52,
 'history': 53,
 'who': 54,
 'write': 55}

In [10]:
# docList2
## Add the second document's tokenized sentences to the same dictionary.
dictionary.add_documents(docList2)

In [11]:
dictionary.token2id

{"'inquiry;": 0,
 '(from': 1,
 'Greek': 2,
 'History': 3,
 'acquired': 4,
 'and': 5,
 'as': 6,
 'by': 7,
 'described': 8,
 'documents,': 9,
 'historia,': 10,
 'in': 11,
 "investigation')[2]": 12,
 'is': 13,
 'it': 14,
 'knowledge': 15,
 'meaning': 16,
 'past': 17,
 'study': 18,
 'the': 19,
 'thereof': 20,
 'written': 21,
 'ἱστορία,': 22,
 'Events': 23,
 '[3][4]': 24,
 'are': 25,
 'before': 26,
 'considered': 27,
 'occurring': 28,
 'prehistory': 29,
 'records': 30,
 '"History"': 31,
 'about': 32,
 'an': 33,
 'collection,': 34,
 'discovery,': 35,
 'events': 36,
 'information': 37,
 'interpretation': 38,
 'memory,': 39,
 'of': 40,
 'organization,': 41,
 'presentation,': 42,
 'relates': 43,
 'term': 44,
 'that': 45,
 'these': 46,
 'to': 47,
 'umbrella': 48,
 'well': 49,
 'Scholars': 50,
 'called': 51,
 'historians': 52,
 'history': 53,
 'who': 54,
 'write': 55,
 '714': 56,
 'CE,': 57,
 'Coming': 58,
 'Japan': 59,
 'a': 60,
 'adulthood': 61,
 'age': 62,
 'at': 63,
 'been': 64,
 'celebrated'

#### From the dictionary created above, we will create the Bag of Words corpus, which Gensim will use to replace the tokens in the document with their ids.

In [12]:
def roller(document):
    """Flatten a document given as a sequence of token sequences.

    Args:
        document: An iterable of iterables, e.g. a list of sentences where
            each sentence is a list of word tokens.

    Returns:
        A new list containing every inner item, in the original order.
    """
    return [token for sentence in document for token in sentence]

In [13]:
## Flatten each document's sentences into a single token list.
document_1 = roller(docList1)
document_2 = roller(docList2)
documrnt_2 = document_2  # misspelled original name, kept as an alias for any existing references

In [14]:
## creating BOW from dictionary
## doc2bow takes one flat list of tokens per document.
Bow_1 = dictionary.doc2bow(document_1, allow_update=True)
## The second bag of words must come from the second document's tokens
## (flattened here), not from document_1 again.
Bow_2 = dictionary.doc2bow(roller(docList2), allow_update=True)

In [16]:
## How to persist dictionary and BoW to Disk
## os.path.join inserts the path separator, so the file is written inside the
## working directory rather than next to it with the directory name as a prefix.
dictionary.save(os.path.join(os.getcwd(), "Gensim_dict.dict"))
## NOTE(review): MmCorpus.serialize expects a corpus (an iterable of BoW documents),
## so a single BoW would presumably need wrapping, e.g. [Bow_1] — confirm before enabling.
# corpora.MmCorpus.serialize("Bow_1.mm", Bow_1)