In [72]:
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 10000)
import string

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# For text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Importing Gensim
import gensim

## From Strings to Vectors

In [73]:
# Toy corpus: nine short documents, each a single sentence or title.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

This is a tiny corpus of nine documents, each consisting of only a single sentence.

First, let’s tokenize the documents, remove common words (using a toy stoplist) as well as words that only appear once in the corpus:

In [74]:
# Toy stop-word list: the common words to drop from every document
stoplist = set('for a of the and to in'.split())

# Lowercase and whitespace-tokenize each document, keeping only non-stop words
texts = [
    [tok for tok in doc.lower().split() if tok not in stoplist]
    for doc in documents
]


In [75]:
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [76]:
from collections import defaultdict
from pprint import pprint

In [77]:
# Count how often each token occurs across the whole corpus.
# Nothing is removed here; the counts are what a later cell uses to drop
# tokens that appear only once.
frequency = defaultdict(int)

for text in texts:
    for token in text:
        # defaultdict(int) starts unseen tokens at 0, so no membership check is needed
        frequency[token] += 1

In [78]:
pprint(frequency)

defaultdict(<class 'int'>,
            {'abc': 1,
             'applications': 1,
             'binary': 1,
             'computer': 2,
             'engineering': 1,
             'eps': 2,
             'error': 1,
             'generation': 1,
             'graph': 3,
             'human': 2,
             'interface': 2,
             'intersection': 1,
             'iv': 1,
             'lab': 1,
             'machine': 1,
             'management': 1,
             'measurement': 1,
             'minors': 2,
             'opinion': 1,
             'ordering': 1,
             'paths': 1,
             'perceived': 1,
             'quasi': 1,
             'random': 1,
             'relation': 1,
             'response': 2,
             'survey': 2,
             'system': 4,
             'testing': 1,
             'time': 2,
             'trees': 3,
             'unordered': 1,
             'user': 3,
             'well': 1,
             'widths': 1})


In [79]:
# Keep only the tokens that occur at least twice in the corpus
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]

In [80]:
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [81]:
print(texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


To convert documents to vectors, we’ll use a document representation called bag-of-words.

In [109]:
from gensim import corpora

# Map every distinct token of the filtered corpus to an integer id.
dictionary = corpora.Dictionary(texts)

# Store the dictionary for future reference. The path is relative to the
# notebook's working directory so the cell runs on any machine, instead of
# only where a user-specific absolute directory happens to exist.
dictionary.save('gensimdict.dict')

print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


There are twelve distinct words in the processed corpus, which means each document will be represented by twelve numbers (i.e., by a 12-D vector).


To see the mapping between words and their ids:

In [83]:
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


To convert tokenized documents to vectors:

In [84]:
# Vectorize an unseen document with the existing dictionary: lowercase it,
# split on whitespace, and let doc2bow turn the tokens into (id, count) pairs.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (1, 1)]


#### doc2bow() 
- simply counts the number of occurrences of each distinct word, 
- converts the word to its integer word id and returns the result as a sparse vector. 

The sparse vector [(0, 1), (1, 1)] therefore reads: in the document “Human computer interaction”, the words computer (id 0) and human (id 1) appear once; the other ten dictionary words appear (implicitly) zero times.

In [85]:
# Convert every tokenized document into its sparse bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]

# Store to disk for later use. The path is relative to the notebook's working
# directory so the cell runs on any machine, instead of only where a
# user-specific absolute directory happens to exist.
corpora.MmCorpus.serialize('gensimbow.mm', corpus)
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


## Corpus Streaming – One Document at a Time

Note that corpus above resides fully in memory, as a plain Python list. 

Now let’s assume there are millions of documents in the corpus. Storing all of them in RAM won’t do.

Instead, let’s assume the documents are stored in a file on disk, one document per line. 

## How to create a Dictionary from a list of sentences?

A dictionary maps every word (token) to its unique integer id.

You can create a dictionary 
- from a paragraph of sentences, 
- from a text file that contains multiple lines of text and 
- from multiple such text files contained in a directory.

In [100]:
# First batch of sentences, used to build the initial dictionary.
# Note: this rebinds `documents`, replacing the nine-document toy corpus used earlier.
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# Second batch of sentences, used later to extend the dictionary with new words.
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

In [101]:
# Split every sentence on whitespace: one token list per document
texts = [doc.split() for doc in documents]

# Build the token -> id mapping from the tokenized documents
dictionary = corpora.Dictionary(texts)

# Print a short summary of the dictionary
print(dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [102]:
# Show the word to id map
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


_With new documents in the future, it is also possible to update an existing dictionary to include the new words._

In [103]:
# Whitespace-tokenize the second batch of sentences, one token list per document
texts_2 = [doc.split() for doc in documents_2]

In [104]:
# Update the existing dictionary with the tokens of the second batch;
# the call modifies `dictionary` itself, so nothing is reassigned.
dictionary.add_documents(texts_2)

In [105]:
print(dictionary)

Dictionary(60 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


## How to create a Dictionary from one or more text files?

You can also create a dictionary from a text file or from a directory of text files.

The advantage here is that it lets you process an entire text file one line at a time, __without loading the whole file into memory__ at once.

In [63]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

In [106]:
# Create a gensim dictionary from a single text file.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# local copy of nlp_doc.txt before running on another machine.
nlp_doc_path = r'C:\Users\Sky\Desktop\SimpliLearn\NLP_Trainer_PPT_July\NLP_Trainer_PPT_July\Projects\0.1 Assisted Practices\Lesson 5\Assisted Practice 1\nlp_doc.txt'

# The generator hands the file to Dictionary one line at a time, so the whole
# file is never held in memory; the `with` block guarantees the handle is
# closed once the dictionary has been built (a bare open() leaves it open).
with open(nlp_doc_path, encoding='utf-8') as nlp_doc:
    dictionary = corpora.Dictionary(
        simple_preprocess(line, deacc=True) for line in nlp_doc
    )

In [107]:
print(dictionary)

Dictionary(416 unique tokens: ['alan', 'although', 'an', 'and', 'article']...)


In [108]:
# Token to Id map
dictionary.token2id

{'alan': 0,
 'although': 1,
 'an': 2,
 'and': 3,
 'article': 4,
 'as': 5,
 'be': 6,
 'called': 7,
 'can': 8,
 'clarification': 9,
 'computing': 10,
 'criterion': 11,
 'earlier': 12,
 'found': 13,
 'from': 14,
 'generally': 15,
 'history': 16,
 'in': 17,
 'intelligence': 18,
 'is': 19,
 'language': 20,
 'machinery': 21,
 'natural': 22,
 'needed': 23,
 'nlp': 24,
 'now': 25,
 'of': 26,
 'periods': 27,
 'processing': 28,
 'proposed': 29,
 'published': 30,
 'started': 31,
 'test': 32,
 'the': 33,
 'titled': 34,
 'turing': 35,
 'what': 36,
 'which': 37,
 'work': 38,
 'after': 39,
 'alpac': 40,
 'authors': 41,
 'automatic': 42,
 'claimed': 43,
 'conducted': 44,
 'developed': 45,
 'dramatically': 46,
 'english': 47,
 'expectations': 48,
 'experiment': 49,
 'failed': 50,
 'first': 51,
 'five': 52,
 'for': 53,
 'fulfill': 54,
 'fully': 55,
 'funding': 56,
 'further': 57,
 'georgetown': 58,
 'had': 59,
 'however': 60,
 'into': 61,
 'involved': 62,
 'late': 63,
 'little': 64,
 'long': 65,
 'machi

## How to create a Dictionary from multiple files?

Assuming you have all the text files in the same directory, you need to define a class with an `__iter__` method. The `__iter__()` method should iterate through all the files in the given directory and yield the processed list of word tokens for each line.

In [95]:
class ReadTxtFiles(object):
    """Stream tokenized lines from every file in a directory.

    Iterating over an instance yields one token list per line, produced by
    ``simple_preprocess``, so the files are never loaded into memory whole.
    The object can be iterated more than once; each pass re-reads the files.
    """

    def __init__(self, dirname):
        """Store the directory to read; no file is opened until iteration.

        Parameters
        ----------
        dirname : str
            Path of the directory whose files should be read.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield ``simple_preprocess(line)`` for each line of each file in ``dirname``."""
        # os.listdir returns entries in arbitrary order; sorting makes the
        # visiting order (and therefore the token ids built from it) reproducible.
        for fname in sorted(os.listdir(self.dirname)):
            fpath = os.path.join(self.dirname, fname)
            # Sub-directories and other non-file entries cannot be opened as text.
            if not os.path.isfile(fpath):
                continue
            # `with` closes each file once its lines have been consumed.
            with open(fpath, encoding='latin') as fh:
                for line in fh:
                    yield simple_preprocess(line)

In [96]:
# Directory whose files are read to build the dictionary.
# NOTE(review): absolute, machine-specific path — point it at the local copy
# of the `doc` folder before running on another machine.
path_to_text_directory = r"C:\Users\Sky\Desktop\SimpliLearn\NLP_Trainer_PPT_July\NLP_Trainer_PPT_July\Projects\0.1 Assisted Practices\Lesson 5\Assisted Practice 1\doc"

In [97]:
# Build the dictionary from the token lists that ReadTxtFiles yields,
# one per line, for the files in the directory.
dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))

In [98]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1c90291fa90>

In [99]:
# Token to Id map
dictionary.token2id

{'alan': 0,
 'although': 1,
 'an': 2,
 'and': 3,
 'article': 4,
 'as': 5,
 'be': 6,
 'called': 7,
 'can': 8,
 'clarification': 9,
 'computing': 10,
 'criterion': 11,
 'earlier': 12,
 'found': 13,
 'from': 14,
 'generally': 15,
 'history': 16,
 'in': 17,
 'intelligence': 18,
 'is': 19,
 'language': 20,
 'machinery': 21,
 'natural': 22,
 'needed': 23,
 'nlp': 24,
 'now': 25,
 'of': 26,
 'periods': 27,
 'processing': 28,
 'proposed': 29,
 'published': 30,
 'started': 31,
 'test': 32,
 'the': 33,
 'titled': 34,
 'turing': 35,
 'what': 36,
 'which': 37,
 'work': 38,
 'after': 39,
 'alpac': 40,
 'authors': 41,
 'automatic': 42,
 'claimed': 43,
 'conducted': 44,
 'developed': 45,
 'dramatically': 46,
 'english': 47,
 'expectations': 48,
 'experiment': 49,
 'failed': 50,
 'first': 51,
 'five': 52,
 'for': 53,
 'fulfill': 54,
 'fully': 55,
 'funding': 56,
 'further': 57,
 'georgetown': 58,
 'had': 59,
 'however': 60,
 'into': 61,
 'involved': 62,
 'late': 63,
 'little': 64,
 'long': 65,
 'machi