In [19]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(2018)

In [20]:
stemmer = SnowballStemmer('english')

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [21]:
df = pd.DataFrame([
    'Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation',
    'Words that have fewer than 3 characters are removed',
    'All stopwords are removed',
    'Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present',
    'Words are stemmed — words are reduced to their root form'
], columns = ['content'])

In [22]:
df

In [23]:
processed_docs = df['content'].map(preprocess)

In [24]:
processed_docs

0    [token, split, text, sentenc, sentenc, word, l...
1                        [word, fewer, charact, remov]
2                                    [stopword, remov]
3    [word, lemmat, word, person, chang, person, ve...
4                [word, stem, word, reduc, root, form]
Name: content, dtype: object

In [33]:
dictionary = gensim.corpora.Dictionary(processed_docs) # Initialize the dictionary

In [34]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1850b5b3a20>

In [35]:
#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [36]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [37]:
bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 2)],
 [(2, 1), (7, 1), (8, 1), (9, 1)],
 [(2, 1), (10, 1)],
 [(7, 2),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(7, 2), (19, 1), (20, 1), (21, 1), (22, 1)]]

In [38]:
len(bow_corpus)

5

In [39]:
type(bow_corpus)

list

In [43]:
bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 2)],
 [(2, 1), (7, 1), (8, 1), (9, 1)],
 [(2, 1), (10, 1)],
 [(7, 2),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 1)],
 [(7, 2), (19, 1), (20, 1), (21, 1), (22, 1)]]

In [45]:
for x in bow_corpus:
    print(x)

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 2)]
[(2, 1), (7, 1), (8, 1), (9, 1)]
[(2, 1), (10, 1)]
[(7, 2), (11, 2), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1)]
[(7, 2), (19, 1), (20, 1), (21, 1), (22, 1)]


In [46]:
for i in range(len(bow_corpus)):
    print("Word {} (\"{}\") appears {} time.".format(bow_corpus[i][0], 
                                                     dictionary[bow_corpus[i][0]], 
                                                     bow_corpus[i][1]))

KeyError: (0, 1)