In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ralampay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ralampay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
raw_text = "This is an example of some text. Let's prepare this by invoking a set of routines!\nAn example of word embeddings would be shown at the bottom as a result of preparation."

print(raw_text)

This is an example of some text. Let's prepare this by invoking a set of routines!
An example of word embeddings would be shown at the bottom as a result of preparation.


In [3]:
processed_text = re.sub(r'[^\x00-\x7f]', r'', raw_text)

processed_text

"This is an example of some text. Let's prepare this by invoking a set of routines!\nAn example of word embeddings would be shown at the bottom as a result of preparation."

In [4]:
lowered_text = raw_text.lower()

lowered_text

"this is an example of some text. let's prepare this by invoking a set of routines!\nan example of word embeddings would be shown at the bottom as a result of preparation."

In [5]:
word_tokens = word_tokenize(processed_text)

word_tokens

['This',
 'is',
 'an',
 'example',
 'of',
 'some',
 'text',
 '.',
 'Let',
 "'s",
 'prepare',
 'this',
 'by',
 'invoking',
 'a',
 'set',
 'of',
 'routines',
 '!',
 'An',
 'example',
 'of',
 'word',
 'embeddings',
 'would',
 'be',
 'shown',
 'at',
 'the',
 'bottom',
 'as',
 'a',
 'result',
 'of',
 'preparation',
 '.']

In [6]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in word_tokens if token.lower() not in stop_words]

filtered_tokens

['example',
 'text',
 '.',
 'Let',
 "'s",
 'prepare',
 'invoking',
 'set',
 'routines',
 '!',
 'example',
 'word',
 'embeddings',
 'would',
 'shown',
 'bottom',
 'result',
 'preparation',
 '.']

In [7]:
word_vocabulary = list(set(sorted(filtered_tokens)))

word_vocabulary

['prepare',
 '!',
 'set',
 'text',
 "'s",
 'result',
 'word',
 'bottom',
 'invoking',
 'would',
 'routines',
 'shown',
 'Let',
 'preparation',
 'embeddings',
 'example',
 '.']

In [8]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in word_vocabulary]

stemmed_tokens

['prepar',
 '!',
 'set',
 'text',
 "'s",
 'result',
 'word',
 'bottom',
 'invok',
 'would',
 'routin',
 'shown',
 'let',
 'prepar',
 'embed',
 'exampl',
 '.']

In [9]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in word_vocabulary]

lemmatized_tokens

['prepare',
 '!',
 'set',
 'text',
 "'s",
 'result',
 'word',
 'bottom',
 'invoking',
 'would',
 'routine',
 'shown',
 'Let',
 'preparation',
 'embeddings',
 'example',
 '.']

In [10]:
word_indices = dict((word, index) for index, word in enumerate(word_vocabulary))

word_indices

{'prepare': 0,
 '!': 1,
 'set': 2,
 'text': 3,
 "'s": 4,
 'result': 5,
 'word': 6,
 'bottom': 7,
 'invoking': 8,
 'would': 9,
 'routines': 10,
 'shown': 11,
 'Let': 12,
 'preparation': 13,
 'embeddings': 14,
 'example': 15,
 '.': 16}

In [11]:
indices_words = dict((index, word) for index, word in enumerate(word_vocabulary))

indices_words

{0: 'prepare',
 1: '!',
 2: 'set',
 3: 'text',
 4: "'s",
 5: 'result',
 6: 'word',
 7: 'bottom',
 8: 'invoking',
 9: 'would',
 10: 'routines',
 11: 'shown',
 12: 'Let',
 13: 'preparation',
 14: 'embeddings',
 15: 'example',
 16: '.'}