# Text Processing

In [1]:
import nltk

In [2]:
#nltk.download('brown')

In [3]:
from nltk.corpus import brown

In [4]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [5]:
brown.words(), len(brown.words())

(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], 1161192)

In [6]:
data = brown.sents(categories=['mystery'])

In [7]:
" ".join(data[5])

'We had become good friends during my stay at Cook County Hospital .'

## Tokenization

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
#nltk.download('punkt')

In [10]:
document = """ I love Travelling. I want to travel each and every beautiful places of the world. I am curious about being me."""

sentence = "Wanderlust! Lets Travel"

In [11]:
sents = sent_tokenize(document)
print(sents)
len(sents)

[' I love Travelling.', 'I want to travel each and every beautiful places of the world.', 'I am curious about being me.']


3

In [12]:
words = word_tokenize(sentence)               # also break down special characters
print(words)
len(words)

['Wanderlust', '!', 'Lets', 'Travel']


4

## StopWord Removal

In [13]:
from nltk.corpus import stopwords

In [14]:
sw = set(stopwords.words("english"))

In [15]:
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
text = "I want to travel each and every beautiful places of the world".split()
print(text)

['I', 'want', 'to', 'travel', 'each', 'and', 'every', 'beautiful', 'places', 'of', 'the', 'world']


In [17]:
def remove_stopwords(text, stopwords):
    text = [word.lower() for word in text]
    useful = [word for word in text if word not in stopwords]
    return useful

In [18]:
useful_words = remove_stopwords(text, sw)

In [19]:
useful_words

['want', 'travel', 'every', 'beautiful', 'places', 'world']

In [20]:
# tokenisation using regex

In [21]:
sent = "My email is sameeksha@gmail.com, please don't spam my inbox"

In [22]:
from nltk.tokenize import RegexpTokenizer

In [23]:
tokenizer = RegexpTokenizer('[a-zA-Z@.\']+')
useful = tokenizer.tokenize(sent)
print(useful)

['My', 'email', 'is', 'sameeksha@gmail.com', 'please', "don't", 'spam', 'my', 'inbox']


## Stemming 

In [24]:
# nltk provides us: Porter, Snowball, Lancaster stemmers

In [25]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [26]:
ps = PorterStemmer()

In [27]:
ps.stem("wanting")

'want'

In [28]:
ps.stem("wanted")

'want'

In [29]:
# SnowballStemmer = Multilingul, supports other langs also.

## Count Vectorization to get Vectorized Corpus

In [30]:
corpus = [
    'Dan Morgan told himself he would forget Ann Turner.',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
cv = CountVectorizer()

In [33]:
vc = cv.fit_transform(corpus)

In [34]:
vc = vc.toarray()
print(vc)
print(cv.vocabulary_)

[[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1]
 [1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 2 0 0
  0 1 0 1 0 0 2 1 1 0 1 0 0 0 1 0 0 1 0]
 [2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1
  0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 4 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
  1 1 0 0 1 1 0 0 0 1 2 0 1 0 0 2 1 0 0]]
{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 

In [35]:
print(len(cv.vocabulary_))                         # cv.vocabulary_ is the total words in corpus without removing stopwords

55


In [36]:
print(len(vc[2]))                                            # vc is the vectorize corpus which contains the count of words in the sentence.
                                                                    # All the text in vc are of same length.

55


In [37]:
print(len(vc[1]))

55


# Overall Text Processing

In [38]:
tokenizer = RegexpTokenizer('[a-zA-Z@.\']+')

def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())                           # Using Regex ( See above in StopWord Removal)
    # remove the stopwords
    words = remove_stopwords(words, sw)
    return words

In [39]:
myTokenizer('this is a random text')

['random', 'text']

In [66]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [67]:
vc = cv.fit_transform(corpus)

In [68]:
vc = vc.toarray()

In [69]:
print(vc)
print("length: ", len(vc[0]))

[[0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]
length:  33


In [70]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [71]:
sentn = "I want to travel each and every beautiful places of the world"

In [74]:
len(cv.transform([sentn]).toarray()[0]), cv.transform([sentn]).toarray()[0]

(33,
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))

In [73]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [63]:
sent = "My email is sameeksha@gmail.com, please don't spam my inbox"

In [64]:
len(cv.fit_transform([sent]).toarray()[0] )                        # prediction

5

In [65]:
cv.vocabulary_

{'email': 0, 'sameeksha@gmail.com': 3, 'please': 2, 'spam': 4, 'inbox': 1}