In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import nltk

# Fetch every NLTK resource used further down, so the notebook runs top to bottom
# on a machine that has no NLTK data yet:
#   brown             -> nltk.corpus.brown
#   punkt / punkt_tab -> sent_tokenize / word_tokenize (which one is looked up depends on the NLTK release)
#   stopwords         -> nltk.corpus.stopwords
#   wordnet / omw-1.4 -> WordNetLemmatizer (omw-1.4 is only looked up by some NLTK releases)
NLTK_RESOURCES = ('brown', 'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')

# A list (not a generator) is built so that every download is attempted;
# the cell displays True only when all of them succeeded.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\pratyush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [9]:
from nltk.corpus import brown

In [11]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [12]:
data = brown.sents(categories='adventure')

In [13]:
data

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

# Bag of words pipeline

In [14]:
# Tokenisation & Stopword removal

doc = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at xyz@abc.com"

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [16]:
sents = sent_tokenize(doc) # 1. Using sent_tokenize to split the document into sentences
print(sents)

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [17]:
sentence.split()
# split() didn't break up "1,2,3" (it only splits on whitespace)

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'xyz@abc.com']

In [18]:
# 2. Using word tokenize function
words = word_tokenize(sentence)
words
# Special characters such as "@" are split off into separate tokens as well

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'xyz',
 '@',
 'abc.com']

In [28]:
# Stopwords removal
# NLTK already has stopwords collections
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

In [31]:
# sw
def remove_stopwords(text,stopwords):
    """Return the items of ``text`` that are not contained in ``stopwords``.

    Parameters
    ----------
    text : iterable
        Tokens to filter; they are visited in order.
    stopwords : container
        Collection tested with ``in``; any token found in it is dropped.

    Returns
    -------
    list
        The surviving tokens, in their original order (duplicates are kept).
    """
    kept = []
    for token in text:
        if token in stopwords:
            # stop word: skip it
            continue
        kept.append(token)
    return kept

In [33]:
text = "you're a very bad person".split()
useful_text = remove_stopwords(text,sw)
useful_text

['bad', 'person']

# Tokenization using regular expression

In [34]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at xyz@abc.com"


In [35]:
from nltk.tokenize import RegexpTokenizer

In [36]:
tokenizer = RegexpTokenizer('[a-zA-Z]+')
useful_text = tokenizer.tokenize(sentence)
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'xyz',
 'abc',
 'com']

# Stemming

In [37]:
text = """Foxes love to make jumps. 
The quick brown fox was seen jumping over the lovely dog from a 
6ft feethigh wall"""

In [38]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [46]:
ps = PorterStemmer()

In [48]:
ps.stem('jumps')

'jump'

In [51]:
ss = SnowballStemmer('french')
ss.stem('aimes')

'aim'

In [58]:
### Lemmatization: similar in purpose to stemming (both reduce a word to a base form); usually only one of the two is applied
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumpy')

<WordNetLemmatizer>


### Vectorisation


In [60]:
corpus = [
    'Indian cricket team will win the world cup',
    'We will win the next elections',
    'Climate change is real',
    'The movie raazi was very thrilling'    
]

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
cv = CountVectorizer()

In [75]:
vectorized_corpus = cv.fit_transform(corpus)


<4x20 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [78]:
vectorized_corpus = vectorized_corpus.toarray()

In [82]:
vectorized_corpus[0]

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
      dtype=int64)

In [81]:
cv.vocabulary_

{'indian': 5,
 'cricket': 2,
 'team': 11,
 'will': 17,
 'win': 18,
 'the': 12,
 'world': 19,
 'cup': 3,
 'we': 16,
 'next': 8,
 'elections': 4,
 'climate': 1,
 'change': 0,
 'is': 6,
 'real': 10,
 'movie': 7,
 'raazi': 9,
 'was': 15,
 'very': 14,
 'thrilling': 13}

In [87]:
# len() of a string counts characters; a tuple is used so that both lengths are displayed
# (a bare first expression would be evaluated and silently discarded)
len(corpus[0]), len(corpus[1])

(42, 30)

In [85]:
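# Why different? len(corpus[i]) counts the characters of one sentence, while this counts the unique words in the vocabulary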
len(cv.vocabulary_.keys())

20

In [88]:
# Reverse Mapping
numbers = vectorized_corpus[2]
numbers

array([1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [89]:
# inverse_transform expects a 2-D (n_samples, n_features) input,
# so the single row is reshaped to (1, n_features) before the call
cv.inverse_transform(numbers.reshape(1, -1))
# Notice the original word order is lost because this is a bag-of-words (BoW) model

[array(['change', 'climate', 'is', 'real'], dtype='<U9')]

# Vectorisation with stopwords removal

In [91]:
def myTokenizer(document):
    """Turn a raw document string into a list of lower-cased tokens without stop words.

    Relies on three notebook-level names: ``tokenizer`` (its ``tokenize``
    method does the splitting), the stop-word collection ``sw`` and the
    helper ``remove_stopwords``.
    """
    # lower-case first so the comparison against `sw` is done on lower-cased tokens
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # drop every token that appears in the stop-word collection
    return remove_stopwords(tokens, sw)

In [92]:
# Quick check of the custom tokenizer on the sample sentence
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'xyz', 'abc', 'com']

In [93]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [96]:
print(vectorized_corpus)

[[0 0 1 1 0 1 0 0 0 0 1 0 1 1]
 [0 0 0 0 1 0 0 1 0 0 0 0 1 0]
 [1 1 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 1 0 0]]


In [94]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [95]:
print(len(vectorized_corpus[0]))
# for test data do not call fit transform, only transform 

14


In [97]:
test_corpus = [
    'Indian cricket team is the best'    
]

In [98]:
cv.transform(test_corpus)

<1x14 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [99]:
cv.vocabulary_
# If fit_transform were called on the test data, the vocabulary would be re-learned from the test data alone and shrink. Train: fit_transform; test: transform

{'indian': 5,
 'cricket': 2,
 'team': 10,
 'win': 12,
 'world': 13,
 'cup': 3,
 'next': 7,
 'elections': 4,
 'climate': 1,
 'change': 0,
 'real': 9,
 'movie': 6,
 'raazi': 8,
 'thrilling': 11}

# More ways to create features

In [108]:
sent_1 =["this is a good movie"]
sent_2 = ["this is a good movie but actor is not present"]
sent_3 = ["this is not good movie"]


In [117]:
cv = CountVectorizer(ngram_range=(3,3)) # by default range is 1,1 meaning unigram

In [118]:
docs = [sent_1[0], sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [119]:
cv.vocabulary_

{'this is good': 6,
 'is good movie': 3,
 'good movie but': 2,
 'movie but actor': 5,
 'but actor is': 1,
 'actor is not': 0,
 'is not present': 4}

## TF-IDF normalization

In [124]:
sent_1 = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1, sent_2, sent_3]

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [121]:
tfidf = TfidfVectorizer()

In [125]:
vc = tfidf.fit_transform(corpus).toarray()

In [126]:
vc

array([[0.46333427, 0.59662724, 0.46333427, 0.        , 0.46333427,
        0.        ],
       [0.41285857, 0.        , 0.41285857, 0.        , 0.41285857,
        0.69903033],
       [0.3645444 , 0.46941728, 0.3645444 , 0.61722732, 0.3645444 ,
        0.        ]])

In [127]:
tfidf.vocabulary_
# 'not' (column 3) gets a higher weight in vc than the words that occur in every sentence

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}