In [1]:
from nltk.corpus import brown

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data=brown.sents(categories='editorial')

In [4]:
len(data)

2997

In [5]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [6]:
# Tokenization

In [7]:
from nltk import sent_tokenize, word_tokenize

In [8]:
sent = "Where can I find past ACM ICPC regionals and finals questions with solutions?"

In [9]:
sents = sent_tokenize(sent.lower())

In [10]:
words = set(word_tokenize(sents[0]))

In [11]:
print(words)

{'finals', 'with', 'and', 'regionals', 'questions', 'find', 'where', 'acm', 'icpc', 'can', '?', 'past', 'solutions', 'i'}


In [12]:
# Stopwords

In [13]:
from nltk.corpus import stopwords

In [14]:
sw = set(stopwords.words('english'))

In [15]:
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
useful_words = [w for w in words if w not in sw]
print(useful_words)

['finals', 'regionals', 'questions', 'find', 'acm', 'icpc', '?', 'past', 'solutions']


In [17]:
# regex tokenization

In [18]:
from nltk.tokenize import RegexpTokenizer

In [19]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [21]:
text = "Hey there! how are you?"
print(tokenizer.tokenize(text))

['Hey', 'there', 'how', 'are', 'you']


### Stemming
- A process that reduces words (verb forms, plurals) to their root (stem) form
- Preserves the semantics of the sentence without increasing the number of unique tokens
- Jumps, jumping, jumped, jump ==> jump

In [22]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over
        the lovely dog from a 6ft high wall."""
word_list = tokenizer.tokenize(text.lower())
print(word_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'high', 'wall']


In [24]:
def filter_words(words, sw):
    """Return the tokens from ``words`` that are not stopwords.

    Args:
        words: iterable of tokens to filter.
        sw: container of stopwords; membership is tested with ``in``.

    Returns:
        A new list holding the surviving tokens in their original order
        (duplicates are kept).
    """
    kept = []
    for token in words:
        if token in sw:
            # Stopword: drop it and move on to the next token.
            continue
        kept.append(token)
    return kept

word_list = filter_words(word_list, sw)

In [25]:
word_list

['foxes',
 'love',
 'make',
 'jumps',
 'quick',
 'brown',
 'fox',
 'seen',
 'jumping',
 'lovely',
 'dog',
 'ft',
 'high',
 'wall']

### Stemming
- Snowball Stemmer
- Porter Stemmer
- Lancaster Stemmer

In [26]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [27]:
ps = PorterStemmer()

In [28]:
ps.stem("Jumps")

'jump'

In [29]:
ps.stem("Lovely")

'love'

In [30]:
ls = LancasterStemmer()

In [31]:
ls.stem("Lovely")

'lov'

In [33]:
ss = SnowballStemmer(language='english')

In [34]:
ss.stem("Lovely")

'love'

### Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
lem = WordNetLemmatizer()

In [38]:
lem.lemmatize("crying")

'cry'

#### Building a common vocabulary and vectorizing documents

In [45]:
corpus = [
    'Presenting the second video song "Dekhte Dekhte" from the upcoming Hindi movie "Batti Gul Meter Chalu".',
    'This Bollywood rendition of Nusrat Fateh Ali Khan original song is done by Rochak Kohli and sung by Atif Aslam.',
    'Presenting the first official song Milegi Milegi sung by Mika Singh from the movie Stree.',
    'Stree is a first of its kind horror comedy, inspired from a true phenomenon'
]

In [46]:
print(corpus)

['Presenting the second video song "Dekhte Dekhte" from the upcoming Hindi movie "Batti Gul Meter Chalu".', 'This Bollywood rendition of Nusrat Fateh Ali Khan original song is done by Rochak Kohli and sung by Atif Aslam.', 'Presenting the first official song Milegi Milegi sung by Mika Singh from the movie Stree.', 'Stree is a first of its kind horror comedy, inspired from a true phenomenon']


In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
cv = CountVectorizer()

In [49]:
vectorized_corpus = cv.fit_transform(corpus).todense()

In [50]:
vectorized_corpus

matrix([[0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0,
         0, 1, 1],
        [1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
         0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 2, 0,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
         1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         1, 0, 0]], dtype=int64)

In [51]:
print(cv.vocabulary_)

{'presenting': 32, 'the': 40, 'second': 35, 'video': 44, 'song': 37, 'dekhte': 9, 'from': 13, 'upcoming': 43, 'hindi': 15, 'movie': 26, 'batti': 4, 'gul': 14, 'meter': 23, 'chalu': 7, 'this': 41, 'bollywood': 5, 'rendition': 33, 'of': 28, 'nusrat': 27, 'fateh': 11, 'ali': 0, 'khan': 20, 'original': 30, 'is': 18, 'done': 10, 'by': 6, 'rochak': 34, 'kohli': 22, 'and': 1, 'sung': 39, 'atif': 3, 'aslam': 2, 'first': 12, 'official': 29, 'milegi': 25, 'mika': 24, 'singh': 36, 'stree': 38, 'its': 19, 'kind': 21, 'horror': 16, 'comedy': 8, 'inspired': 17, 'true': 42, 'phenomenon': 31}


In [52]:
# Given a count vector, recover which vocabulary terms it marks as present.
import numpy as np
# NOTE(review): 37 is hard-coded, while the vocabulary printed by the previous
# cell has 45 entries (indices 0-44). This vector therefore only addresses
# term indices 0-36 — confirm whether the full vocabulary size was intended.
vector = np.ones((37,))
vector[3:7] = 0  # clear indices 3, 4, 5 and 6 so those terms are treated as absent
print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [53]:
# inverse_transform is documented for 2-D input of shape (n_samples, n_features);
# a bare 1-D vector is rejected by current scikit-learn input validation.
# Present the single vector as one row so the call works across versions.
print(cv.inverse_transform(vector.reshape(1, -1)))

[array(['ali', 'and', 'aslam', 'chalu', 'comedy', 'dekhte', 'done',
       'fateh', 'first', 'from', 'gul', 'hindi', 'horror', 'inspired',
       'is', 'its', 'khan', 'kind', 'kohli', 'meter', 'mika', 'milegi',
       'movie', 'nusrat', 'of', 'official', 'original', 'phenomenon',
       'presenting', 'rendition', 'rochak', 'second', 'singh'],
      dtype='<U10')]


In [61]:
### Effectively reduce the size of the vector
def myTokenizer(sentence, stop_words=None):
    """Lowercase, regex-tokenize and stopword-filter a single document.

    Intended to be passed as ``tokenizer=`` to ``CountVectorizer`` /
    ``TfidfVectorizer``, which call it with one positional argument.

    Args:
        sentence: the raw document text.
        stop_words: optional container of stopwords to drop. When omitted,
            the notebook-level English stopword set ``sw`` is used.

    Returns:
        A list of lowercase tokens with the stopwords removed.
    """
    if stop_words is None:
        # Reuse the stopword set built once earlier in the notebook instead of
        # re-reading the NLTK stopword list for every document that is tokenized.
        stop_words = sw
    words = tokenizer.tokenize(sentence.lower())
    return filter_words(words, stop_words)

In [63]:
myTokenizer(corpus[0])

['presenting',
 'second',
 'video',
 'song',
 'dekhte',
 'dekhte',
 'upcoming',
 'hindi',
 'movie',
 'batti',
 'gul',
 'meter',
 'chalu']

In [66]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))

[0 0 0 1 0 1 0 2 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1]
37


In [67]:
# Decode the first document's count vector back into the terms it contains.
v = vc[0]
# inverse_transform is documented for 2-D (n_samples, n_features) input, so pass
# the 1-D row as a single-sample matrix; the result is still a one-element list.
cv.inverse_transform(v.reshape(1, -1))

[array(['batti', 'chalu', 'dekhte', 'gul', 'hindi', 'meter', 'movie',
        'presenting', 'second', 'song', 'upcoming', 'video'], dtype='<U10')]

## Features in Bag of words model
- Unigram
- Bigram
- Trigram
- N gram

In [70]:
cv = CountVectorizer(tokenizer=myTokenizer, ngram_range=(1,3))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(cv.vocabulary_)

{'presenting': 77, 'second': 88, 'video': 114, 'song': 94, 'dekhte': 16, 'upcoming': 111, 'hindi': 35, 'movie': 63, 'batti': 6, 'gul': 32, 'meter': 53, 'chalu': 12, 'presenting second': 80, 'second video': 89, 'video song': 115, 'song dekhte': 95, 'dekhte dekhte': 17, 'dekhte upcoming': 19, 'upcoming hindi': 112, 'hindi movie': 36, 'movie batti': 64, 'batti gul': 7, 'gul meter': 33, 'meter chalu': 54, 'presenting second video': 81, 'second video song': 90, 'video song dekhte': 116, 'song dekhte dekhte': 96, 'dekhte dekhte upcoming': 18, 'dekhte upcoming hindi': 20, 'upcoming hindi movie': 113, 'hindi movie batti': 37, 'movie batti gul': 65, 'batti gul meter': 8, 'gul meter chalu': 34, 'bollywood': 9, 'rendition': 82, 'nusrat': 67, 'fateh': 24, 'ali': 0, 'khan': 44, 'original': 73, 'done': 21, 'rochak': 85, 'kohli': 50, 'sung': 104, 'atif': 4, 'aslam': 3, 'bollywood rendition': 10, 'rendition nusrat': 83, 'nusrat fateh': 68, 'fateh ali': 25, 'ali khan': 1, 'khan original': 45, 'original

In [72]:
print(vc) # frequency of words

[[0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
  1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 1 1 1 1 1]
 [1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0
  0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 1 0
  0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 1 1 1 1 1 0 0 1 0 0 0 1 1
  1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 1
  1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0
  0 0 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0
  0 1 1 0 0 0 0 0 0]]


### TF-IDF Normalization
- Avoid over-weighting features that occur very often, because they carry less information
- The information a term carries decreases as the number of its occurrences across different types of documents increases
- So we define the term-document frequency, which associates each term with its frequency across documents

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tdidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer)
