## Natural Language Processing using NLTK

In [1]:
from nltk.corpus import brown

### Data collection

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
# Take the first 100 sentences of the Brown corpus 'editorial' category.
# The slice is a lazy sequence (see the printed type); each item is a list of tokens.
data = brown.sents(categories='editorial')[:100]
print(type(data), len(data))
print(data)
print(len(data))

<class 'nltk.collections.LazySubsequence'> 100
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]
100


# NLP Pipeline
- Data Collection 
- Tokenization, Stopwords Removal, Stemming
- Building a common vocab 
- Vectorize the documents 
- Performing Classification/Clustering

## 2. Tokenization and Stopword Removal

In [4]:
text = "It was a very pleasant day, the weather was cool and there were showers. I went to market to buy some fruits."

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
sents = sent_tokenize(text)

In [7]:
print(sents)

['It was a very pleasant day, the weather was cool and there were showers.', 'I went to market to buy some fruits.']


In [8]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'showers', '.']


## Stopwords removal

In [9]:
from nltk.corpus import stopwords

In [10]:
sw = set(stopwords.words('english'))

In [11]:
print(sw, len(sw))

{'haven', 'this', 'just', 'don', "weren't", 'did', 'ma', 'as', 'those', 'ain', 'are', 'didn', 'mightn', 'our', 'you', 'these', "you've", "it's", 'too', 'being', 'their', "aren't", 'he', 'we', 'hers', 'where', 'won', 'with', "hasn't", 'themselves', "haven't", 'or', 'they', 'against', 'hadn', 'both', 'who', 'll', 'but', 're', 'couldn', 'isn', 'does', 'only', 'why', "wouldn't", 's', 'which', 'before', 'into', 'there', "won't", 'ours', 'has', 'that', 'how', 'itself', 'off', "hadn't", 'most', 'the', 'down', 'not', 'all', 'his', 'them', 'than', 'wouldn', 'if', 'after', 'other', 'yours', 'same', 've', "mustn't", 'very', 'can', 'each', 'hasn', 'doing', 'yourself', 'am', 'weren', 'for', 'do', 'my', 'have', "shouldn't", 'of', 'above', 'in', "you'd", 'mustn', 'such', 'it', 'herself', 'through', "needn't", "wasn't", 'o', 'what', 'were', 'should', 'nor', 'until', 't', 'doesn', "should've", 'him', 'i', 'on', 'aren', 'theirs', 'd', 'about', "mightn't", "didn't", 'further', 'myself', 'is', 'then', 'wi

## Filter the words from the sentence 

In [12]:
def filter_words(word_list, stop_words=None):
    """Return the words of ``word_list`` that are not stopwords, preserving order.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter. Matching is exact and case-sensitive, so tokens
        should be normalised (e.g. lowercased) the same way as the stopwords.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``sw`` set is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        # Fall back to the module-level stopword set built from NLTK.
        stop_words = sw
    return [w for w in word_list if w not in stop_words]

In [13]:
useful_words = filter_words(word_list)
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']


In [14]:
from nltk.tokenize import RegexpTokenizer

In [15]:
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")

In [16]:
sents = "send the 50 documents to abc, def, ghi."
print(tokenizer.tokenize(sents))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


# Stemming
- A process that reduces inflected or derived words to a common root form (the stem)
- jumping, jump, jumps, jumped => jump

In [17]:
text = "The quick brown fox was seen jumping over the lazy dog from high wall. Foxes love to make jumps"

In [18]:
word_list = tokenizer.tokenize(text.lower())
print(word_list)

['the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'love', 'to', 'make', 'jumps']


## Types of Stemmers 
- Snowball Stemmer (Multilingual)
- Porter Stemmer 
- Lancaster Stemmer 

In [19]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [20]:
ps = PorterStemmer()

In [21]:
# Only the value of the last expression in a cell is displayed, so the output
# shows the stem of "jumping"; the result for "jumped" is computed and discarded.
ps.stem("jumped")
ps.stem("jumping")

'jump'

In [22]:
ps.stem("lovely")

'love'

In [23]:
# The bare stem("awesome") calls are not the last expression of the cell, so
# their results are not displayed; only the print() calls below produce output.
ps.stem("awesome")
ls = LancasterStemmer()
ls.stem("awesome")

# Compare the Lancaster and Porter stemmers on the same word.
print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag


In [24]:
ss = SnowballStemmer('french')
ss.stem('courais')

'cour'

# Bag of Words

In [25]:
corpus = [
    'Indian cricket team will win world cup, says caption virat kohli, World cup will be held at India in next year.',
    'We will win next Lok Sabha Election, says Indian PM',
    'The nobel Rabindranath tagore won the hearts of the people', 
    'The movie Raazi is an exciting thriller based upon real story'
]

In [26]:
print(corpus)

['Indian cricket team will win world cup, says caption virat kohli, World cup will be held at India in next year.', 'We will win next Lok Sabha Election, says Indian PM', 'The nobel Rabindranath tagore won the hearts of the people', 'The movie Raazi is an exciting thriller based upon real story']


In [27]:
# I want to convert words into numerical features 
# Building a common vocabulary and vectorize the documents 

In [28]:
def myTokenizer(sentence):
    """Lowercase ``sentence``, split it with the regexp tokenizer, and drop stopwords.

    Returns the list produced by ``filter_words`` for the resulting tokens.
    """
    lowered = sentence.lower()
    tokens = tokenizer.tokenize(lowered)
    return filter_words(tokens)

list_words = myTokenizer(corpus[3])
print(len(list_words))

8


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
# Tokenize with myTokenizer; ngram_range=(1,3) puts unigrams, bigrams and
# trigrams into the vocabulary (see the cv.vocabulary_ output further down).
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range=(1,3))

In [48]:
vectorized_corpus = cv.fit_transform(corpus)

In [49]:
vc = vectorized_corpus.toarray()

In [50]:
print(vc[1])
print(cv.vocabulary_)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1
 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
{'indian': 28, 'cricket': 6, 'team': 70, 'win': 82, 'world': 87, 'cup': 9, 'says': 61, 'caption': 3, 'virat': 79, 'kohli': 32, 'held': 22, 'india': 25, 'next': 41, 'year': 91, 'indian cricket': 29, 'cricket team': 7, 'team win': 71, 'win world': 85, 'world cup': 88, 'cup says': 12, 'says caption': 62, 'caption virat': 4, 'virat kohli': 80, 'kohli world': 33, 'cup held': 10, 'held india': 23, 'india next': 26, 'next year': 44, 'indian cricket team': 30, 'cricket team win': 8, 'team win world': 72, 'win world cup': 86, 'world cup says': 90, 'cup says caption': 13, 'says caption virat': 63, 'caption virat kohli': 5, 'virat kohli world': 81, 'kohli world cup': 34, 'world cup held': 89, 'cup held india': 11, 'held india next': 24, 'india next year': 27, 'lok': 35, 'sabha': 58, 'election': 14, 'pm': 49, 'wi

In [38]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# first document as a one-row slice instead of the 1-D vector vc[0].
# NOTE(review): this cell's execution count (38) precedes the cell that builds
# `cv` (47), so the saved output may not match the current n-gram settings.
cv.inverse_transform(vc[:1])

[array(['caption', 'cricket', 'cup', 'held', 'india', 'indian', 'kohli',
        'next', 'says', 'team', 'virat', 'win', 'world', 'year'],
       dtype='<U12')]

# TF-IDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer, ngram_range=(1,2))

In [57]:
# Fit TF-IDF on the corpus and convert the result to a dense array for printing.
# Note: this rebinds `vectorized_corpus`, which previously held the
# CountVectorizer output, to the TF-IDF array.
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)
print(tfidf_vectorizer.vocabulary_)

[[0.         0.         0.1678685  0.1678685  0.1678685  0.1678685
  0.335737   0.1678685  0.1678685  0.         0.         0.
  0.         0.         0.         0.1678685  0.1678685  0.1678685
  0.1678685  0.13234945 0.1678685  0.         0.1678685  0.1678685
  0.         0.         0.         0.         0.13234945 0.
  0.1678685  0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.13234945 0.1678685  0.         0.         0.
  0.         0.1678685  0.1678685  0.         0.         0.
  0.         0.1678685  0.1678685  0.13234945 0.         0.1678685
  0.335737   0.335737   0.1678685 ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.27230302 0.27230302 0.
  0.         0.         0.         0.         0.         0.
  0.         0.21468683 0.         0.27230302 0.         0.
  0.27230302 0.27230302 0.         0.         0.21468683 0.27230302
  0.         0.         0.  