In [12]:
## NLTK-  Natural Language Toolkit
import nltk

In [13]:
from nltk.corpus import brown

In [14]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [15]:
data = brown.sents(categories='fiction')
' '.join(data[1])

'Scotty did not go back to school .'

#### Bag of Words Pipeline¶
* Get the Data/Corpus
* Tokenisation, Stopward Removal
* Stemming
* Building a Vocab
* Vectorization
* Classification

In [19]:

document = """Neuralink Corporation is an American neurotechnology company founded by Elon Musk and others,developing implantable brain–machine interfaces. The company's headquarters are in San Francisco; it was started in 2016 and was first publicly reported in March 2017."""

sentence = "Neuralink- Developing ultra high bandwidth brain-machine interfaces to connect humans and computers."

In [20]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [21]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['Neuralink Corporation is an American neurotechnology company founded by Elon Musk and others,developing implantable brain–machine interfaces.', "The company's headquarters are in San Francisco; it was started in 2016 and was first publicly reported in March 2017."]
2


In [22]:
sents[0]

'Neuralink Corporation is an American neurotechnology company founded by Elon Musk and others,developing implantable brain–machine interfaces.'

In [23]:
sentence.split()

['Neuralink-',
 'Developing',
 'ultra',
 'high',
 'bandwidth',
 'brain-machine',
 'interfaces',
 'to',
 'connect',
 'humans',
 'and',
 'computers.']

In [25]:
 word_tokenize(sentence)

['Neuralink-',
 'Developing',
 'ultra',
 'high',
 'bandwidth',
 'brain-machine',
 'interfaces',
 'to',
 'connect',
 'humans',
 'and',
 'computers',
 '.']

In [26]:
words = word_tokenize(sentence)

### STOPWORDS

In [27]:
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

In [28]:
print(sw)

{'having', 'own', "needn't", 'in', 'than', 'wasn', 'nor', 'did', 'these', 'your', 'myself', 'herself', 'needn', 'should', 'how', "aren't", 'which', 'isn', 'her', 'this', 'over', 'have', 'doing', 'down', 'who', 'so', 'shouldn', 'if', "hasn't", 'any', 'other', 'under', 'wouldn', 'not', 'both', 'our', 'shan', "you'll", 'doesn', 'him', 'don', 'haven', "weren't", 'there', 'hadn', 'through', 'himself', 'off', "shan't", 'such', 'i', 'at', "you're", 'o', 'an', "didn't", 'me', 'does', 'below', 'its', 'no', 'by', 'mightn', 'weren', 'hasn', 'before', "shouldn't", 'as', "it's", 'hers', 'it', 'mustn', "wouldn't", "she's", 'about', 'between', "hadn't", 'was', 'yourself', 'now', "don't", 'my', 'until', 'after', 'with', 'during', 'the', 'm', "that'll", "couldn't", 'some', 'd', 'will', 'yours', 'had', 'you', "you've", 'on', 'because', 'then', 'were', 'out', 'each', "doesn't", 'more', 'themselves', 'only', "you'd", 'can', 'whom', 'ma', "won't", 'here', 'again', 'aren', 'they', "mustn't", 'against', 'did

In [29]:
def remove_stopwords(text,stopwords):
    useful_words = [w for w in text if w not in stopwords]
    return useful_words

In [30]:
text = "This is not a good movie".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['This', 'good', 'movie']


##### Tokenization using Regular Expression

In [32]:
from nltk.tokenize import RegexpTokenizer

In [33]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [34]:
useful_text

['Neuralink',
 'Developing',
 'ultra',
 'high',
 'bandwidth',
 'brain',
 'machine',
 'interfaces',
 'to',
 'connect',
 'humans',
 'and',
 'computers.']

#### STEMMING

In [37]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [38]:
ps = PorterStemmer()

In [39]:
ps.stem("jumping")

'jump'

In [40]:
ps.stem("driving")

'drive'

In [41]:
ss = SnowballStemmer('english')

In [43]:
ss.stem('jumping')

'jump'

In [44]:
ss.stem("driving")

'drive'

In [48]:
ss = SnowballStemmer('french')

In [49]:
ss.stem("sauter")

'saut'

In [51]:
ss.stem("dansant")

'dans'

In [52]:
## Lemmatization
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

#### Building a Vocabulary & Vectorization

In [57]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [59]:
vectorized_corpus = cv.fit_transform(corpus)

In [60]:
vectorized_corpus = vectorized_corpus.toarray()

In [61]:
len(vectorized_corpus[0])
print(vectorized_corpus[0])

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [62]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [63]:
len(cv.vocabulary_.keys())

42

In [64]:
# Reverse Mapping!
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [65]:
s = cv.inverse_transform(numbers)
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


In [67]:
## Vectorization with Stopword Removal¶

def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    words = remove_stopwords(words,sw)
    return words

In [69]:
myTokenizer(sentence)

['neuralink',
 'developing',
 'ultra',
 'high',
 'bandwidth',
 'brain',
 'machine',
 'interfaces',
 'connect',
 'humans',
 'computers.']

In [70]:
print(sentence)

Neuralink- Developing ultra high bandwidth brain-machine interfaces to connect humans and computers.


In [71]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [73]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [74]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [75]:
print(len(vectorized_corpus[0]))

33


In [76]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [81]:
# For Test Data
test_corpus = [
        "Virat Kohli has been one of the most successful captains in Indian cricket history"        
]
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

#### More ways to Create Features
* Unigram - every word as a feature
* Bigrams
* Trigrams
* n-grams
* TF-IDF Normalisation

In [82]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [86]:
cv = CountVectorizer(ngram_range=(1,2))
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [87]:
cv.vocabulary_

{'this': 14,
 'is': 6,
 'good': 4,
 'movie': 9,
 'this is': 15,
 'is good': 7,
 'good movie': 5,
 'but': 2,
 'actor': 0,
 'not': 11,
 'present': 13,
 'movie but': 10,
 'but actor': 3,
 'actor is': 1,
 'is not': 8,
 'not present': 12}

In [88]:
cv = CountVectorizer(ngram_range=(1,3))

In [89]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int64)

In [90]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

#### Tf-idf Normalisation

In [91]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [93]:
tfidf = TfidfVectorizer()
vc = tfidf.fit_transform(corpus).toarray()

In [94]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [95]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}