In [None]:
!pip install nltk

In [2]:
import nltk

In [None]:
# Download only the resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which blocks "Restart & Run All".
for resource in ('brown', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [1]:
# Corpus- A large collection of data
from nltk.corpus import brown

In [4]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [5]:
# Sentences of the 'adventure' category; each sentence is a list of word tokens
data=brown.sents(categories='adventure')

In [7]:
len(data)

4637

In [9]:
print(data[2])

['He', 'certainly', "didn't", 'want', 'a', 'wife', 'who', 'was', 'fickle', 'as', 'Ann', '.']


In [11]:
' '.join(data[2])

"He certainly didn't want a wife who was fickle as Ann ."

### Bag of Words Pipeline
- Get the Data/Corpus
- Tokenisation/Stopword Removal
- Stemming
- Building a Vocab
- Vectorization
- Classification

In [12]:
# Bag of words converts words to numbers (numeric feature vectors)
# These numbers can then be fed to a classifier, e.g. to determine sentiment
# Tokenisation - split the data into sentences, and the sentences into words
# Stopword removal - remove the words which carry little meaning on their own,
# like was, had, I, they. Stemming - reduce words like playing and plays to the base form play
# Group the remaining words to make a vocab, then give each word a number (its index)
# Vectorization - mark the word counts in vectors and then feed them to the classifier

## Tokenization

In [34]:
document='''It was a very pleasant day. The weather was very cool and there were light showers. I went to the market to buy some'''

sentence="Send all the documents to soumyjain14@gmail.com"

In [None]:
nltk.download('punkt')

In [14]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [14]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [35]:
sent=sent_tokenize(document)

In [38]:
sent

['It was a very pleasant day.',
 'The weather was very cool and there were light showers.',
 'I went to the market to buy some']

In [29]:
wor=word_tokenize(sentence)

In [33]:
wor

['Send', 'all', 'the', 'documents', 'to', 'soumyjain14', '@', 'gmail.com']

### Stopwords

In [39]:
from nltk.corpus import stopwords

In [40]:
# English stopword list wrapped in a set: drops duplicates and suits the `in` checks used below
sw=set(stopwords.words('english'))

In [43]:
print(len(sw))

179


In [44]:
# Helper that filters stopwords out of a sequence of tokens
def remove_stopwards(text,stopward):
    """Return the tokens of `text` that are not contained in `stopward`.

    text     -- iterable of tokens (e.g. a list of words)
    stopward -- container of tokens to drop; anything supporting `in`
    The kept tokens are returned as a new list in their original order.
    """
    kept=[]
    for token in text:
        if token in stopward:
            continue
        kept.append(token)
    return kept

In [55]:
# Lower-case and split on whitespace before filtering against the stopword set.
# Note: the negation 'not' is removed as well (see the output), which changes the meaning.
text='He is not bothered by her character'.lower().split()
remove_stopwards(text,sw)

['bothered', 'character']

In [56]:
# Another method to tokenize-Regular Expressions method

In [57]:
from nltk.tokenize import RegexpTokenizer

In [79]:
# Keep letters, digits, '@' and '.' together so an e-mail address stays one token.
# The earlier class listed only the digits 1 and 4, which matched the sample
# address but would split tokens containing any other digit.
tokenizer=RegexpTokenizer(r'[a-zA-Z0-9@.]+')

In [80]:
sent=tokenizer.tokenize(sentence)

In [81]:
sent

['Send', 'all', 'the', 'documents', 'to', 'soumyjain14@gmail.com']

### Stemming
- Snowball
- Porter
- Lancaster Stemmer

In [82]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [83]:
pc=PorterStemmer()

In [86]:
pc.stem('jumping')

'jump'

In [87]:
pc.stem('lovely')

'love'

In [93]:
## Lemmatization - similar goal to stemming, but returns dictionary (WordNet) base forms instead of chopping suffixes
from nltk.stem import WordNetLemmatizer

In [94]:
lc=WordNetLemmatizer()

In [95]:
# The misspelt 'jumpg' comes back unchanged (see output) — presumably because it is not a WordNet entry
lc.lemmatize('jumpg')

'jumpg'

### Vectorization

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
cv=CountVectorizer()

In [4]:
corpus=['Virat kohli is the captain of the Indian cricket team.',
'Prime Minister of our country is Modi.',
       'the nobel laurate won the price']

In [5]:
vectorized=cv.fit_transform(corpus)

In [6]:
vectorized

<3x18 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [7]:
# Convert the sparse count matrix to a dense NumPy array for display.
# NOTE: rebinding `vectorized` makes this cell non-idempotent — running it a
# second time fails because a NumPy array has no .toarray() method.
vectorized=vectorized.toarray()

In [8]:
vectorized

array([[1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1]],
      dtype=int64)

In [9]:
cv.vocabulary_

{'virat': 16,
 'kohli': 5,
 'is': 4,
 'the': 15,
 'captain': 0,
 'of': 10,
 'indian': 3,
 'cricket': 2,
 'team': 14,
 'prime': 13,
 'minister': 7,
 'our': 11,
 'country': 1,
 'modi': 8,
 'nobel': 9,
 'laurate': 6,
 'won': 17,
 'price': 12}

In [113]:
vectorized[1]  # count vector for the 2nd sentence

array([0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

In [10]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so slice with
# 2:3 to keep the third document as a one-row matrix rather than a 1-D vector.
numbers=vectorized[2:3]
print(cv.inverse_transform(numbers))

[array(['laurate', 'nobel', 'price', 'the', 'won'], dtype='<U8')]


In [11]:
print(len(vectorized[1]))

18


In [13]:
len(corpus[1])

38

In [14]:
# Note: 38 is the number of characters in the sentence, while 18 is the vocabulary size (features per vector) - the two lengths measure different things

In [16]:
cv.inverse_transform(vectorized)

[array(['captain', 'cricket', 'indian', 'is', 'kohli', 'of', 'team', 'the',
        'virat'], dtype='<U8'),
 array(['country', 'is', 'minister', 'modi', 'of', 'our', 'prime'],
       dtype='<U8'),
 array(['laurate', 'nobel', 'price', 'the', 'won'], dtype='<U8')]

### More ways to create features
- Unigram - single words
- Bigram - combining two words together to capture meaning, like "not good"
- Trigram - combining three words together
- n-gram - combining n consecutive words

In [17]:
cv=CountVectorizer()

In [18]:
sent_1=["This is a good movie"]
sent_2=["This is not a good movie"]

In [19]:
docs=[sent_1[0],sent_2[0]]

In [22]:
vector=cv.fit_transform(docs)

In [24]:
vector=vector.toarray()

In [25]:
vector

array([[1, 1, 1, 0, 1],
       [1, 1, 1, 1, 1]], dtype=int64)

In [26]:
cv.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'not': 3}

## Bigram

In [27]:
cv=CountVectorizer(ngram_range=(2,2)) #ngram_range=(1,1) for unigram by default

In [29]:
vector=cv.fit_transform(docs)

In [30]:
vector=vector.toarray()

In [31]:
cv.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

In [32]:
# For trigrams only, the range is (3,3)
# With ngram_range=(1,3), everything from one-word to three-word n-grams is included

In [34]:
cv=CountVectorizer(ngram_range=(1,3)) # (1,3): unigrams through trigrams; the default (1,1) gives unigrams only

# Learn the n-gram vocabulary from docs and count occurrences per document
vector=cv.fit_transform(docs)

# Dense array form of the sparse count matrix
vector=vector.toarray()

# Maps each n-gram to its column index
print(cv.vocabulary_)

{'this': 11, 'is': 2, 'good': 0, 'movie': 7, 'this is': 12, 'is good': 3, 'good movie': 1, 'this is good': 13, 'is good movie': 4, 'not': 8, 'is not': 5, 'not good': 9, 'this is not': 14, 'is not good': 6, 'not good movie': 10}


## Tf-Idf Normalization

In [35]:
# Term Frequency-Inverse Document Frequency
# Each term is assigned a weight instead of a raw count
# Terms that occur in many documents are less informative and get a lower weight
# Terms that occur in few documents get a higher weight

In [36]:
sent_1=['This is a good movie']
sent_2=['This was not a good movie']
sent_3=['This was a good movie not good acting though']

In [37]:
corpus=[sent_1[0],sent_2[0],sent_3[0]]

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
tfidf=TfidfVectorizer()

In [42]:
# Fit TF-IDF on the corpus and densify: one row per sentence, one column per vocabulary term
tfi=tfidf.fit_transform(corpus).toarray()

In [44]:
print(tfi)

[[0.         0.41285857 0.69903033 0.41285857 0.         0.41285857
  0.         0.        ]
 [0.         0.3978967  0.         0.3978967  0.51236445 0.3978967
  0.         0.51236445]
 [0.43644562 0.51554363 0.         0.25777181 0.33192814 0.25777181
  0.43644562 0.33192814]]


In [45]:
tfidf.vocabulary_

{'this': 5,
 'is': 2,
 'good': 1,
 'movie': 3,
 'was': 7,
 'not': 4,
 'acting': 0,
 'though': 6}