## Basics of NLP

#### A core step in NLP is converting words into numerical vectors so that we can perform mathematical operations on them

In [3]:
# pip install scikit-learn   (the PyPI package is named scikit-learn; it is imported as sklearn)

#### 1. Bag-of-Words Approach

In [13]:
# Importing libraries
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Training set: four short sentences used to fit the vectorizer and classifier.
# Objective is to convert the text into numerical vectors

train_x = [
    'i love the book',
    'this is a great book',
    'the fit is great',
    'i love the shoes',
]

In [16]:
# Instantiating
# Unigram model: each vocabulary word becomes one count column
vectorizer = CountVectorizer()

# Unigram + bigram model (the keyword is `ngram_range`, not `ngram`)
# vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))

# Learn the vocabulary from train_x and encode every sentence as a row of counts
train_x_vectors = vectorizer.fit_transform(train_x)

# getting all feature names (the column order of the matrix printed below)
print(vectorizer.get_feature_names_out())

# Numerical vector array of the text, one row per sentence in train_x
print(train_x_vectors.toarray())

['book' 'fit' 'great' 'is' 'love' 'shoes' 'the' 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


###### Note: with its default `token_pattern`, `CountVectorizer` only keeps tokens of two or more alphanumeric characters, so the single-letter words 'a' and 'i' are dropped from the vocabulary

In [17]:
### Next step: classification.
# Declare the target labels, one per training sentence.

class Category:
    """String constants naming the two target classes."""
    BOOKS = 'BOOKS'
    CLOTHING = 'CLOTHING'


# train_y[i] is the category of train_x[i]
train_y = [
    Category.BOOKS,     # 'i love the book'
    Category.BOOKS,     # 'this is a great book'
    Category.CLOTHING,  # 'the fit is great'
    Category.CLOTHING,  # 'i love the shoes'
]

In [18]:
# Training on Category: fit a linear-kernel SVM on the bag-of-words vectors

from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
# Left as a bare last expression so the fitted estimator's repr is displayed
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [19]:
# Predicting on unknown values
# 'like' is not among the fitted feature names, so only 'this' and 'book'
# have a column in this sentence's vector.

new_x = vectorizer.transform(['i like this book'])
clf_svm.predict(new_x)

array(['BOOKS'], dtype='<U8')

In [20]:
# Variant containing 'shoes', a word that only occurs in a CLOTHING training sentence
new_x = vectorizer.transform(['i like this shoes'])
clf_svm.predict(new_x)

array(['CLOTHING'], dtype='<U8')

In [None]:
### Limitation to note: words outside the training vocabulary have no column in the vector, so sentences made up mostly of unseen words will be classified poorly

#### 2. Word Vectors using SpaCy approach

In [1]:
# pip install spacy
# !python -m spacy download en_core_web_md

In [2]:
import spacy

In [3]:
# Load the 'en_core_web_md' pipeline (it must be downloaded first, see the commented command above)
nlp = spacy.load('en_core_web_md')

In [7]:
# Reminder of the training sentences defined in the bag-of-words section
print(train_x)

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [22]:
# Run every training sentence through the spaCy pipeline, then use each
# parsed document's `.vector` as that sentence's feature representation.
docs = [nlp(sentence) for sentence in train_x]
train_x_word_vectors = [doc.vector for doc in docs]

In [None]:
# print(docs[1].vector)

In [23]:
# Same linear SVM as in the bag-of-words section, now trained on the spaCy vectors.
# NOTE: relies on `svm` (imported) and `train_y` (defined) in the bag-of-words section.
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, train_y)

SVC(kernel='linear')

In [32]:
# 'story' and 'king' never occur in train_x; the sentence is encoded the same
# way as the training data (spaCy document vector) before prediction.
test_x = ['this is a story of a king']
test_docs = [nlp(sentence) for sentence in test_x]
test_x_wv = [doc.vector for doc in test_docs]
clf_svm_wv.predict(test_x_wv)

array(['BOOKS'], dtype='<U8')

In [None]:
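## With spaCy's pretrained word vectors the classifier can handle many words it never saw during training
## (here 'story' and 'king' still lead to the BOOKS prediction), which makes this approach much more powerful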

#### Regexes for NLP

In [33]:
# Goal: keep only the strings that
#   - start with 'ab'
#   - contain no whitespace
#   - end with 'cd'

import re
regexp = re.compile(r'^ab[^\s]*cd$')
test = ['abcd', 'xx', 'ab cd', 'abghsgcd']

# Filter the candidates with the compiled pattern's own match() method
match = [candidate for candidate in test if regexp.match(candidate)]

match

['abcd', 'abghsgcd']

In [None]:
## re.match() only matches at the beginning of the string; to find the pattern anywhere inside a string, use re.search()

In [None]:
# Search for words in phrases
# re.compile(r'\bread|\bstory|\bbook')
# \b is a word boundary; the raw string (r'...') is required, otherwise Python reads '\b' as a backspace character

#### 3. Stemming/Lemmatization using NLTK

In [37]:
# !pip install nltk

In [45]:
import nltk

# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')

##### Stemming

In [42]:
# word_tokenize: splits a sentence into word tokens
from nltk.tokenize import word_tokenize

# PorterStemmer: normalizes each word by reducing it to its stem
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

phrase = 'I am reading a book and writing'
words = word_tokenize(phrase)

# Stem every token; note in the printed result that 'I' comes back as 'i'
stemmed_words = [stemmer.stem(token) for token in words]
print(stemmed_words)

" ".join(stemmed_words)

# Tip: avoid punctuation in the phrase

['i', 'am', 'read', 'a', 'book', 'and', 'write']


'i am read a book and write'

##### Lemmatizing

In [50]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

phrase = 'I am reading a book and writing'
words = word_tokenize(phrase)

# pos='v' makes the lemmatizer treat every token as a verb
lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in words]
print(lemmatized_words)

" ".join(lemmatized_words)

['I', 'be', 'read', 'a', 'book', 'and', 'write']


'I be read a book and write'

In [None]:
### Lemmatizing with pos='v' reduces verbs back to their base (dictionary) form, e.g. 'am' -> 'be', 'reading' -> 'read'

#### 4. Stop Words Removal

In [None]:
## Stop words are the most common words in English (e.g. 'the', 'is', 'and'), which usually carry little meaning on their own

In [55]:
from nltk.corpus import stopwords

# NLTK's English stop-word list (requires the 'stopwords' corpus to be downloaded)
stop_words = stopwords.words('english')
print(len(stop_words))
print(stop_words)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [57]:
# Removing stopwords

phrase = 'I am going to the market to buy some clothes'

words = word_tokenize(phrase)

# NOTE: the membership test is case-sensitive. The list contains 'i' but not
# 'I', so the capitalised pronoun is kept; compare token.lower() instead if it
# should be removed as well.
stripped_words = [token for token in words if token not in stop_words]
print(stripped_words)

" ".join(stripped_words)


['I', 'going', 'market', 'buy', 'clothes']


'I going market buy clothes'

##### Some other techniques

##### Spell Correction, Sentiment Analysis, POS Tagging

In [60]:
# pip install textblob

In [70]:
# Spell Correction

from textblob import TextBlob

phrase = 'this is a bad exampelee'

tb_phrase = TextBlob(phrase)
# The corrected text is only printed; tb_phrase itself still holds the misspelt phrase
print(tb_phrase.correct())

this is a bad example


In [66]:
# !python -m textblob.download_corpora

In [71]:
# Part-of-speech tags as (word, tag) pairs, computed on the uncorrected phrase
tb_phrase.tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('bad', 'JJ'),
 ('exampelee', 'NN')]

In [None]:
## NN = noun (singular), VBZ = verb (3rd person singular present), DT = determiner, JJ = adjective

In [72]:
# Sentiment of the phrase as a (polarity, subjectivity) pair
tb_phrase.sentiment

Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)

In [None]:
# Negative polarity due to use of 'bad'