# **Basics of NLP**

# **Text Processing**

# Sentence Tokenization

In [None]:
import nltk                # for NLP
# One-time downloads of the NLTK data packages used by the cells below.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('punkt')           # Punkt tokenizer models (not "punctuation"); unzipped under tokenizers/
nltk.download('wordnet')             # WordNet corpus (dictionary-like); presumably what WordNetLemmatizer reads
nltk.download('stopwords')  # stop-word lists, read later via nltk.corpus.stopwords
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably backs nltk.tag.pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample paragraph: three sentences about backgammon.
text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."

# Split the paragraph into sentences, then show each one followed by a blank line.
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence, end="\n\n")

Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



# Word Tokenization

In [None]:
# Break every sentence into word/punctuation tokens and print one token list
# per sentence, each followed by a blank line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words, end="\n\n")

# Motivation for the next section - different surface forms of the same word:
#   dog, dogs, dog's, dogs'
#   is, are, am           => be
#   drive, driven, drove  => drive
#   seen, saw, see        => see

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



# Stemming and Lemmatization

Stemming - chops off the ends of words using simple rules
playing -> play
softly -> soft
saw -> saw (irregular forms are left unchanged)

Lemmatization - Rule-based approach that uses a vocabulary (dictionary) to map each word to its base form (lemma)

Playing -> play
played -> play
saw -> saw

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# This function is for teaching purposes only; it is not required.
def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """
    Show the stem and the lemma of ``word`` one after the other.

    stemmer    -- object whose ``stem(word)`` result is printed after "Stemmer:"
    lemmatizer -- object whose ``lemmatize(word, pos)`` result is printed after "Lemmatizer:"
    word       -- the word to process
    pos        -- part of speech handed to the lemmatizer

    Prints the two labelled lines followed by an empty line; returns None.
    """
    stemmed = stemmer.stem(word)
    print("Stemmer:", stemmed)
    lemma = lemmatizer.lemmatize(word, pos)
    # end="\n\n" finishes the lemma line and emits the blank separator line.
    print("Lemmatizer:", lemma, end="\n\n")

# Create one lemmatizer and one stemmer, shared by every comparison below.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# (word, part of speech) pairs to compare; the POS is specified manually.
demo_cases = [
    ("seen", wordnet.VERB),
    ("drove", wordnet.VERB),
    ("better", wordnet.ADJ),
    ("improvised", wordnet.VERB),
    ("Amaze", wordnet.ADJ),
]
for demo_word, demo_pos in demo_cases:
    compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word = demo_word, pos = demo_pos)

Stemmer: seen
Lemmatizer: see

Stemmer: drove
Lemmatizer: drive

Stemmer: better
Lemmatizer: good

Stemmer: improvis
Lemmatizer: improvise

Stemmer: amaz
Lemmatizer: Amaze



# Stop Words

In [None]:
from nltk.corpus import stopwords
# The stop-word lists come from the downloaded 'stopwords' corpus
# (/root/nltk_data/corpora/stopwords/ in this environment).
english_stop_list = stopwords.words("english")   # all available English stop words
stop_words = set(english_stop_list)              # a set, for quick membership checks
print(stop_words)
"can" in stop_words  # last expression of the cell, so its value is displayed

{'it', "wouldn't", 'having', 'has', 'can', 'why', 'such', 't', "haven't", 'them', "shouldn't", 'against', 'does', 'my', 'if', 're', 'o', 'mightn', 'how', 'only', 'during', 'your', 'will', "shan't", 'ain', 'to', "needn't", 'wasn', 'd', "it's", 'myself', 'mustn', "weren't", 'she', 'same', "she's", 'her', 'by', 'had', 'most', 'an', "that'll", 'be', 'in', 'shan', 'yourselves', 'its', 'of', 'weren', 'through', 'hadn', 'with', 'we', "doesn't", 'while', 'but', 'what', "didn't", 'you', "mustn't", 'they', 'aren', 'because', 'shouldn', 'hasn', 'before', 'as', 'off', 'where', 'between', 'own', 'should', 'on', 'nor', 'ourselves', 'out', "you'll", 'herself', 'doing', 'both', 'are', "aren't", 'me', 'don', 'who', 'each', 'ours', 'theirs', 'there', "don't", 'whom', 'under', 'him', "isn't", 'were', "wasn't", 'or', 's', 'have', 'yours', 'he', 'below', "hasn't", 'yourself', 'didn', 'needn', 'which', 'been', 'once', 'was', 'did', 'just', "hadn't", 'these', "you'd", 'this', 'over', 'and', 'all', 'haven', '

True

In [None]:
# Remove stop words from one tokenized sentence.
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence)
# The stop-word list is all lowercase, so compare the lowercased token; otherwise
# capitalised stop words (e.g. a sentence-initial "Its") would slip through.
# The original token is what gets kept, so the casing of the output is preserved.
without_stop_words = [word for word in words if word.lower() not in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']


In [None]:
# Another membership check against the stop-word set built above.
'this' in stop_words

True

# Punctuation removal:

In [None]:
# This is an alternate way to word tokenize - advantage : it will remove the punctuation marks

from nltk.tokenize import RegexpTokenizer

# Raw string: '\w' is not a valid Python string escape, so a plain literal raises an
# invalid-escape warning on recent Python versions. The regex itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+')  # initialize: keep only runs of word characters
# \w  -> a word character (letters, digits, underscore)
# \w+ -> one or more consecutive word characters, i.e. one token per run
# Everything else (spaces, punctuation, hyphens) acts as a separator, so
# 'Eight-seven' is split into 'Eight' and 'seven'.

tokenizer.tokenize('Eight-seven miles to go, yet. Onward!')

['Eight', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']

### Part of Speech Tagging

* Automatically assigning a part-of-speech (POS) tag to each word

In [None]:
# Sample sentence to tag.
sentence = "Backgammon is one of the oldest known board games."

words = nltk.word_tokenize(sentence) # tokenizes the sentence into words
tagged = nltk.tag.pos_tag(words)  # nltk.tag.pos_tag(list_of_words); the printed result is a list of (word, tag) pairs
print(tagged)

[('Backgammon', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('oldest', 'JJS'), ('known', 'VBN'), ('board', 'NN'), ('games', 'NNS'), ('.', '.')]


In [None]:
# Look up the meaning of the POS tags.
nltk.download('tagsets')  # help data for the tag sets (the log shows it unzipped as help/tagsets.zip)
nltk.help.upenn_tagset()  # called with no argument: prints tags with a description and example words

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je j

# Bag of Words

In [None]:
# Toy corpus for the bag-of-words demo: three short, lowercase, punctuation-free documents.
# Alternative corpus with punctuation and mixed case, kept for experimentation:
#documents = ["I like this movie, it's funny.","I hate this movie.","This was awesome! I like it. Like the screenplay.","Nice one. I love it."]
documents = ['good movie good screenplay', 'good movie', 'bad movie sleep']

In [None]:
# Import the libraries we need
from sklearn.feature_extraction.text import CountVectorizer  # CountVectorizer : 1-gram approach
import pandas as pd

# Step 2. Design the vocabulary
# binary = False keeps the raw counts (e.g. "good" occurs twice in the first document).
# Note: the default token pattern drops single-character tokens, so with the
# commented-out corpus the "I" and "s" tokens would not appear in the output.
count_vectorizer = CountVectorizer(binary = False)

# Step 3. Create the bag-of-words model
# fit designs the vocabulary; transform converts each document into a vector of word counts.
bag_of_words = count_vectorizer.fit_transform(documents)

# Show the bag-of-words model as a pandas DataFrame (one row per document, one column per word).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = count_vectorizer.get_feature_names_out()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)



Unnamed: 0,bad,good,movie,screenplay,sleep
0,0,2,1,1,0
1,0,1,1,0,0
2,1,0,1,0,1


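* binary = True : binary approach (1 if the word occurs in the document, 0 otherwise)
* binary = False : frequency approach (number of times the word occurs in the document)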

### n - gram Approach

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Step 2. Design the vocabulary
# ngram_range = (2,3) builds the vocabulary from 2-word and 3-word sequences;
# binary = True records only presence (1) or absence (0) of each n-gram.
count_vectorizer = CountVectorizer(ngram_range = (2,3), binary=True)

# Step 3. Create the bag-of-words model
# fit designs the vocabulary; transform converts each document into a vector
# marking which n-grams are present.
bag_of_words = count_vectorizer.fit_transform(documents)

# Show the bag-of-words model as a pandas DataFrame (one row per document, one column per n-gram).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = count_vectorizer.get_feature_names_out()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)



Unnamed: 0,bad movie,bad movie sleep,good movie,good movie good,good screenplay,movie good,movie good screenplay,movie sleep
0,0,0,1,1,1,1,1,0
1,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,1


* ngram_range = (2,3) : 2-gram as well as 3-gram approach
* ngram_range = (1,3) : 1-gram + 2-gram + 3-gram approach

# TF-IDF

In [None]:
# Toy corpus for the TF-IDF demo: three short, lowercase, punctuation-free documents.
# Alternative corpus with punctuation and mixed case, kept for experimentation:
#documents = ["I like this movie, it's funny.","I hate this movie.","This was awesome! I like it. Like the screenplay.","Nice one. I love it."]
documents = ['good movie good screenplay', 'good movie', 'bad movie sleep']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Initialise the TF-IDF model with default settings.
tfidf_vectorizer = TfidfVectorizer()
# fit learns the vocabulary and weights from the documents; transform returns
# one weighted vector per document.
values = tfidf_vectorizer.fit_transform(documents)

# Show the model as a pandas DataFrame (one row per document, one column per word).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(values.toarray(), columns = feature_names)



Unnamed: 0,bad,good,movie,screenplay,sleep
0,0.0,0.794803,0.308618,0.522535,0.0
1,0.0,0.789807,0.613356,0.0,0.0
2,0.652491,0.0,0.385372,0.0,0.652491
