# Basics in NLP

In [None]:
import nltk
import spacy
# Load spaCy's small English pipeline once; every later spaCy cell reuses `nlp`.
nlp = spacy.load('en_core_web_sm')

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# NOTE(review): wildcard import from the same module PorterStemmer is already
# imported from explicitly above; it only adds unnamed symbols to the namespace.
from nltk.stem.porter import *

# Fetch the NLTK data packages the later cells rely on (tokenizer models,
# WordNet data, stop-word lists). The last call's return value is what the
# notebook displays as the cell output.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample paragraph processed by both the spaCy and the NLTK cells below.
# NOTE(review): "Organisation or is" reads as if an abbreviation was dropped
# (presumably "or ISRO is") — confirm; left untouched because the recorded
# outputs were produced from this exact text.
example_text="The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."


## Tokenization and Segmentation

SpaCy

In [None]:
# Run the spaCy pipeline once; `doc` is reused by the lemma, POS and NER cells.
doc = nlp(example_text)
# Collect the surface text of every token spaCy produced.
token_listSC = [tok.text for tok in doc]

print(token_listSC)

['The', 'Indian', 'Space', 'Research', 'Organisation', 'or', 'is', 'the', 'national', 'space', 'agency', 'of', 'India', ',', 'headquartered', 'in', 'Bengaluru', '.', 'It', 'operates', 'under', 'Department', 'of', 'Space', 'which', 'is', 'directly', 'overseen', 'by', 'the', 'Prime', 'Minister', 'of', 'India', 'while', 'Chairman', 'of', 'ISRO', 'acts', 'as', 'executive', 'of', 'DOS', 'as', 'well', '.']


NLTK

In [None]:
# Word-level tokenization of the same paragraph with NLTK; the result is stored
# in `tokens` but not displayed by this cell.
tokens = nltk.word_tokenize(example_text)

In [None]:
# Multi-sentence paragraph used to demonstrate sentence segmentation.
# The en dashes in "21–24%" and "top–25" were previously stored as mojibake
# (UTF-8 bytes decoded as cp1252); they are restored to the real character here.
p="In 2014 the Notre Dame student body consisted of 12,179 students, with 8,448 undergraduates, 2,138 graduate and professional and 1,593 professional (Law, M.Div., Business, M.Ed.)students. Around 21–24% of students are children of alumni, and although 3.7% of students come from the Midwestern United States, the student body represents all 50 states and 100 countries. As of March 2007[update] The Princeton Review ranked the school as the fifth highest 'dream school' for parents to send their children. As of March 2015[update] The Princeton Review ranked Notre Dame as the ninth highest. The school has been previously criticized for its lack of diversity, and The Princeton Review ranks the university highly among schools at which \"Alternative Lifestyles [are] Not an Alternative.\" It has also been commended by some diversity oriented publications; Hispanic Magazine in 2004 ranked the university ninth on its list of the top–25 colleges for Latinos, and The Journal of Blacks in Higher Education recognized the university in 2006 for raising enrollment of African-American students. With 6,000 participants, the university's intramural sports program was named in 2004 by Sports Illustrated as the best program in the country, while in 2007 The Princeton Review named it as the top school where \"Everyone Plays Intramural Sports.\" The annual Bookstore Basketball tournament is the largest outdoor five-on-five tournament in the world with over 700 teams participating each year, while the Notre Dame Men's Boxing Club hosts the annual Bengal Bouts tournament that raises money for the Holy Cross Missions in Bangladesh."

In [None]:
from nltk import tokenize

In [None]:
# Split paragraph `p` into sentences; leaving the call as the cell's last
# expression makes the notebook display the returned list.
tokenize.sent_tokenize(p)

['In 2014 the Notre Dame student body consisted of 12,179 students, with 8,448 undergraduates, 2,138 graduate and professional and 1,593 professional (Law, M.Div., Business, M.Ed.)students.',
 'Around 21â€“24% of students are children of alumni, and although 3.7% of students come from the Midwestern United States, the student body represents all 50 states and 100 countries.',
 "As of March 2007[update] The Princeton Review ranked the school as the fifth highest 'dream school' for parents to send their children.",
 'As of March 2015[update] The Princeton Review ranked Notre Dame as the ninth highest.',
 'The school has been previously criticized for its lack of diversity, and The Princeton Review ranks the university highly among schools at which "Alternative Lifestyles [are] Not an Alternative."',
 'It has also been commended by some diversity oriented publications; Hispanic Magazine in 2004 ranked the university ninth on its list of the topâ€“25 colleges for Latinos, and The Journal o

## Stemming and Lemmatization

SpaCy

In [None]:


# The spaCy pipeline call that built `doc` already tokenized and lemmatized the
# text, so each token's lemma can be read straight off the token.
lemma_list = [tok.lemma_ for tok in doc]

print(lemma_list)



['the', 'Indian', 'Space', 'Research', 'Organisation', 'or', 'be', 'the', 'national', 'space', 'agency', 'of', 'India', ',', 'headquarter', 'in', 'Bengaluru', '.', 'it', 'operate', 'under', 'Department', 'of', 'Space', 'which', 'be', 'directly', 'oversee', 'by', 'the', 'Prime', 'Minister', 'of', 'India', 'while', 'Chairman', 'of', 'ISRO', 'act', 'as', 'executive', 'of', 'DOS', 'as', 'well', '.']


NLTK

In [None]:
# NLTK has no single pipeline object: tokenize first, then stem token by token.
p_stemmer = PorterStemmer()
nltk_tokenList = word_tokenize(example_text)

# Apply the Porter stemmer to every token, keeping the original order.
nltk_stemedList = [p_stemmer.stem(tok) for tok in nltk_tokenList]

print(nltk_stemedList)



['the', 'indian', 'space', 'research', 'organis', 'or', 'is', 'the', 'nation', 'space', 'agenc', 'of', 'india', ',', 'headquart', 'in', 'bengaluru', '.', 'it', 'oper', 'under', 'depart', 'of', 'space', 'which', 'is', 'directli', 'overseen', 'by', 'the', 'prime', 'minist', 'of', 'india', 'while', 'chairman', 'of', 'isro', 'act', 'as', 'execut', 'of', 'do', 'as', 'well', '.']


In [None]:

wordnet_lemmatizer = WordNetLemmatizer()

# NOTE(review): the lemmatizer is fed the Porter stems (nltk_stemedList), not
# the raw tokens (nltk_tokenList) — confirm this chaining is intended.
nltk_lemmaList = [wordnet_lemmatizer.lemmatize(stem) for stem in nltk_stemedList]

print(nltk_lemmaList)

['the', 'indian', 'space', 'research', 'organis', 'or', 'is', 'the', 'nation', 'space', 'agenc', 'of', 'india', ',', 'headquart', 'in', 'bengaluru', '.', 'it', 'oper', 'under', 'depart', 'of', 'space', 'which', 'is', 'directli', 'overseen', 'by', 'the', 'prime', 'minist', 'of', 'india', 'while', 'chairman', 'of', 'isro', 'act', 'a', 'execut', 'of', 'do', 'a', 'well', '.']


## Stop Words Removal

SpaCy

In [None]:
# Keep only the spaCy lemmas whose vocabulary entry is not flagged as a stop word.
filtered_sentence = [
    lemma for lemma in lemma_list
    if not nlp.vocab[lemma].is_stop
]



NLTK

In [None]:


# A set gives fast membership checks against NLTK's English stop-word list.
nltk_stop_words = set(stopwords.words('english'))

# NOTE: the surviving NLTK lemmas are appended to the same `filtered_sentence`
# list the spaCy cell filled, so the printed list holds the spaCy result
# followed by the NLTK result; re-running this cell appends them again.
filtered_sentence.extend(
    lemma for lemma in nltk_lemmaList if lemma not in nltk_stop_words
)

print(filtered_sentence)


['Indian', 'Space', 'Research', 'Organisation', 'national', 'space', 'agency', 'India', ',', 'headquarter', 'Bengaluru', '.', 'operate', 'Department', 'Space', 'directly', 'oversee', 'Prime', 'Minister', 'India', 'Chairman', 'ISRO', 'act', 'executive', 'DOS', '.', 'indian', 'space', 'research', 'organis', 'nation', 'space', 'agenc', 'india', ',', 'headquart', 'bengaluru', '.', 'oper', 'depart', 'space', 'directli', 'overseen', 'prime', 'minist', 'india', 'chairman', 'isro', 'act', 'execut', 'well', '.']


SKLearn

In [None]:
from sklearn.feature_extraction import text

# scikit-learn's built-in English stop-word collection, printed in alphabetical order.
stopwordsSK = text.ENGLISH_STOP_WORDS
print(sorted(stopwordsSK))

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give

## Punctuation

In [None]:

# Remove punctuation tokens from the filtered token list.
#
# A new list is built instead of calling list.remove() inside the loop:
# deleting from a list while iterating over it shifts the remaining items
# left, so the item right after each removed token is never examined and two
# punctuation tokens in a row would leave the second one behind.
punctuations = "?:!.,;"
# Compare against individual characters; `word in punctuations` on the string
# is a substring test and would also match "" or multi-character runs like ".,".
punctuation_marks = set(punctuations)
filtered_sentence = [
    word for word in filtered_sentence if word not in punctuation_marks
]
print("Remove stopword & punctuation: ")
print(filtered_sentence)

Remove stopword & punctuation: 
['Indian', 'Space', 'Research', 'Organisation', 'national', 'space', 'agency', 'India', 'headquarter', 'Bengaluru', 'operate', 'Department', 'Space', 'directly', 'oversee', 'Prime', 'Minister', 'India', 'Chairman', 'ISRO', 'act', 'executive', 'DOS', 'indian', 'space', 'research', 'organis', 'nation', 'space', 'agenc', 'india', 'headquart', 'bengaluru', 'oper', 'depart', 'space', 'directli', 'overseen', 'prime', 'minist', 'india', 'chairman', 'isro', 'act', 'execut', 'well']


## Part-of-Speech (POS) Tagging

In [None]:
%%time


# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

The DET det the
Indian PROPN compound Indian
Space PROPN compound Space
Research PROPN compound Research
Organisation PROPN nsubj Organisation
or CCONJ cc or
is AUX ROOT be
the DET det the
national ADJ amod national
space NOUN compound space
agency NOUN attr agency
of ADP prep of
India PROPN pobj India
, PUNCT punct ,
headquartered VERB acl headquarter
in ADP prep in
Bengaluru PROPN pobj Bengaluru
. PUNCT punct .
It PRON nsubj it
operates VERB ROOT operate
under ADP prep under
Department PROPN pobj Department
of ADP prep of
Space PROPN pobj Space
which PRON nsubjpass which
is AUX auxpass be
directly ADV advmod directly
overseen VERB relcl oversee
by ADP agent by
the DET det the
Prime PROPN compound Prime
Minister PROPN pobj Minister
of ADP prep of
India PROPN pobj India
while SCONJ mark while
Chairman PROPN nsubj Chairman
of ADP prep of
ISRO PROPN pobj ISRO
acts VERB advcl act
as ADP prep as
executive NOUN pobj executive
of ADP prep of
DOS PROPN pobj DOS
as ADV advmod as
well ADV advmo

In [None]:
The DET det the
Indian PROPN compound Indian
Space PROPN compound Space
Research PROPN compound Research
Organisation PROPN nsubj Organisation
or CCONJ cc or
is AUX ROOT be
the DET det the
national ADJ amod national
space NOUN compound space
agency NOUN attr agency
of ADP prep of
India PROPN pobj India
, PUNCT punct ,
headquartered VERB acl headquarter
in ADP prep in
Bengaluru PROPN pobj Bengaluru
. PUNCT punct .
It PRON nsubj it
operates VERB ROOT operate
under ADP prep under
Department PROPN pobj Department
of ADP prep of
Space PROPN pobj Space
which PRON nsubjpass which
is AUX auxpass be
directly ADV advmod directly
overseen VERB relcl oversee
by ADP agent by
the DET det the
Prime PROPN compound Prime
Minister PROPN pobj Minister
of ADP prep of
India PROPN pobj India
while SCONJ mark while
Chairman PROPN nsubj Chairman
of ADP prep of
ISRO PROPN pobj ISRO
acts VERB advcl act
as ADP prep as
executive NOUN pobj executive
of ADP prep of
DOS PROPN pobj DOS
as ADV advmod as
well ADV advmod well
. PUNCT punct .
CPU times: user 12.8 ms, sys: 1.04 ms, total: 13.8 ms
Wall time: 18.1 ms

## Named-Entity Recognition (NER)

In [None]:
# Print every entity found in `doc` together with its entity label.
for ent in doc.ents:
    print(ent.text, ent.label_)



The Indian Space Research Organisation ORG
India GPE
Bengaluru GPE
Department of Space ORG
India GPE
ISRO ORG
DOS ORG


# Additional Resources

[NLTK Documentation](https://www.nltk.org/)

[spaCy vs NLTK](https://medium.com/@akankshamalhotra24/introduction-to-libraries-of-nlp-in-python-nltk-vs-spacy-42d7b2f128f2)

[Why we still need Text Normalization?](https://towardsdatascience.com/text-normalization-for-natural-language-processing-nlp-70a314bfa646#:~:text=Why%20do%20we%20need%20text,with%2C%20and%20therefore%20improves%20efficiency.)

[NER](https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/)

[Lab References1](https://lvngd.com/blog/text-normalization-natural-language-processing-python/)

[Lab References2](https://towardsdatascience.com/text-normalization-with-spacy-and-nltk-1302ff430119)

[Lab References3](https://towardsdatascience.com/a-guide-to-cleaning-text-in-python-943356ac86ca)

[PyTorch vs TensorFlow1](https://www.simplilearn.com/keras-vs-tensorflow-vs-pytorch-article#:~:text=TensorFlow%20offers%20better%20visualization%2C%20which,to%20the%20TensorFlow%20Serving%20framework.)  


[PyTorch vs TensorFlow2](https://towardsdatascience.com/pytorch-vs-tensorflow-spotting-the-difference-25c75777377b)


My Info:


*   mayar.osama@guc.edu.eg
*   C3.306

