In [1]:
#Install and Import NLTK library (Natural Language Toolkit)
!pip install nltk



In [2]:
# Download necessary NLTK data files
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\u\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#take an example text to perform pre processing steps
text="""Natural language processing is an exciting field of artificial intelligence 
that focuses on the interaction between computers and humans through language. 
It involves various techniques to analyze and understand human language."""

## Tokenization

Tokenization is the process of breaking down text into smaller units called tokens. These tokens can be words, subwords, or characters, depending on the level of granularity needed for a particular task. Tokenization is a crucial step in natural language processing (NLP) and text analysis because it converts raw text into a structured format that algorithms can work with.

In [4]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [5]:
#Sentence Tokenized
sent_tokenized=sent_tokenize(text)

In [6]:
for sentence in sent_tokenized:
    print("***********************************")
    print(sentence)


***********************************
Natural language processing is an exciting field of artificial intelligence 
that focuses on the interaction between computers and humans through language.
***********************************
It involves various techniques to analyze and understand human language.


In [7]:
#word tokenize
text_tokenized=word_tokenize(text)

In [8]:
print(text_tokenized)

['Natural', 'language', 'processing', 'is', 'an', 'exciting', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'language', '.', 'It', 'involves', 'various', 'techniques', 'to', 'analyze', 'and', 'understand', 'human', 'language', '.']


## Stop Words

The words which are generally filtered out out before processing a natural language are called stop words . These are the words which do not carry meaningful information about the content of the text. Stop words are used to remove noise from the data and speed up the computation process. Ex:the", "is", "at", "which", "and", "on", "in", "of", "to", etc., we can even include punctuations here

To perform the stop words operation, we will use the NLTK library. NLTK stands for Natural Language Toolkit. It is a leading platform for building Python programs to work with human language data, particularly in the field of natural language processing (NLP). NLTK provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, among other NLP tasks.

In [9]:
#Stop words removal
from nltk.corpus import stopwords

In [10]:
stop_words=stopwords.words("english")

In [11]:
len(stop_words)

179

In [12]:
text_without_stopwords=[word for word in text_tokenized if word not in stop_words]

In [13]:
print("The length of text before removing stop words",len(text_tokenized))
print("The length of text after removing stop words",len(text_without_stopwords))

The length of text before removing stop words 33
The length of text after removing stop words 22


## Stemming


In natural language processing (NLP), stemming is the process of reducing words to their root or base form, also known as the stem. The main goal of stemming is to reduce inflected words to their common base form, which can help improve text analysis and information retrieval tasks by treating different forms of a word as the same entity.

For example, stemming would convert words like "running", "runs", and "ran" to the common base form "run". Similarly, words like "play", "playing", and "played" would all be stemmed to "play".

Stemming algorithms typically work by removing suffixes from words to obtain the root form. These algorithms are rule-based and operate by applying a series of rules to trim off common suffixes. However, stemming algorithms do not always produce accurate or linguistically valid results, as they may sometimes produce stems that are not actual words or may result in stems that are not semantically related.

Despite its limitations, stemming is still widely used in NLP tasks such as text normalization, information retrieval, and document clustering. It can help reduce the dimensionality of text data and improve the performance of certain text processing tasks. Popular stemming algorithms include the Porter Stemmer and the Snowball Stemmer.

In [14]:
from nltk.stem import PorterStemmer

In [15]:
stemmer=PorterStemmer()

In [16]:
print(stemmer.stem("changing"))
#It removes its suffixes and changes it to chang

chang


In [17]:
words=["change","changed","changing"]

In [18]:
for word in words:
    print(word,"<--->",stemmer.stem(word))

change <---> chang
changed <---> chang
changing <---> chang


## Lemmatization

Lemmatization, like stemming, is a natural language processing (NLP) technique used to reduce words to their base or dictionary form, known as the lemma. However, unlike stemming, lemmatization considers the context and meaning of a word when determining its lemma.

In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
lemmatizer=WordNetLemmatizer()

In [21]:
print(lemmatizer.lemmatize("changing",pos="v"))
#pos(Parts of speech which we have chosen as verb("v") so it changes it into verb base form)

change


In [22]:
words=["change","changed","changing"]

In [23]:
for word in words:
    print(word,"<--->",lemmatizer.lemmatize(word,pos="v"))

change <---> change
changed <---> change
changing <---> change


## Name Entity Recognition(NER)

Named Entity Recognition (NER) is a natural language processing (NLP) technique that aims to identify and classify named entities within a text into predefined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc. The primary goal of NER is to extract and label these entities to provide structure and meaning to unstructured text data.

For instance, in the sentence "John works at Google in California", NER would identify "John" as a PERSON entity, "Google" as an ORGANIZATION entity, and "California" as a LOCATION entity

In [24]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 393.8 kB/s eta 0:01:49
     --------------------------------------- 0.1/42.8 MB 819.2 kB/s eta 0:00:53
     ---------------------------------------- 0.2/42.8 MB 1.1 MB/s eta 0:00:38
     ---------------------------------------- 0.3/42.8 MB 1.1 MB/s eta 0:00:38
     ---------------------------------------- 0.5/42.8 MB 1.7 MB/s eta 0:00:25
      --------------------------------------- 0.7/42.8 MB 2.1 MB/s eta 0:00:20
      --------------------------------------- 0.9/42.8 MB 2.4 MB/s eta 0:00:18
      --------------------------------------- 1.0/42.8 MB 2.5 MB/s eta 0:00:17
     - -------------------------------------- 

In [25]:
import spacy

In [26]:
nlp=spacy.load("en_core_web_md")

In [27]:
#Example text
txt="""
John Smith works at XYZ Corporation located in New York City.
He is currently leading a project on artificial intelligence. 
Last week, he attended a conference on machine learning in San Francisco.
His colleague, Sarah Johnson, presented their team's research on natural language processing.
"""

In [28]:
doc=nlp(txt)

In [29]:
print(spacy.displacy.render(doc,style="ent"))

None


In [30]:
print (doc.ents)

(John Smith, XYZ Corporation, New York City, Last week, San Francisco, Sarah Johnson)


In [31]:
for entity in doc.ents:
    print(entity,"<---->",entity.label_)

John Smith <----> PERSON
XYZ Corporation <----> ORG
New York City <----> GPE
Last week <----> DATE
San Francisco <----> GPE
Sarah Johnson <----> PERSON


In [32]:
#To understand about these entities
spacy.explain("GPE")

'Countries, cities, states'

In [33]:
doc


John Smith works at XYZ Corporation located in New York City.
He is currently leading a project on artificial intelligence. 
Last week, he attended a conference on machine learning in San Francisco.
His colleague, Sarah Johnson, presented their team's research on natural language processing.

In [34]:
for word in doc:
    print(word,"<--->",word.pos_)


 <---> SPACE
John <---> PROPN
Smith <---> PROPN
works <---> VERB
at <---> ADP
XYZ <---> PROPN
Corporation <---> PROPN
located <---> VERB
in <---> ADP
New <---> PROPN
York <---> PROPN
City <---> PROPN
. <---> PUNCT

 <---> SPACE
He <---> PRON
is <---> AUX
currently <---> ADV
leading <---> VERB
a <---> DET
project <---> NOUN
on <---> ADP
artificial <---> ADJ
intelligence <---> NOUN
. <---> PUNCT

 <---> SPACE
Last <---> ADJ
week <---> NOUN
, <---> PUNCT
he <---> PRON
attended <---> VERB
a <---> DET
conference <---> NOUN
on <---> ADP
machine <---> NOUN
learning <---> NOUN
in <---> ADP
San <---> PROPN
Francisco <---> PROPN
. <---> PUNCT

 <---> SPACE
His <---> PRON
colleague <---> NOUN
, <---> PUNCT
Sarah <---> PROPN
Johnson <---> PROPN
, <---> PUNCT
presented <---> VERB
their <---> PRON
team <---> NOUN
's <---> PART
research <---> NOUN
on <---> ADP
natural <---> ADJ
language <---> NOUN
processing <---> NOUN
. <---> PUNCT

 <---> SPACE


In [35]:
#Combine all this pre processing techniques and make a function
#Importing libraries
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words=stopwords.words("english")
import spacy
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [36]:
def preprocessing(sentence):
    #Removing special characters
    text=re.sub(r"[^a-z0-9A-Z ]","",sentence).lower()
    #Tokenize data
    text_tokenized=word_tokenize(text)
    #Remove stop words
    text_without_stopwords=[word for word in text_tokenized if word not in stop_words]
    #Joining the sentence
    text_without_stopwords=" ".join(text_without_stopwords)
    #lemmatizing words
    nlp=spacy.load("en_core_web_md")
    doc=nlp(text_without_stopwords)
    
    updated_words=[lemmatizer.lemmatize(word.text,pos="v") if word.pos_=="VERB" else word.text for word in doc]
    
    #Returning the preprocessed text
    return " ".join(updated_words)   

In [37]:
sentence = "I was amazed looking at his phone have to say iam impressed"


In [38]:
Testing the model
print(preprocessing(sentence))

amazed look phone say iam impress
