#Common NLP Techniques
#Let’s explore some common NLP techniques and how they are implemented using Python.

#Tokenization
#Tokenization is the process of breaking text down into individual units (tokens), such as words and punctuation marks.

In [2]:
import nltk
from nltk.tokenize import word_tokenize

# Ensure the tokenizer data is available. Older NLTK releases read the
# 'punkt' package, while NLTK 3.9+ loads 'punkt_tab' instead; fetching both
# keeps word_tokenize working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split the sample sentence into tokens and show the resulting list.
text = "Natural Language Processing is fascinating."
tokens = word_tokenize(text)
print(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Natural', 'Language', 'Processing', 'is', 'fascinating', '.']


#Stemming and Lemmatization
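#Stemming and lemmatization both reduce words to a base form: stemming strips suffixes by rule and may produce non-words (e.g. 'natur'), while lemmatization maps each word to its dictionary form (lemma).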

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure the required NLTK data is available. 'punkt' covers older NLTK
# releases; NLTK 3.9+ loads 'punkt_tab' for word_tokenize, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For the WordNet lemmatizer

text = "Natural Language Processing is fascinating."
tokens = word_tokenize(text)

# Stemming: apply the Porter stemmer to every token.
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemmed words:", stemmed_words)

# Lemmatization: look each token up with the WordNet lemmatizer.
# lemmatize() treats a word as a noun unless a `pos` argument is given,
# which is why verb forms such as 'fascinating' come back unchanged here.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatized words:", lemmatized_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Stemmed words: ['natur', 'languag', 'process', 'is', 'fascin', '.']
Lemmatized words: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '.']


#Stop Words Removal
#Stop words are very common words (e.g. 'is') that carry little meaning on their own; removing them lets later steps focus on the meaningful words.

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure the required NLTK data is available. 'punkt' covers older NLTK
# releases; NLTK 3.9+ loads 'punkt_tab' for word_tokenize, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

text = "Natural Language Processing is fascinating."
tokens = word_tokenize(text)

# Filtering stop words. The list is held in a set for fast membership tests,
# and each token is lower-cased before the lookup so capitalised stop words
# are dropped too. Punctuation tokens are not stop words and are kept.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered tokens:", filtered_tokens)

# Stemming: apply the Porter stemmer to the remaining tokens.
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed words:", stemmed_words)

# Lemmatization: lemmatize() treats a word as a noun unless a `pos`
# argument is given, so verb forms are returned unchanged here.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized words:", lemmatized_words)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Filtered tokens: ['Natural', 'Language', 'Processing', 'fascinating', '.']
Stemmed words: ['natur', 'languag', 'process', 'fascin', '.']
Lemmatized words: ['Natural', 'Language', 'Processing', 'fascinating', '.']


[nltk_data]   Package omw-1.4 is already up-to-date!


#Named Entity Recognition (NER)
#NER identifies named entities in text and classifies them into categories such as organizations, places, and monetary amounts.

In [7]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm" and run it over one
# sample sentence.
sentence = "Apple is looking at buying U.K. startup for $1 billion"
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

# Print each detected entity's text followed by its label, one per line.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


#Part-of-Speech (POS) Tagging
#Assigning grammatical tags to words in a sentence.

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Ensure the required NLTK data is available. The legacy names ('punkt',
# 'averaged_perceptron_tagger') serve older NLTK releases; NLTK 3.9+ loads
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead, so fetch both
# variants to keep the cell working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

text = "Natural Language Processing is fascinating."
tokens = word_tokenize(text)

# Part-of-speech tagging: pos_tag returns one (token, tag) pair per token.
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('fascinating', 'VBG'), ('.', '.')]
