# Natural Language Processing With Python's NLTK Package

### Getting Started With Python’s NLTK

## **Install NLTK**

In [None]:
pip install nltk==3.5

In [None]:
import nltk
# Download the 'punkt' resource with NLTK's downloader
# (the tokenizing cells below presumably rely on it).
nltk.download('punkt')

### Tokenizing

In [None]:
# Sample passage used by the sentence- and word-tokenizing cells below.
# NOTE(review): the first quotation has a closing double quote after "wine,"
# but no opening one inside the string — confirm whether that is intended.
example_string = """You don't have to be French to enjoy a decent red wine," Charles Jousselin de Gruse used to tell his foreign guests whenever he entertained them in Paris. "But you do have to be French to recognize one," he would add with a laugh. After a lifetime in the French diplomatic corps, the Count de Gruse lived with his wife in an elegant townhouse on Quai Voltaire. He was a likeable man, cultivated of course, with a well-deserved reputation as a generous host and an amusing raconteur."""

#### Tokenizing by Sentence

In [None]:
from nltk.tokenize import sent_tokenize

# Split the passage into sentences and print the resulting list.
sentences=sent_tokenize(example_string)
print(sentences)

In [None]:
# Print each sentence on its own line.
for s in sentences:
  print(s)

#### Tokenizing by word

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the whole passage into words, then print the token list
# followed by one token per line.
words=word_tokenize(example_string)
print(words)
for w in words:
  print(w)


In [None]:
# Tokenize every sentence separately: one list of word tokens per sentence.
data = [word_tokenize(s) for s in sentences]
print(data)

### Filtering Stop Words

In [None]:
# Download the NLTK 'stopwords' corpus.
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords

In [None]:
# Collect the English stop words into a set for membership checks.
stop_words = set(stopwords.words("english"))

In [None]:
# Display the stop-word set.
stop_words

In [None]:
# Short example sentence used to demonstrate stop-word filtering.
worf_quote = "Sir, I protest. I am not a merry man!"

In [None]:
# Tokenize the quote into words and display the tokens.
words_in_quote = word_tokenize(worf_quote)
words_in_quote

In [None]:
# Rebuilds the same English stop-word set as the earlier cell (duplicate cell).
stop_words = set(stopwords.words("english"))

In [None]:
# Display the stop-word set again (duplicate of the earlier display cell).
stop_words

In [None]:
# Same quote and tokenization as the two earlier cells, combined into one cell.
worf_quote = "Sir, I protest. I am not a merry man!"
words_in_quote = word_tokenize(worf_quote)
words_in_quote

In [None]:

# Keep only the tokens whose case-folded form is not an English stop word.
filtered_list = [
    token
    for token in words_in_quote
    if token.casefold() not in stop_words
]
print(filtered_list)

In [None]:
# Remove special characters from the tokens in filtered_list.

import re

# Delete every character that is neither a word character nor whitespace,
# then drop tokens that became empty (tokens made only of punctuation).
stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in filtered_list)
cleaned_list = [token for token in stripped_tokens if token]

print(cleaned_list)


### Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# PorterStemmer is an algorithm

In [None]:
stemmer = PorterStemmer()  # stemmer is an instance of PorterStemmer

In [None]:
# Stem a single word and display the result.
singleword= stemmer.stem("scarves")
singleword

In [None]:
# Two-sentence sample with several inflections of "discover" for the stemming demo.
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [None]:
# Tokenize the stemming sample; this rebinds the notebook-level `words`.
words = word_tokenize(string_for_stemming)
print(words)

In [None]:
# Stem every token and print the stems in token order.
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)

## Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Create the lemmatizer instance reused by all lemmatizing cells below.
lemmatizer = WordNetLemmatizer()

In [None]:
 nltk.download('wordnet')  # download the 'wordnet' resource before lemmatizing

In [None]:
# Download the 'omw-1.4' resource.
nltk.download('omw-1.4')


In [None]:
# Lemmatize "better", passing pos="a" as the part-of-speech hint.
print(lemmatizer.lemmatize("better",pos="a"))

In [None]:
# Lemmatize "scarves" without a part-of-speech hint.
print(lemmatizer.lemmatize("scarves"))

In [None]:
# Same two sentences as the stemming sample, reused for the lemmatizing demo.
string_for_lemmatizing ="""
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [None]:
# Tokenize the lemmatizing sample; this rebinds the notebook-level `words`.
words = word_tokenize(string_for_lemmatizing)
print(words)

In [None]:
# Lemmatize every token (no part-of-speech hint) and print the results in token order.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print(lemmatized_words)

In [None]:
# Lemmatize "worst" without a part-of-speech hint (compare with the next cell).
lemmatizer.lemmatize("worst")

In [None]:
# Lemmatize "worst" again, this time passing pos="a" as the part-of-speech hint.
lemmatizer.lemmatize("worst", pos="a")

In [None]:
# Lemmatize "changed" without a part-of-speech hint (compare with the next cell).
lemmatizer.lemmatize("changed")


In [None]:
# Lemmatize "changed" again, this time passing pos="v" as the part-of-speech hint.
lemmatizer.lemmatize("changed", pos="v")

## Tagging Parts of Speech

In [None]:
# Sample sentence for the part-of-speech tagging demo.
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""

In [None]:
# Tokenize the quote into words and print the tokens.
words_in_sagan_quote = word_tokenize(sagan_quote)
print(words_in_sagan_quote)

In [None]:
# Download the 'averaged_perceptron_tagger' resource before calling nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Tag each token of the quote with its part of speech and display the result.
nltk.pos_tag(words_in_sagan_quote)

In [None]:
# Download the 'tagsets' resource, then show NLTK's help for the Penn Treebank tag set.
nltk.download('tagsets')

nltk.help.upenn_tagset()

In [None]:
# Second sample text for the part-of-speech tagger.
jabberwocky_excerpt = """
'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
all mimsy were the borogoves, and the mome raths outgrabe."""

In [None]:
# Tokenize the excerpt into words.
words_in_excerpt = word_tokenize(jabberwocky_excerpt)

In [None]:
# Tag the excerpt's tokens with parts of speech and display the result.
nltk.pos_tag(words_in_excerpt)

## Named Entity Recognition

In [None]:
# Download the 'maxent_ne_chunker' resource before calling nltk.ne_chunk.
nltk.download('maxent_ne_chunker')

In [None]:
# Tokenize the sample sentence and tag each token with its part of speech;
# the tagged tokens are the input to nltk.ne_chunk in a later cell.
sample_text = "F. Henly was born in San Francisco and he works at Microsoft."
tokens = nltk.word_tokenize(sample_text)
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

In [None]:
# nltk is already imported at the top of the notebook; this repeat import is harmless.
import nltk
# Download the 'words' resource.
nltk.download('words')

In [None]:
# Run NLTK's named-entity chunker on the POS-tagged tokens and print the result.
entities = nltk.ne_chunk(tagged_tokens)
print(entities)

# Spacy
## Code Description

This code uses the spaCy library to find variations of the term "solar power" in a given text.

1. It imports the necessary modules and loads the English language model.
2. It creates a Matcher object to define patterns for matching.
3. Three patterns are specified:
   - The exact term "solarpower" (all lowercase).
   - The two words "solar" and "power" as separate tokens.
   - The two words "solar" and "power" with a punctuation mark in between.
4. These patterns are added to the matcher under the label `SolarPower`.
5. The code processes a sample sentence and finds all occurrences of the specified patterns.
6. Finally, it prints the matches found, showing their unique identifier and position in the text.


In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; the model package must already be installed.
nlp = spacy.load('en_core_web_sm')

## Part of Speech Basics

In [None]:
import spacy
from spacy import displacy

# Load the 'en_core_web_sm' pipeline (an earlier cell already loaded it as well).
nlp = spacy.load('en_core_web_sm')

# Sample sentence to analyse
text = "The quick brown fox jumped over the lazy dog."

# Run the pipeline on the text to obtain a Doc object
doc = nlp(text)

# Collect the `tag_` attribute of every token
tags = [token.tag_ for token in doc]

# Print the tags
print(tags)

# Render the document with displaCy's 'dep' style inside the notebook
displacy.render(doc, style='dep',jupyter=True, options={'distance': 90})


## Named Entity Recognition

In [None]:
# Sample sentence for spaCy's named-entity recognition.
text = "President Barack Obama gave a speech at the White House."

# Reload the same pipeline (already loaded in earlier cells).
nlp = spacy.load('en_core_web_sm')

doc = nlp(text)

# Print each detected entity span together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

# Highlight the entities inline; `displacy` is imported in the previous code cell.
displacy.render(doc, style='ent', jupyter=True)


# Text Feature Extraction

## N-grams

In [None]:
from textblob import TextBlob

In [None]:
# Download the 'punkt_tab' resource (presumably needed by TextBlob's tokenization below — confirm).
nltk.download('punkt_tab')

In [None]:
# Text for the n-gram demo. The four sentences are joined into one clean string;
# the previous literal was a pasted Python list, so stray `",` / `"` fragments
# and indentation whitespace ended up inside the text itself.
# Note: this rebinds `example_string` from the tokenizing section.
example_string = (
    "This is the first document. "
    "This document is the second document. "
    "And this is the third one. "
    "Is this the first document?"
)

In [None]:
# Build the 3-grams (n=3) of the example text with TextBlob and display them.
TextBlob(example_string).ngrams(3)

## Bag of Words(BOW) model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Seven short documents for the bag-of-words demo.
# Every element needs its own trailing comma: without one, Python silently
# concatenates adjacent string literals and two documents merge into one.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    'They love NLP but can not learn in two months',
    'i love india and i hate terrorism',
    'i love terrorism and i hate india',
]

In [None]:
# Create a CountVectorizer with its default settings.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer to the text corpus and transform it into a feature matrix.
# Note: X is rebound to a list of token lists in the Word Embeddings section.
X = vectorizer.fit_transform(corpus)
print(X)

In [None]:
# Get the feature names learned by the vectorizer
# (the vectorizer above was created with default settings, not an explicit ngram_range).
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (n-grams):")
print(feature_names)

In [None]:
# Convert the feature matrix to a dense array for better readability
# (one row per document, one column per feature name).
feature_matrix = X.toarray()

print("\nFeature Matrix:")
print(feature_matrix)

In [None]:
# Show the counts as a DataFrame whose columns are labelled with the feature names.
df = pd.DataFrame(data=feature_matrix,columns = vectorizer.get_feature_names_out())
print(df)

In [None]:
# Vectorize a new document with the already-fitted vectorizer (transform only, no refit).
text2 = ['They love NLP but can not learn in two months']
vectorizer.transform(text2).toarray()

In [None]:

# Bag of words with ngram_range=(1,2): features are single words and two-word sequences.
# Note: this rebinds `vectorizer` and `df` from the cells above.
text = ["food was not bad","I am not feeling bad"]
vectorizer = CountVectorizer(ngram_range = (1,2))
count_matrix = vectorizer.fit_transform(text)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())
print(df)

## Term Frequency – Inverse Document Frequency (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF on three short documents.
# `count_array` holds the TfidfVectorizer output here, not raw counts;
# the name is carried over from the bag-of-words cell.
text = ["i love NLP NLP",
        "NLP NLP is the future",
        "i will learn the NLP"]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text)
count_array = matrix.toarray()
df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())
print(df)

In [None]:
# Alternative corpora kept for experimentation (disabled):
#
# text =["This is the first document.",
#         "This document is the second document.",
#         "And this is the third one.",
#         "Is this the first document"]
#
# text = ["read  svm algorithm article dataaspirant blog",
#         "read randomforest algorithm article dataaspirant blog"]

# Active corpus: two documents built from the same four distinct words.
text = [
    "petrol cars  cheaper  diesel cars",
    "diesel cheaper  petrol",
]

# Fit TF-IDF on the corpus and show the weights with one column per term.
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(text)
count_array = matrix.toarray()
df = pd.DataFrame(
    data=count_array,
    columns=vectorizer.get_feature_names_out(),
)
print(df)

In [None]:
# Full-sentence version of the petrol/diesel documents.
# This rebinds `corpus`; no later cell in this notebook reads it.
corpus=["petrol cars  are cheaper than diesel cars",
        "diesel is cheaper  than petrol"]

## Word Embeddings

In [None]:
!pip install -U gensim

In [None]:
!pip install numpy==1.24.3

In [None]:
# Whitespace-tokenize each document in `text`, which is still the two-document
# petrol/diesel list assigned in the TF-IDF section above.
# Note: this rebinds `X`, previously the bag-of-words feature matrix.
X=[d.split() for d in text]
X

In [None]:
# Build a Word2Vec model from the tokenized documents in X
# (2-dimensional vectors, context window of 4, min_count=1).
import gensim
w2vecmodel=gensim.models.Word2Vec(sentences=X,vector_size=2,window=4,min_count=1)

Now we can inspect the Word2Vec model trained above.

In [None]:
# Print the model's string representation.
print(w2vecmodel)

In [None]:
# List the vocabulary keys of the model; this rebinds the notebook-level `words`.
words = list(w2vecmodel.wv.index_to_key)
print(words)

In [None]:
from gensim.models import Word2Vec

# Sample data: each document is already tokenized into a list of lowercase words.
# Note: this rebinds `sentences` from the tokenizing section.
sentences = [
    ['this', 'is', 'the', 'first', 'document'],
    ['this', 'document', 'is', 'the', 'second', 'document'],
    ['and', 'this', 'is', 'the', 'third', 'one'],
    ['is', 'this', 'the', 'first', 'document']
]

# Create an untrained model. Passing the corpus to the constructor would already
# build the vocabulary and train once, so the explicit train() call below would
# train the same data a second time.
model = Word2Vec(vector_size=5, window=5, min_count=1, workers=4)

# Build the vocabulary from the corpus; train() requires it.
model.build_vocab(sentences)

# Train the model for 10 passes over the corpus.
model.train(sentences, total_examples=model.corpus_count, epochs=10)

# Get vector for a word
print(model.wv['document'])
print("second = ",model.wv['second'])

In [None]:
# Train a tiny Word2Vec model (2-dimensional vectors) on three toy sentences.
import re

import gensim

text = ["I love, love, love the NLP",
        "NLP is the future",
        "I will learn the NLP"]
# Tokenize on runs of word characters instead of whitespace so punctuation does
# not stick to tokens: a plain split() yields "love," and "love" as two
# different vocabulary entries.
X = [re.findall(r"\w+", d) for d in text]
w2vecmodel = gensim.models.Word2Vec(sentences=X, vector_size=2, window=4, min_count=1)

Now we can inspect the Word2Vec model trained above.

In [None]:
# Print the model's string representation.
print(w2vecmodel)

In [None]:
# Vocabulary keys of the model; this rebinds the notebook-level `words`.
words = w2vecmodel.wv.index_to_key
print(words)

In [None]:
# Mapping from vocabulary key to index; `words` is rebound again and now
# supports .keys() / .values(), which the following cells use.
words = w2vecmodel.wv.key_to_index
print(words.keys())

In [None]:
# Print the index of each vocabulary key (requires `words` from the key_to_index cell).
print(words.values())

In [None]:
# Print the model's embedding array.
print(w2vecmodel.wv.vectors)

In [None]:
import matplotlib.pyplot as plt

# Row i of `vectors` is the embedding of `index_to_key[i]`. Taking the labels
# from index_to_key keeps each annotation aligned with its point and removes the
# dependency on the notebook-level `words` variable, which several cells rebind
# (as a list it has no .keys()).
vectors = w2vecmodel.wv.vectors
labels = w2vecmodel.wv.index_to_key
y = vectors[:, 0]
x = vectors[:, 1]

fig, ax = plt.subplots()
ax.scatter(x, y)

# Write each word next to its point.
for i, txt in enumerate(labels):
    ax.annotate(txt, (x[i], y[i]))

# Label the axes with the vector component each one shows.
ax.set_xlabel("embedding dimension 1")
ax.set_ylabel("embedding dimension 0")
ax.set_title("Word2Vec embeddings (2-D)")
plt.show()