In [1]:
import nltk
import pandas as pd
import numpy as np

# Important
- Learn Regular Expression
- Tokenization

# **Text Cleaning**

In [9]:
import re

def clean_text_special_characters(text):
    # Lowercase the text
    text = text.lower()

    # Remove special characters using regex
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Example usage
original_text = "Hello!!! This text -- contains 😊 various ### special characters and junk words @like these!!!"
cleaned_text = clean_text_special_characters(original_text)
print("Original Text:", original_text)
print("Cleaned Text:", cleaned_text)

Original Text: Hello!!! This text -- contains 😊 various ### special characters and junk words @like these!!!
Cleaned Text: hello this text contains various special characters and junk words like these


In [10]:
text = "Good morning 😊! Let's grab a coffee"
text.encode("utf-8")

b"Good morning \xf0\x9f\x98\x8a! Let's grab a coffee"

# Spell Checking

In [11]:
!pip install Textblob



In [12]:
# Spelling Chekcing
from textblob import TextBlob

def correct_spelling(text):
    # Create a TextBlob object
    blob = TextBlob(text)

    # Correct the spelling
    corrected_text = blob.correct()

    return str(corrected_text)

# Example usage
original_text = "I havv a spelinng mistak in thiss sentnce."
corrected_text = correct_spelling(original_text)
print("Original Text:", original_text)
print("Corrected Text:", corrected_text)

Original Text: I havv a spelinng mistak in thiss sentnce.
Corrected Text: I have a spelling mistake in this sentence.


# Removing HTML Tags, URL, ABR-Word Phrase

In [13]:
import re
import string
from bs4 import BeautifulSoup # For Web Scraping

# Dictionary for chat abbreviations
chat_abbr_dict = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "gr8": "great",
    "b4": "before",
    "idk": "I don't know",
    "ttyl": "talk to you later",
    # Add more abbreviations as needed
}

def clean_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Expand chat abbreviations
    words = text.split()
    expanded_words = [chat_abbr_dict[word] if word in chat_abbr_dict else word for word in words]
    text = ' '.join(expanded_words)

    return text

# Example usage
original_text = "Hey, check this out: <a href='http://example.com'>example</a>! It's gr8, ttyl!"
cleaned_text = clean_text(original_text)
print("Original Text:", original_text)
print("Cleaned Text:", cleaned_text)

Original Text: Hey, check this out: <a href='http://example.com'>example</a>! It's gr8, ttyl!
Cleaned Text: hey check this out example its great talk to you later


# **Text - Feature Engineering**

### Tokenization
Tokenization is the process of splitting text into smaller units called tokens. Tokens can be words, sentences, or even subwords, depending on the level of tokenization. Tokenization is a fundamental step in text preprocessing for NLP tasks.

#### Types
- Word Tokenization
- Sentence Tokenization
- Sub-Word Tokenization (tokenization : 'toke','iza','tion')

In [27]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [17]:
from nltk.tokenize import word_tokenize

text = "Hello How are you"
words = word_tokenize(text)
print(words)

['Hello', 'How', 'are', 'you']


In [18]:
from nltk.tokenize import sent_tokenize

text = "Hello How are you. I hope you're doing well."
sentences = sent_tokenize(text)
print(sentences)

['Hello How are you.', "I hope you're doing well."]


In [22]:
# Regular Expression
word = "tokenization"
subword_tokens = [word[i:i+2] for i in range(0, len(word), 2)]
print(f"\nExample: \"{word}\" → {subword_tokens}")


Example: "tokenization" → ['to', 'ke', 'ni', 'za', 'ti', 'on']


- Corpus - Full Text Book
- Document - A Single Line
- Vocabulary - List of Unique Words in my Text

# Stemming and Lemmatization
- Running, Run : Run
- Running, Changed - Prefix/Suffix : 'ing','ed','ation'

## Stemming:
- Changed : Chang
- Running : run

## Lemmatization:
- run, running, runs : run
- changed, changing : change


In [23]:
from nltk.stem import PorterStemmer

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Example words
words = ["running", "ran", "runs", "easily", "fairly","Changed","Changing"]

# Apply stemming
stemmed_words = [stemmer.stem(word) for word in words]

print("Original Words:", words)
print("Stemmed Words:", stemmed_words)

Original Words: ['running', 'ran', 'runs', 'easily', 'fairly', 'Changed', 'Changing']
Stemmed Words: ['run', 'ran', 'run', 'easili', 'fairli', 'chang', 'chang']


In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Example words
words = ["running", "ran", "runs", "easily", "fairly", "Changed","Changing"]

# Function to get WordNet POS tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Apply lemmatization
lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]

print("Original Words:", words)
print("Lemmatized Words:", lemmatized_words)

Original Words: ['running', 'ran', 'runs', 'easily', 'fairly', 'Changed', 'Changing']
Lemmatized Words: ['run', 'ran', 'run', 'easily', 'fairly', 'Changed', 'Changing']


#### Stemming and Lemmatization
- Stemming words sometime has meaning or sometimes not, but lemmatization words always has meaning or valid words.
- Stemming is based on removing suffixes from words, so its fast. But Lemmatization involves word analysis and pos tagging so its slow and complex.
- Stemming has less accuracy word wise, but Lemmatization has more accuracy.

#### Use Cases
- Stemming can be used of Search Engine or Search Application Tasks.
- Lemmazation can be used for Sentiment Analysis, Text Classification or Text Analytics.

# **Text Representation**
- One hot Encoding
- Bag of Words
- Bag of Words (N-Grams)
- TF-IDF
- Word2Vec


### Steps -
- Building the Vocabulary
- Operations

ONE HOT Encoding
-------------------
Text = "I love machine learning"
unique_words = ["I","love","machine","learning"]
Assign_index: {"I":1,"love":2,"machine":3,"learning":4}

"I" - [1,0,0,0]
"love" - [0,1,0,0]
"machine" - [0,0,1,0]
"learning" - [0,0,0,1]

Issues - Sparsity, Length of Vectors


BAG OF WORDS
-------------------
Text = "I love machine learning, and I also love deep learning"
unique_words
assign_index

Representation:
[2, 2, 1, 2, 1, 1, 1]

Advantage over OHE - It capture Some Semantic Meaning.
Issues - Out of Vocab, Length of Vectors




In [30]:
# One Hot Encoding
# Define the text
text = "I love machine learning"

# Tokenize the text into words
words = text.split()

# Create the vocabulary
vocabulary = list(set(words))
vocab_size = len(vocabulary)

# Create a mapping from word to index
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# One-hot encode the words
def one_hot_encode(word, word_to_index, vocab_size):
    one_hot_vector = [0] * vocab_size
    one_hot_vector[word_to_index[word]] = 1
    return one_hot_vector

# One-hot encode the entire text
one_hot_encoded_text = [one_hot_encode(word, word_to_index, vocab_size) for word in words]

print("Vocabulary:", vocabulary)
print("One-Hot Encoded Text:")
for word, one_hot_vector in zip(words, one_hot_encoded_text):
    print(f"{word}: {one_hot_vector}")

Vocabulary: ['love', 'learning', 'I', 'machine']
One-Hot Encoded Text:
I: [0, 0, 1, 0]
love: [1, 0, 0, 0]
machine: [0, 0, 0, 1]
learning: [0, 1, 0, 0]


In [31]:
from collections import Counter

# Define the text
text = "I love machine learning, and I also love deep learning"

# Tokenize the text into words
words = text.split()

# Count word frequencies
word_frequencies = Counter(words)

# Create the vocabulary
vocabulary = list(word_frequencies.keys())
vocab_size = len(vocabulary)

# Create a mapping from word to index
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# Create Bag of Words representation
bag_of_words = [word_frequencies[word] for word in vocabulary]

print("Vocabulary:", vocabulary)
print("Bag of Words Representation:", bag_of_words)

Vocabulary: ['I', 'love', 'machine', 'learning,', 'and', 'also', 'deep', 'learning']
Bag of Words Representation: [2, 2, 1, 1, 1, 1, 1, 1]



# BAG OF WORDS (N-Grams)


In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# Define the corpus
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "I love natural language processing.",
    "Machine learning is an exciting field.",
    "The quick fox is quick."
]

# Create a CountVectorizer object with n-grams
# Here, we use ngram_range=(1, 2) for unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Fit the vectorizer to the corpus and transform the corpus into a BoW representation
bow_representation = vectorizer.fit_transform(corpus)

# Get the feature names (words and n-grams)
feature_names = vectorizer.get_feature_names_out()

# Print the feature names
print("Vecobulary:")
print(vectorizer.vocabulary_)

# Print the BoW representation
print("\nBag of Words Representation:")
print(bow_representation.toarray())

Vecobulary:
{'the quick': 18, 'quick brown': 15, 'brown fox': 1, 'fox jumps': 4, 'jumps over': 7, 'over the': 14, 'the lazy': 17, 'lazy dog': 9, 'love natural': 11, 'natural language': 13, 'language processing': 8, 'machine learning': 12, 'learning is': 10, 'is an': 5, 'an exciting': 0, 'exciting field': 2, 'quick fox': 16, 'fox is': 3, 'is quick': 6}

Bag of Words Representation:
[[0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 1]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0]
 [1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1]]


- Text 1 - Hello You are good
- Text 2 - Hello You are not good

### TF_IDF
- TF - Term Freq
- IDF - Inverse Document Freq


skip = 2
total words = 10
TF = 2/10

IDF = log(N/Num.of Documents)

TD_IDF = TF * IDF


Document 1: "The quick brown fox jumps over the lazy dog." , 2/9, 1/9, 1/9
Document 2: "A brown fox is quick fox." 2/6, 1/6, 1/6

IDF = log(1/2) = log(A/B)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the corpus
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A brown fox is a quick fox."
]

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a TF-IDF representation
tfidf_representation = vectorizer.fit_transform(corpus)

# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Print the feature names
print("Feature Names (Terms):")
print(feature_names)

# Print the TF-IDF representation
print("\nTF-IDF Representation:")
print(tfidf_representation.toarray())

Feature Names (Terms):
['brown' 'dog' 'fox' 'is' 'jumps' 'lazy' 'over' 'quick' 'the']

TF-IDF Representation:
[[0.2306165  0.32412345 0.2306165  0.         0.32412345 0.32412345
  0.32412345 0.2306165  0.6482469 ]
 [0.35409974 0.         0.70819948 0.49767483 0.         0.
  0.         0.35409974 0.        ]]


## Word2Vec
- Solves - Length, Sparsity, High Dimensionality, Semantic meaning
- How - Fix Lenght, Dense Array, Embeddings for the words


/        Man Women Boy Girl

King
Prince
Queen
Princess

In [39]:
from gensim.models import Word2Vec

# Example sentences
sentences = [
    ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
    ["the", "lazy", "dog", "sleeps"],
    ["the", "quick", "brown", "cat", "jumps"]
]

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Find similarity between two words
word1 = "dog"
word2 = "cat"
similarity = model.wv.similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity:.2f}")

# Find similar words to a given word
word = "fox"
similar_words = model.wv.most_similar(word)
print(f"Words similar to '{word}': {similar_words}")

Similarity between 'dog' and 'cat': -0.11
Words similar to 'fox': [('the', 0.21615321934223175), ('over', 0.04468922317028046), ('cat', 0.0019510718993842602), ('lazy', -0.03278514742851257), ('sleeps', -0.09326908737421036), ('dog', -0.09579558670520782), ('quick', -0.10513807833194733), ('jumps', -0.16937021911144257), ('brown', -0.17323409020900726)]


In [40]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import string

# Read the text from the file
with open("001ssb.txt", "r") as file:
    text = file.read()

# Tokenize the text into words
tokens = word_tokenize(text.lower())

# Remove punctuation from tokens
tokens = [word.translate(str.maketrans('', '', string.punctuation)) for word in tokens if word.isalpha()]

# Train a Word2Vec model
model = Word2Vec([tokens], vector_size=100, window=5, min_count=1, sg=1)

# Find word embeddings
word_embeddings = model.wv

# Print word embeddings for selected words
print("Word Embeddings:")
selected_words = ["dead", "songs", "royce", "night", "forest"]
for word in selected_words:
    print(word, ":", word_embeddings[word])

# Find similarity between two same words
word1 = "dead"
word2 = "dead"
similarity = model.wv.similarity(word1, word2)
print(f"\nSimilarity between '{word1}' and '{word2}': {similarity}")

Word Embeddings:
dead : [-1.36613557e-02 -5.22106234e-03 -3.98949236e-02  1.01797052e-01
 -6.61216453e-02 -2.95142502e-01  3.50258164e-02  4.37840164e-01
 -1.72982708e-01 -8.76349434e-02  3.96527722e-02 -2.08202541e-01
 -1.90473929e-01  7.31824711e-02 -1.15222797e-01 -8.30630660e-02
  1.61287189e-02 -1.58948332e-01 -7.70116225e-02 -3.50763917e-01
  9.63376164e-02  1.85087517e-01  3.75155061e-01 -2.01797560e-01
  3.81664708e-02  1.42399848e-01 -2.06275687e-01 -3.29811610e-02
 -1.43613338e-01 -6.28686696e-02  2.57043809e-01 -3.93779278e-02
  2.07556352e-01 -2.49498665e-01  1.23824090e-01 -2.62888856e-02
 -2.66149770e-02 -2.93261945e-01  4.97550890e-02 -4.12374198e-01
  7.35615566e-02 -2.60034263e-01  8.15649256e-02  1.56080619e-01
  3.99069190e-02 -1.20949045e-01 -3.47807497e-01  1.98426411e-01
  5.59718199e-02  1.61480367e-01  3.71649079e-02 -2.32466012e-02
  3.12347598e-02 -1.19775543e-02  5.80865629e-02  1.10150173e-01
  1.87940747e-02 -8.21881965e-02  5.22226021e-02  2.34655499e-01
 

In [41]:
word1 = "iron"
word2 = "sword"
similarity = model.wv.similarity(word1, word2)
print(f"\nSimilarity between '{word1}' and '{word2}': {similarity}")


Similarity between 'iron' and 'sword': 0.9987897276878357


In [42]:
# Find most similar words to a given word
word = "bran"
most_similar_words = model.wv.most_similar(word, topn=5)
print(f"\nMost similar words to '{word}':")
for similar_word, score in most_similar_words:
    print(f"{similar_word}: {score}")


Most similar words to 'bran':
face: 0.9988780617713928
greyjoy: 0.9988757371902466
father: 0.9988554120063782
your: 0.9988180994987488
he: 0.9988071322441101
