# NLP Phase-1: Text Processing

In [1]:
# Text processing pipeline
# Text -> Cleaning -> Tokenization -> Stopwords Removal -> Stemming/Lemmatization -> Vectorization -> ML Model

In [5]:
# Sample sentence for the Phase-1 pipeline. It mixes case ("Python" / "python")
# and ends with punctuation, which the following steps normalize.
text = "i am learning Python programming and python is fast!!.."
print(f"Original text : {text}")

Original text : i am learning Python programming and python is fast!!..


In [6]:
# Step-1: Convert to lowercase
# Otherwise "Python" and "python" would be treated as two different words.
text = text.lower()
print(f"Lowercase : {text}")

Lowercase : i am learning python programming and python is fast!!..


In [7]:
# Step-2: Remove Punctuation - !@#$%^&*()_;'.,
# re - regex - regular expression
import re

# [^\w\s] matches any character that is neither a word character nor whitespace.
# \w also matches "_" (and digits), so "_" is listed as a second alternative;
# without it underscores would survive even though they are punctuation here.
text = re.sub(r'[^\w\s]|_', '', text)
# Only letters, digits and whitespace remain.
print("Without punctuation:",text)

Without punctuation: i am learning python programming and python is fast


In [8]:
# Step-3: Tokenization (Split sentence into words)
# split() with no argument splits on any run of whitespace,
# so repeated spaces never produce empty tokens.
tokens = text.split()
print(f"Tokens : {tokens}")

Tokens : ['i', 'am', 'learning', 'python', 'programming', 'and', 'python', 'is', 'fast']


In [10]:
# Step-4: Remove Stopwords - very common words such as: a, am, is, i, the, and
stopwords = ["a","am","is","i","the","and"]

# Keep, in their original order, only the tokens that are not stopwords.
filtered_tokens = [token for token in tokens if token not in stopwords]

print(f"After removing stopwords: {filtered_tokens}")

After removing stopwords: ['learning', 'python', 'programming', 'python', 'fast']


In [11]:
def stem(word):
    """Return `word` with a trailing "ing" removed, when that leaves a usable stem.

    This is a deliberately tiny stemmer for demonstration purposes.

    Args:
        word: A single token (the pipeline lowercases tokens beforehand).

    Returns:
        The word without its "ing" suffix if the remaining part contains a
        vowel; otherwise the word unchanged.

    Notes:
        - Words such as "king", "sing" or "ing" are returned unchanged,
          because stripping the suffix would leave "k", "s" or "".
        - Doubled consonants are not collapsed: "programming" -> "programm".
    """
    suffix = "ing"
    if not word.endswith(suffix):
        return word

    base = word[:-len(suffix)]
    # Only accept the stem if it contains a vowel; "y" counts as a vowel
    # except at the very start of the stem (e.g. "flying" -> "fly").
    has_vowel = any(
        ch in "aeiou" or (ch == "y" and pos > 0)
        for pos, ch in enumerate(base)
    )
    return base if has_vowel else word

# Apply the stemmer to every remaining token, preserving order.
stemmed_words = [stem(token) for token in filtered_tokens]

print(f"After stemming : {stemmed_words}")

After stemming : ['learn', 'python', 'programm', 'python', 'fast']


# NLP Phase-2: Word to Vector

In [12]:
# Why Vectors ?
# ML models only understand numbers

# Bag of Words (BoW)

In [13]:
# Will convert text -> count
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Toy corpus: each string is one document (one row of the resulting matrix).
sentences = [
    "i am learning python and python",  # "python" occurs twice
    "we are happy today",
    "i am sad today",
]

In [20]:
vectorizer = CountVectorizer()  # create object
# fit_transform does two things in one call:
#   1. fit: learns the vocabulary (the distinct words) from the sentences
#   2. transform: converts each sentence into a row of word counts
X = vectorizer.fit_transform(sentences)
# X is a sparse matrix; call X.toarray() to view it as a dense array.

In [21]:
print("Vocabulary :",vectorizer.get_feature_names_out())
# 1. all unique words
# 2. sorted in alphabetical order
# 3. single-letter words such as "i" and "a" are dropped
#    (the default token pattern only keeps tokens of 2 or more characters)

Vocabulary : ['am' 'and' 'are' 'happy' 'learning' 'python' 'sad' 'today' 'we']


In [22]:
print("Vectors:")
print(X.toarray())
# Each row is a sentence
# Each column is a word (in the same order as the vocabulary)
# Each value is how many times that word occurs in that sentence

Vectors:
[[1 1 0 0 1 2 0 0 0]
 [0 0 1 1 0 0 0 1 1]
 [1 0 0 0 0 0 1 1 0]]


# TF-IDF = Term Frequency - Inverse Document Frequency

In [23]:
# Why TF-IDF ?
# Because count is not enough
# Bag of words treats all words equally

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Same three documents as in the Bag of Words section, so the TF-IDF weights
# can be compared directly with the raw counts.
sentences = [
    "i am learning python and python",
    "we are happy today",
    "i am sad today",
]

In [27]:
# NOTE: this rebinds `vectorizer` and `X`, replacing the Bag of Words objects.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)
# fit_transform performs these steps:
#   1. builds the vocabulary
#   2. calculates TF  - how often each word occurs in each sentence
#   3. calculates IDF - how rare each word is across all sentences
#   4. multiplies TF x IDF
#   5. produces the matrix (one row per sentence; with the default norm="l2"
#      each row is scaled to unit length)

In [28]:
# The vocabulary is identical to the Bag of Words one: TF-IDF changes the values, not the columns.
print("Vocabulary :",vectorizer.get_feature_names_out())

Vocabulary : ['am' 'and' 'are' 'happy' 'learning' 'python' 'sad' 'today' 'we']


In [29]:
print("TF-IDF Matrix :")
print(X.toarray())
# Rows are sentences and columns follow the vocabulary order, as in Bag of Words,
# but each value is a weight instead of a count.
# Example (third row): "today" appears in two sentences and gets a lower weight
# than "sad", which appears in only one.

TF-IDF Matrix :
[[0.29651988 0.38988801 0.         0.         0.38988801 0.77977602
  0.         0.         0.        ]
 [0.         0.         0.52863461 0.52863461 0.         0.
  0.         0.40204024 0.52863461]
 [0.51785612 0.         0.         0.         0.         0.
  0.68091856 0.51785612 0.        ]]


In [30]:
# Count + Impact of each word
# TF-IDF rewards rare important words and penalizes common words

In [31]:
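# Textbook definitions:
# TF  = count of the word in the document / total words in the document
# IDF = log(total_docs / documents_containing_word)
#
# Note: TfidfVectorizer's defaults differ slightly, so the matrix above will not
# match these formulas exactly:
#   TF  = raw count of the word in the document
#   IDF = ln((1 + total_docs) / (1 + documents_containing_word)) + 1
#   and every row is then L2-normalized (scaled to unit length).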