### Bag Of Words

In [87]:
#Initialize vectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

vectorizer = CountVectorizer(max_features = 100)

example_texts = ["It's actually the best scientific Biopic made after The theory of everything!!",
                "Overall PERFECT , PRECISE WITH B/W REFERENCE, OPPENHEIMER IS A MUST WATCH'ONLY IN IMAX FORMAT IF POSSIBLE AS IT GIVES A BIG STADIUM TYPE EXPERIENCE."]
vectorizer.fit(example_texts)

# Top-10 words
vectorizer.get_feature_names()

['actually',
 'after',
 'as',
 'best',
 'big',
 'biopic',
 'everything',
 'experience',
 'format',
 'gives',
 'if',
 'imax',
 'in',
 'is',
 'it',
 'made',
 'must',
 'of',
 'only',
 'oppenheimer',
 'overall',
 'perfect',
 'possible',
 'precise',
 'reference',
 'scientific',
 'stadium',
 'the',
 'theory',
 'type',
 'watch',
 'with']

In [88]:
vectorized_example_texts = vectorizer.transform(example_texts)

data = pd.DataFrame(vectorized_example_texts.toarray())
data.columns = vectorizer.get_feature_names()
data

Unnamed: 0,actually,after,as,best,big,biopic,everything,experience,format,gives,...,possible,precise,reference,scientific,stadium,the,theory,type,watch,with
0,1,1,0,1,0,1,1,0,0,0,...,0,0,0,1,0,2,1,0,0,0
1,0,0,1,0,1,0,0,1,1,1,...,1,1,1,0,1,0,0,1,1,1


### TF-IDF (Term Frequency – Inverse Document Frequency)

In [89]:
#calculate tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on train texts
vectorizer = TfidfVectorizer(max_features = 200, norm = None) # take top 200 words
vectorizer.fit(example_texts)

# Top-10 words
vectorizer.get_feature_names_out()[:40]

array(['actually', 'after', 'as', 'best', 'big', 'biopic', 'everything',
       'experience', 'format', 'gives', 'if', 'imax', 'in', 'is', 'it',
       'made', 'must', 'of', 'only', 'oppenheimer', 'overall', 'perfect',
       'possible', 'precise', 'reference', 'scientific', 'stadium', 'the',
       'theory', 'type', 'watch', 'with'], dtype=object)

In [18]:
vectorized_example_texts_tf_idf = vectorizer.transform(example_texts)

data = pd.DataFrame(vectorized_example_texts_tf_idf.toarray())
data.columns = vectorizer.get_feature_names_out()
data

Unnamed: 0,actually,after,as,best,big,biopic,everything,experience,format,gives,...,possible,precise,reference,scientific,stadium,the,theory,type,watch,with
0,1.405465,1.405465,0.0,1.405465,0.0,1.405465,1.405465,0.0,0.0,0.0,...,0.0,0.0,0.0,1.405465,0.0,2.81093,1.405465,0.0,0.0,0.0
1,0.0,0.0,1.405465,0.0,1.405465,0.0,0.0,1.405465,1.405465,1.405465,...,1.405465,1.405465,1.405465,0.0,1.405465,0.0,0.0,1.405465,1.405465,1.405465


### Preprocessing

In [90]:
import string # library for working with strings
import nltk   # Natural Language Toolkit
import numpy as np # Library for numerical, matrix computations

In [91]:
# download the list of stopwords for English
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

# example of stop words
stop_words[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lakhty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [92]:
# punctuation
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [93]:
# We initialize the WordPunctTokenizer, with which we will then split the text into words.
word_tokenizer = nltk.WordPunctTokenizer()

In [94]:
dates = [str(x) for x in np.arange(1900, 2022)]
def preprocess_data(data):
    texts = []
    targets = []
    
    # iterate over reviews
    for item in data:
               
        text_lower = item.lower() # lowercase texts
        tokens     = word_tokenizer.tokenize(text_lower) #split text into words
        
        # remove punctuation and stop words
        tokens = [word for word in tokens if (word not in string.punctuation and word not in stop_words and word not in dates)]
        texts.append(tokens) # add into the list
    
    return texts

In [95]:
texts = preprocess_data(example_texts)

In [96]:
print("Tokens: ", texts[0])

Tokens:  ['actually', 'best', 'scientific', 'biopic', 'made', 'theory', 'everything', '!!']


In [97]:
# Punctuation removal - regex
import re

clean_text = re.sub(r'!+', '', example_texts[0])
clean_text  

"It's actually the best scientific Biopic made after The theory of everything"

In [98]:
# example of feature for the sentiment classification task
def count_word_occurrences(text, word):
    word_count = text.count(word)
    return word_count

word = "!"

result = count_word_occurrences(example_texts[0], word)
print(f"The word '{word}' appears {result} times in the text.")

The word '!' appears 2 times in the text.


In [99]:
# stemming

from nltk.stem.snowball import SnowballStemmer 

# initialize stemmer
stemmer = SnowballStemmer("english")

In [100]:
# examples of stemming using NLTK
for word in clean_text.split():
    word_stem = stemmer.stem(word)
    print("Before: %s, After: %s" % (word, word_stem))

Before: It's, After: it
Before: actually, After: actual
Before: the, After: the
Before: best, After: best
Before: scientific, After: scientif
Before: Biopic, After: biopic
Before: made, After: made
Before: after, After: after
Before: The, After: the
Before: theory, After: theori
Before: of, After: of
Before: everything, After: everyth


In [101]:
# examples of lemmatization using NLTK
# import these modules
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

for word in clean_text.split():
    word_norm = lemmatizer.lemmatize(word)
    print("Before: %s.\t\t\tAfter: %s" % (word, word_norm))

Before: It's.			After: It's
Before: actually.			After: actually
Before: the.			After: the
Before: best.			After: best
Before: scientific.			After: scientific
Before: Biopic.			After: Biopic
Before: made.			After: made
Before: after.			After: after
Before: The.			After: The
Before: theory.			After: theory
Before: of.			After: of
Before: everything.			After: everything


In [76]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/lakhty/nltk_data...


True

### Last but Not Least ... spaCy 🤗 

In [69]:
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp(example_texts[0])
entities = [(ent.text, ent.pos_, ent.lemma_) for ent in doc]

for entity, pos, lemma in entities:
    print(f"Entity: {entity} \nPOS: {pos}\nLemma: {lemma}")
    print ()

Entity: It 
POS: PRON
Lemma: it

Entity: 's 
POS: AUX
Lemma: be

Entity: actually 
POS: ADV
Lemma: actually

Entity: the 
POS: DET
Lemma: the

Entity: best 
POS: ADV
Lemma: well

Entity: scientific 
POS: ADJ
Lemma: scientific

Entity: Biopic 
POS: PROPN
Lemma: Biopic

Entity: made 
POS: VERB
Lemma: make

Entity: after 
POS: ADP
Lemma: after

Entity: The 
POS: DET
Lemma: the

Entity: theory 
POS: NOUN
Lemma: theory

Entity: of 
POS: ADP
Lemma: of

Entity: everything 
POS: PRON
Lemma: everything

Entity: ! 
POS: PUNCT
Lemma: !

Entity: ! 
POS: PUNCT
Lemma: !



In [66]:
len(nlp.vocab)

776

### ✨ Bonus materials ✨

**Useful NLP sources**:

- spaCy documentation: https://spacy.io/
- excellent article on Medium about NLP: https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e
- NLP overview: https://tfduque.medium.com/dissecting-natural-language-processing-layer-by-layer-an-introductory-overview-d11cfff4f329

**Advanced**:
- NLP Newsletter by Sebastian Ruder: https://www.ruder.io/nlp-news/
- The Batch: https://read.deeplearning.ai/the-batch/
- MLOps: https://mlops.substack.com/
- Data Is Plural: https://www.data-is-plural.com/
- Reddit: https://www.reddit.com/r/MachineLearning/ https://www.reddit.com/r/LanguageTechnology/
- NLP Newsletter by Elvis https://substack.com/@elvissaravia - https://dair.ai/newsletter/
- Import AI https://jack-clark.net/
- The Gradient https://thegradient.pub/
- The Medical Futurist https://medicalfuturist.com/


**Twitter sources**:

@omarsar0
@svpino
@davisblalock
@mervenoyann
@jeremyphoward
@paperswithcode