1

In [None]:
import nltk

# Fetch the NLTK resources this notebook relies on into the local nltk_data directory.
nltk.download('punkt')  # presumably backs word_tokenize, which a later cell calls — confirm
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no visible cell does POS tagging — confirm this is still needed
nltk.download('wordnet')  # presumably the corpus behind WordNetLemmatizer — confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.tokenize import WhitespaceTokenizer

# Example sentence; the tokenizer cells that follow reuse this variable.
sentence = "The quick brown fox jumps over the lazy dog."

# Splits on whitespace only, so punctuation stays glued to the word next to it.
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(sentence)

print("Whitespace Tokenization:", whitespace_tokens, sep="\n")

Whitespace Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']


In [None]:
from nltk.tokenize import WordPunctTokenizer

# Separates word characters from punctuation, so the closing "." becomes a token of its own.
punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(sentence)

print("Punctuation-Based Tokenization:", punct_tokens, sep="\n")

Punctuation-Based Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
from nltk.tokenize import TreebankWordTokenizer

# Treebank-style tokens; the stemming and lemmatization cells reuse treebank_tokens.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sentence)

print("Treebank Tokenization:", treebank_tokens, sep="\n")

Treebank Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
from nltk.tokenize import TweetTokenizer

# Tokenizer created with its default options and applied to the same example sentence.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sentence)

print("Tweet Tokenization:", tweet_tokens, sep="\n")


Tweet Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [None]:
from nltk.tokenize import MWETokenizer

# No multi-word expressions are registered on the tokenizer, so the
# whitespace-split tokens it receives come back unchanged.
mwe_tokenizer = MWETokenizer()
mwe_tokens = mwe_tokenizer.tokenize(sentence.split())

print("MWE Tokenization:", mwe_tokens, sep="\n")

MWE Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']


In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer

# The two stemmers compared side by side on the Treebank tokens.
stemmers = [PorterStemmer(), SnowballStemmer("english")]

print("Stemming:")
for stemmer in stemmers:
    stemmed_words = list(map(stemmer.stem, treebank_tokens))
    # Label each result with the stemmer's class name.
    print(type(stemmer).__name__ + ":", stemmed_words, sep="\n")


Stemming:
PorterStemmer:
['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']
SnowballStemmer:
['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print("Lemmatization:")
# lemmatize() is called without a pos argument, so each token is looked up
# with the library's default part of speech.
lemmatized_words = list(map(lemmatizer.lemmatize, treebank_tokens))
print(lemmatized_words)


Lemmatization:
['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']


2

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [None]:
# Toy corpus: four short sentences, one document per list entry.
data = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog barks loudly.",
    "The cat sleeps lazily.",
    "The fox and the dog are friends.",
]


In [None]:
# Bag-of-Words: raw term counts per document.
# Tokenize every document with NLTK first.
tokenized_data = list(map(word_tokenize, data))

# CountVectorizer is fitted on strings, so each token list is re-joined with spaces.
count_vectorizer = CountVectorizer()
count_occurrence = count_vectorizer.fit_transform(
    [" ".join(doc_tokens) for doc_tokens in tokenized_data]
)

# Vocabulary terms as reported by the fitted vectorizer (reused by a later cell).
feature_names = count_vectorizer.get_feature_names_out()

# Show the document-term count matrix as a dense array.
print("Bag-of-Words - Count Occurrence:", count_occurrence.toarray(), sep="\n")


Bag-of-Words - Count Occurrence:
[[0 0 0 1 0 1 1 0 1 0 1 0 1 1 0 2]
 [0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1]
 [1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 2]]


In [None]:
import numpy as np

# Calculate the normalized count occurrence: each row of counts divided by
# that document's total count.
# The work is done on a dense ndarray because dividing the sparse matrix by its
# row sums directly produces a legacy numpy.matrix, and a document with no
# counted terms would divide 0 by 0 (NaN plus a RuntimeWarning).
dense_counts = count_occurrence.toarray()
row_totals = dense_counts.sum(axis=1, keepdims=True)
normalized_count_occurrence = np.divide(
    dense_counts,
    row_totals,
    out=np.zeros(dense_counts.shape, dtype=float),  # rows with no terms stay all-zero
    where=row_totals != 0,
)

# Print the normalized count occurrence matrix
print("Bag-of-Words - Normalized Count Occurrence:")
print(normalized_count_occurrence)


Bag-of-Words - Normalized Count Occurrence:
[[0.         0.         0.         0.11111111 0.         0.11111111
  0.11111111 0.         0.11111111 0.         0.11111111 0.
  0.11111111 0.11111111 0.         0.22222222]
 [0.         0.         0.25       0.         0.         0.25
  0.         0.         0.         0.         0.         0.25
  0.         0.         0.         0.25      ]
 [0.         0.         0.         0.         0.25       0.
  0.         0.         0.         0.25       0.         0.
  0.         0.         0.25       0.25      ]
 [0.14285714 0.14285714 0.         0.         0.         0.14285714
  0.14285714 0.14285714 0.         0.         0.         0.
  0.         0.         0.         0.28571429]]


In [None]:
# TF-IDF weighting over the same space-joined token strings used for the counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(
    [" ".join(doc_tokens) for doc_tokens in tokenized_data]
)

# Show the TF-IDF matrix as a dense array.
print("TF-IDF:", tfidf.toarray(), sep="\n")


TF-IDF:
[[0.         0.         0.         0.37481119 0.         0.23923713
  0.29550545 0.         0.37481119 0.         0.37481119 0.
  0.37481119 0.37481119 0.         0.39118406]
 [0.         0.         0.61087812 0.         0.         0.38991559
  0.         0.         0.         0.         0.         0.61087812
  0.         0.         0.         0.31878155]
 [0.         0.         0.         0.         0.55280532 0.
  0.         0.         0.         0.55280532 0.         0.
  0.         0.         0.55280532 0.28847675]
 [0.44201611 0.44201611 0.         0.         0.         0.28213316
  0.34849058 0.44201611 0.         0.         0.         0.
  0.         0.         0.         0.46132469]]


In [None]:
# Train the Word2Vec model.
# Tokens are lowercased first so the model vocabulary lines up with
# feature_names, which come from CountVectorizer (it lowercases by default).
lowercased_data = [[token.lower() for token in tokens] for tokens in tokenized_data]

# gensim >= 4.0 takes `vector_size`; the former `size` keyword raises a TypeError there.
# A fixed seed with a single worker makes runs repeatable (gensim notes that
# PYTHONHASHSEED must also be fixed for full determinism).
word2vec_model = Word2Vec(
    lowercased_data, vector_size=100, window=5, min_count=1, seed=42, workers=1
)

# Get the word embeddings
word_embeddings = word2vec_model.wv

# Print the word embeddings for each word
print("Word Embeddings:")
for word in feature_names:
    # The two tokenizers can disagree, so skip any vectorizer term the model
    # never saw instead of failing with a KeyError.
    if word in word_embeddings:
        print(word, ":", word_embeddings[word])
