In [6]:
import nltk
import re
import numpy as np
import heapq
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the NLTK resources used below are present, so this cell also works
# on a fresh kernel regardless of whether a separate download cell ran first.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Load stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

# Read input text (explicit encoding so decoding does not depend on the platform default)
with open("/content/sample.txt", 'r', encoding='utf-8') as file:
    text = file.read()

# Sentence tokenization
sentences = sent_tokenize(text)

# Text preprocessing: lowercase each sentence, turn every non-word character
# into a space, collapse whitespace runs, then drop stopwords.
# Note: a sentence consisting only of stopwords becomes an empty string here.
cleaned_sentences = []
for sentence in sentences:
    sentence = sentence.lower()
    sentence = re.sub(r'\W', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    words = [word for word in sentence.split() if word not in stopwords]  # Remove stopwords
    cleaned_sentences.append(" ".join(words))

print("Dataset: ", cleaned_sentences)
print("Length: ", len(cleaned_sentences))

Dataset:  ['reached goal exhausted', 'even chilling euphoria thought feel upon reaching', 'something right', 'feeling five years hard work']
Length:  4


In [5]:
# Fetch the NLTK tokenizer models and the stopword corpus used by this notebook.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Count how many times each token occurs across all cleaned sentences
word2count = {}
for cleaned in cleaned_sentences:
    for token in word_tokenize(cleaned):
        if token not in word2count:
            word2count[token] = 0
        word2count[token] += 1

# Keep at most the 100 tokens with the highest counts, most frequent first
freq_words = heapq.nlargest(100, word2count, key=lambda token: word2count[token])
print(freq_words)
print(word2count)

['reached', 'goal', 'exhausted', 'even', 'chilling', 'euphoria', 'thought', 'feel', 'upon', 'reaching', 'something', 'right', 'feeling', 'five', 'years', 'hard', 'work']
{'reached': 1, 'goal': 1, 'exhausted': 1, 'even': 1, 'chilling': 1, 'euphoria': 1, 'thought': 1, 'feel': 1, 'upon': 1, 'reaching': 1, 'something': 1, 'right': 1, 'feeling': 1, 'five': 1, 'years': 1, 'hard': 1, 'work': 1}


In [8]:
# Bag of Words (BoW) - Raw Counts
# The vectorizer is given the frequent-word list as its fixed vocabulary.
vectorizer = CountVectorizer(vocabulary=freq_words)
bow_matrix = vectorizer.fit_transform(cleaned_sentences)

# One row per cleaned sentence, one column per vocabulary word
bow_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_columns)

print("\nBag of Words (Count Occurrences):", bow_df, sep="\n")


Bag of Words (Count Occurrences):
   reached  goal  exhausted  even  chilling  euphoria  thought  feel  upon  \
0        1     1          1     0         0         0        0     0     0   
1        0     0          0     1         1         1        1     1     1   
2        0     0          0     0         0         0        0     0     0   
3        0     0          0     0         0         0        0     0     0   

   reaching  something  right  feeling  five  years  hard  work  
0         0          0      0        0     0      0     0     0  
1         1          0      0        0     0      0     0     0  
2         0          1      1        0     0      0     0     0  
3         0          0      0        1     1      1     1     1  


In [10]:
# Bag of Words (Normalized Counts)
# Each row is divided by its own total so that the row sums to 1.
bow_counts = bow_matrix.toarray()  # densify once and reuse
row_totals = bow_counts.sum(axis=1, keepdims=True)
# A sentence with no in-vocabulary words has a total of 0; keep its row as
# all zeros instead of producing NaN (and a divide-by-zero warning).
bow_normalized = np.divide(
    bow_counts,
    row_totals,
    out=np.zeros(bow_counts.shape, dtype=float),
    where=row_totals != 0,
)
print("\nBoW (Normalized Counts):\n", bow_normalized)

# Example stop-word set handed to the TF-IDF vectorizer
stop_words_set = {'down', 'than', 'now', 'against', 'that', 'out', 'did', 'him', 'myself', 'and'}

# The set is converted to a list before being passed as stop_words
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words_set))
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)

# One row per cleaned sentence, one column per TF-IDF feature
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("\nTF-IDF Representation:", tfidf_df, sep="\n")


BoW (Normalized Counts):
 [[0.33333333 0.33333333 0.33333333 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.14285714 0.14285714 0.14285714
  0.14285714 0.14285714 0.14285714 0.14285714 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.2        0.2        0.2        0.2        0.2       ]]

TF-IDF Representation:
   chilling  euphoria      even  exhausted      feel   feeling      five  \
0  0.000000  0.000000  0.000000    0.57735  0.000000  0.000000  0.000000   
1  0.377964  0.377964  0.377964    0.00000  0.377964  0.000000  0.000000   

In [11]:
# Convert sentences to binary vectors: position i is 1 when freq_words[i]
# occurs in the sentence, otherwise 0.
X = []
for sentence in cleaned_sentences:
    # Tokenize once per sentence (not once per vocabulary word) and use a set
    # so each membership check is a constant-time lookup.
    sentence_tokens = set(word_tokenize(sentence))
    X.append([1 if word in sentence_tokens else 0 for word in freq_words])

value = np.asarray(X)
print("\nBinary Word Vector Representation:\n", value)


Binary Word Vector Representation:
 [[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1]]


In [12]:
# Tokenization for Word2Vec
tokenized_dataset = [word_tokenize(sentence) for sentence in cleaned_sentences]

# Train Word2Vec model using Skip-Gram approach (sg=1).
# The seed is set explicitly and training runs on a single worker thread,
# because multi-threaded training makes the learned vectors vary between runs.
# NOTE(review): gensim's docs say bit-exact reruns on Python 3 also need a fixed
# PYTHONHASHSEED — confirm if that level of reproducibility is required.
word2vec_model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    sg=1,
    seed=1,
)

# Get embedding for a sample word (reports when the word was never seen in training)
word = "learning"
if word in word2vec_model.wv:
    print(f"\nWord2Vec Embedding for '{word}':\n", word2vec_model.wv[word])
else:
    print(f"\n'{word}' not in vocabulary")

# Save the Word2Vec model
word2vec_model.save("word2vec.model")

# Get embeddings for all words in the model's vocabulary
word_embeddings = {token: word2vec_model.wv[token] for token in word2vec_model.wv.index_to_key}
print("\nAll Word Embeddings:\n", word_embeddings)


'learning' not in vocabulary

All Word Embeddings:
 {'work': array([-5.3622725e-04,  2.3643136e-04,  5.1033497e-03,  9.0092728e-03,
       -9.3029495e-03, -7.1168090e-03,  6.4588725e-03,  8.9729885e-03,
       -5.0154282e-03, -3.7633716e-03,  7.3805046e-03, -1.5334714e-03,
       -4.5366134e-03,  6.5540518e-03, -4.8601604e-03, -1.8160177e-03,
        2.8765798e-03,  9.9187379e-04, -8.2852151e-03, -9.4488179e-03,
        7.3117660e-03,  5.0702621e-03,  6.7576934e-03,  7.6286553e-04,
        6.3508903e-03, -3.4053659e-03, -9.4640139e-04,  5.7685734e-03,
       -7.5216377e-03, -3.9361035e-03, -7.5115822e-03, -9.3004224e-04,
        9.5381187e-03, -7.3191668e-03, -2.3337686e-03, -1.9377411e-03,
        8.0774371e-03, -5.9308959e-03,  4.5162440e-05, -4.7537340e-03,
       -9.6035507e-03,  5.0072931e-03, -8.7595852e-03, -4.3918253e-03,
       -3.5099984e-05, -2.9618145e-04, -7.6612402e-03,  9.6147433e-03,
        4.9820580e-03,  9.2331432e-03, -8.1579173e-03,  4.4957981e-03,
       -4.13707