In [1]:
import pandas as pd

# Sample dataset
data = {'text': ["I love programming!", "NLP is fascinating.", "Data science is the future."]}
df = pd.DataFrame(data)

print(df)

                          text
0          I love programming!
1          NLP is fascinating.
2  Data science is the future.


In [2]:
# Sample dataset with duplicates
data = {'text': ["I love programming!", "I love programming!", "NLP is fascinating."]}
df = pd.DataFrame(data)

# Remove duplicates
df.drop_duplicates(inplace=True)

print(df)

                  text
0  I love programming!
2  NLP is fascinating.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = ["I love NLP", "NLP is fun"]

# Create BoW model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

print(X.toarray())  # Display the BoW matrix

[[0 0 1 1]
 [1 1 0 1]]


In [4]:
def get_bag_of_words(doc, vocab):
    word_count_dict = dict.fromkeys(vocab, 0)
    for word in doc.split():
        word_count_dict[word] += 1
    
    return list(word_count_dict.values())

# Example usage
vocab = ['I', 'love', 'NLP']
doc = 'I love NLP'
print(get_bag_of_words(doc, vocab))

[1, 1, 1]


In [7]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "ran", "easily", "fairly"]
stems = [stemmer.stem(word) for word in words]

print(stems)

['run', 'ran', 'easili', 'fairli']


In [12]:
text = "This is an example."
tokens = word_tokenize(text)

print(tokens)

['This', 'is', 'an', 'example', '.']


In [13]:
from nltk import ngrams

text = "I love NLP"
n_grams = list(ngrams(text.split(), 2))  # Bigrams

print(n_grams)

[('I', 'love'), ('love', 'NLP')]


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = ["I love programming", "Programming is fun"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

print(tfidf_matrix.toarray())

[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]


In [17]:
from gensim.models import Word2Vec

sentences = [["I", "love", "NLP"], ["NLP", "is", "fun"]]
model = Word2Vec(sentences, min_count=1)

# Get vector for a word
vector = model.wv['NLP']
print(vector)

[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.5080082e-03  2.46979

In [18]:
import numpy as np

def tfidf_weighted_avg_word2vec(words, model, tfidf):
    return np.mean([model.wv[word] * tfidf[word] for word in words if word in model.wv], axis=0)

# Example usage with dummy TF-IDF weights
words = ['I', 'love', 'NLP']
tfidf_weights = {'I': 0.5, 'love': 0.7, 'NLP': 0.9}
avg_vector = tfidf_weighted_avg_word2vec(words, model, tfidf_weights)

print(avg_vector)

[-3.27399559e-03  2.44761701e-03  2.88764626e-04  1.86964741e-03
 -1.09682046e-03 -2.11842917e-03  2.77741882e-03  4.66167694e-03
 -7.74427608e-04 -1.67730788e-03  3.35219293e-03  1.40458310e-03
 -1.80555135e-03  8.48808966e-04 -3.13561555e-04 -1.75445841e-03
  2.40402040e-03 -1.10370456e-05 -3.41602881e-03 -2.07135640e-03
  2.71236338e-03 -4.07869782e-04  5.54878265e-03  3.98697425e-03
 -8.32398597e-04 -3.03685811e-04  1.27708307e-03  3.54712573e-03
 -3.21885385e-03 -9.82971396e-04 -9.51390248e-04 -7.98562018e-04
  1.38282310e-03 -4.23684577e-03  1.62917329e-03  4.32349130e-04
  4.08497034e-03 -2.59135594e-03 -2.30917917e-03  1.08744961e-03
 -1.78378320e-03  1.56946538e-03 -2.78279674e-03 -4.65111254e-04
  2.73110997e-03 -5.80612279e-04 -1.06122391e-03  2.36334745e-03
  1.55698333e-03  1.03025278e-03 -7.78380781e-04  2.09105853e-03
 -1.81789289e-03 -2.75355601e-03  8.83463479e-04 -2.58820620e-03
  2.05008851e-04 -1.34867185e-03  6.80093654e-05  7.34508329e-04
  3.99932847e-04  2.27424