In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Sample tweets list
tweets = [
    "I'm getting on Borderlands and I will murder you all!",
    "Borderlands is such a fun game! I love it!",
    "Murder is bad, but this game is good 😄",
    "I’m logging off now, see you all tomorrow!"
]

In [3]:
from nltk.tokenize import word_tokenize

# Lowercase and tokenize
tokenized_tweets = [word_tokenize(tweet.lower()) for tweet in tweets]
print("Tokenized Tweets:\n", tokenized_tweets)

Tokenized Tweets:
 [['i', "'m", 'getting', 'on', 'borderlands', 'and', 'i', 'will', 'murder', 'you', 'all', '!'], ['borderlands', 'is', 'such', 'a', 'fun', 'game', '!', 'i', 'love', 'it', '!'], ['murder', 'is', 'bad', ',', 'but', 'this', 'game', 'is', 'good', '😄'], ['i', '’', 'm', 'logging', 'off', 'now', ',', 'see', 'you', 'all', 'tomorrow', '!']]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw count
vectorizer = CountVectorizer()
bow_counts = vectorizer.fit_transform(tweets)
print("Vocabulary:\n", vectorizer.get_feature_names_out())
print("\nCount Matrix:\n", bow_counts.toarray())

# Normalized count (Term Frequency)
import pandas as pd
tf_matrix = bow_counts.toarray() / bow_counts.toarray().sum(axis=1, keepdims=True)
tf_df = pd.DataFrame(tf_matrix, columns=vectorizer.get_feature_names_out())
print("\nNormalized Term Frequency Matrix:\n", tf_df)

Vocabulary:
 ['all' 'and' 'bad' 'borderlands' 'but' 'fun' 'game' 'getting' 'good' 'is'
 'it' 'logging' 'love' 'murder' 'now' 'off' 'on' 'see' 'such' 'this'
 'tomorrow' 'will' 'you']

Count Matrix:
 [[1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1]
 [0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 1 0 1 0 1 2 0 0 0 1 0 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 1]]

Normalized Term Frequency Matrix:
         all    and    bad  borderlands    but       fun      game  getting  \
0  0.125000  0.125  0.000     0.125000  0.000  0.000000  0.000000    0.125   
1  0.000000  0.000  0.000     0.142857  0.000  0.142857  0.142857    0.000   
2  0.000000  0.000  0.125     0.000000  0.125  0.000000  0.125000    0.000   
3  0.142857  0.000  0.000     0.000000  0.000  0.000000  0.000000    0.000   

    good        is  ...  murder       now       off     on       see  \
0  0.000  0.000000  ...   0.125  0.000000  0.000000  0.125  0.000000   
1  0.000  0.142857  ...   0.000  0

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)
print("TF-IDF Vocabulary:\n", tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())

TF-IDF Vocabulary:
 ['all' 'and' 'bad' 'borderlands' 'but' 'fun' 'game' 'getting' 'good' 'is'
 'it' 'logging' 'love' 'murder' 'now' 'off' 'on' 'see' 'such' 'this'
 'tomorrow' 'will' 'you']

TF-IDF Matrix:
 [[0.30956515 0.39264414 0.         0.30956515 0.         0.
  0.         0.39264414 0.         0.         0.         0.
  0.         0.30956515 0.         0.         0.39264414 0.
  0.         0.         0.         0.39264414 0.30956515]
 [0.         0.         0.         0.32555709 0.         0.41292788
  0.32555709 0.         0.         0.32555709 0.41292788 0.
  0.41292788 0.         0.         0.         0.         0.
  0.41292788 0.         0.         0.         0.        ]
 [0.         0.         0.35968533 0.         0.35968533 0.
  0.28358005 0.         0.35968533 0.56716009 0.         0.
  0.         0.28358005 0.         0.         0.         0.
  0.         0.35968533 0.         0.         0.        ]
 [0.31553666 0.         0.         0.         0.         0.
  0.        

In [6]:
from gensim.models import Word2Vec

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_tweets, vector_size=50, window=5, min_count=1, workers=4)

# Check embedding for a word
word = "game"
if word in w2v_model.wv:
    print(f"Word2Vec embedding for '{word}':\n", w2v_model.wv[word])
else:
    print(f"'{word}' not in vocabulary")

Word2Vec embedding for 'game':
 [-0.01427302  0.00248307 -0.0143419  -0.00448909  0.00743625  0.01166134
  0.00240047  0.00420798 -0.00822306  0.01444267 -0.01261127  0.00929099
 -0.01643045  0.00406776 -0.00995342 -0.0084942  -0.00621273  0.01130777
  0.01159351 -0.00995915  0.00154445 -0.01698569  0.01562924  0.01850968
 -0.00549363  0.00160748  0.00149222  0.01095682 -0.01721063  0.00117423
  0.01373279  0.00446473  0.00225383 -0.01864095  0.01696353 -0.01252604
 -0.00598735  0.00698181 -0.00155383  0.00282455  0.00357028 -0.01365527
 -0.01944894  0.01808634  0.01240194 -0.01383419  0.00680082  0.00040813
  0.00950675 -0.01423853]
