In [5]:
# === Text Representation Techniques Demo ===

# Sample corpus: three short English sentences used as the shared input for
# the demos below. Both "cat" (sentences 1 and 2) and "cats" (sentence 3)
# occur, which are the two words the embedding sections look up.
corpus = [
    "The cat sat on the mat",
    "The dog barked at the cat",
    "Dogs and cats are friends"
]

# 1. One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Vocabulary of the whole corpus: join the documents, lower-case, split on
# whitespace and de-duplicate. Sorting gives `vocab` a stable order; iterating
# a bare set of strings can produce a different order on each interpreter run.
vocab = sorted(set(" ".join(corpus).lower().split()))

# The encoder is fitted on a single-column 2-D array (one token per row),
# hence the reshape(-1, 1).
onehot = OneHotEncoder(sparse_output=False)
onehot.fit(np.array(vocab).reshape(-1, 1))

print("1. One-hot vector for 'cat':")
# The query uses the same layout: one row, one column.
print(onehot.transform([["cat"]]))


# 2. Bag-of-Words
from sklearn.feature_extraction.text import CountVectorizer

# A default CountVectorizer is fitted on the corpus and, in the same call,
# produces the document-term count matrix.
bow = CountVectorizer()
X_bow = bow.fit_transform(corpus)

# Header and densified matrix are emitted on consecutive lines; the learned
# feature names are printed after them.
print(
    "\n2. Bag-of-Words representation (doc x vocab):",
    X_bow.toarray(),
    sep="\n",
)
print("Vocabulary:", bow.get_feature_names_out())


# 3. TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Same workflow as the count-based demo, but with a default TfidfVectorizer:
# fit on the corpus and transform it in one call.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)

# Print the header, then the dense weight matrix, then the feature names.
print(
    "\n3. TF-IDF representation:",
    X_tfidf.toarray(),
    sep="\n",
)
print("Vocabulary:", tfidf.get_feature_names_out())


# 4. Word2Vec (using gensim)
from gensim.models import Word2Vec

# Tokenise every document: lower-case first, then split on whitespace.
# `sentences` is a list of token lists and is reused by the FastText demo.
sentences = list(map(str.split, map(str.lower, corpus)))

# Train a small model with 50-dimensional vectors directly on the tokenised corpus.
w2v = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
)

# Only the first 10 components of the 'cat' vector are shown.
print(
    "\n4. Word2Vec embedding for 'cat':",
    w2v.wv["cat"][:10],
    sep="\n",
)


# 5. GloVe (via gensim downloader)
import gensim.downloader as api

# Load the pretrained "glove-wiki-gigaword-50" vectors by name through
# gensim's downloader module.
glove = api.load("glove-wiki-gigaword-50")

# As in the Word2Vec demo, show just the first 10 components for 'cat'.
print(
    "\n5. GloVe embedding for 'cat':",
    glove["cat"][:10],
    sep="\n",
)


# 6. FastText
from gensim.models.fasttext import FastText

# Trained on the tokenised `sentences` built in the Word2Vec section, with the
# same vector size, window and min_count.
ft = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
)

# Look up the plural form; note that 'cats' occurs in the third corpus
# sentence, so it is a token the model was trained on. First 10 dims only.
print(
    "\n6. FastText embedding for 'cats':",
    ft.wv["cats"][:10],
    sep="\n",
)


# 7. ELMo (via allennlp)
# from allennlp.commands.elmo import ElmoEmbedder
# elmo = ElmoEmbedder()
# tokens = ["the", "cat", "sat"]
# embeddings = elmo.embed_sentence(tokens)
# print("\n7. ELMo embedding for 'cat' (dim):", embeddings.shape)


# 8. BERT
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are both loaded from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# return_tensors="pt" yields PyTorch tensors that can be unpacked straight
# into the model call.
inputs = tokenizer("The cat sat on the mat", return_tensors="pt")

# Inference only: under no_grad() autograd does not record the forward pass,
# so nothing is retained for a backward pass that never happens here.
with torch.no_grad():
    outputs = model(**inputs)

# Only the shape of the final hidden states is reported.
print("\n8. BERT embedding shape:", outputs.last_hidden_state.shape)


# 9. Sentence-BERT (sentence embeddings)
from sentence_transformers import SentenceTransformer

# Load the pretrained "all-MiniLM-L6-v2" checkpoint by name.
sbert = SentenceTransformer("all-MiniLM-L6-v2")

# Encode one sentence passed as a plain string; only the shape of the
# resulting embedding is printed.
sent_emb = sbert.encode("The cat sat on the mat")
print(
    "\n9. Sentence-BERT embedding (dim):",
    sent_emb.shape,
)


# 10. GPT-like embeddings (OpenAI or HuggingFace models)
from transformers import AutoTokenizer, AutoModel
import torch

# The Auto* classes resolve the tokenizer and model from the "gpt2" checkpoint name.
tok = AutoTokenizer.from_pretrained("gpt2")
mdl = AutoModel.from_pretrained("gpt2")

# Tokenise to PyTorch tensors so the encoding can be unpacked into the model call.
inp = tok("The cat sat on the mat", return_tensors="pt")

# Inference only: disable autograd recording for the forward pass, since no
# gradients are ever requested from these outputs.
with torch.no_grad():
    out = mdl(**inp)

# Only the shape of the final hidden states is reported.
print("\n10. GPT2 embedding shape:", out.last_hidden_state.shape)


1. One-hot vector for 'cat':
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

2. Bag-of-Words representation (doc x vocab):
[[0 0 0 0 1 0 0 0 0 1 1 1 2]
 [0 0 1 1 1 0 1 0 0 0 0 0 2]
 [1 1 0 0 0 1 0 1 1 0 0 0 0]]
Vocabulary: ['and' 'are' 'at' 'barked' 'cat' 'cats' 'dog' 'dogs' 'friends' 'mat' 'on'
 'sat' 'the']

3. TF-IDF representation:
[[0.         0.         0.         0.         0.31331607 0.
  0.         0.         0.         0.41197298 0.41197298 0.41197298
  0.62663214]
 [0.         0.         0.41197298 0.41197298 0.31331607 0.
  0.41197298 0.         0.         0.         0.         0.
  0.62663214]
 [0.4472136  0.4472136  0.         0.         0.         0.4472136
  0.         0.4472136  0.4472136  0.         0.         0.
  0.        ]]
Vocabulary: ['and' 'are' 'at' 'barked' 'cat' 'cats' 'dog' 'dogs' 'friends' 'mat' 'on'
 'sat' 'the']

4. Word2Vec embedding for 'cat':
[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.0

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



9. Sentence-BERT embedding (dim): (384,)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



10. GPT2 embedding shape: torch.Size([1, 6, 768])
