In [1]:
import numpy as np
import gensim.downloader as api
import inflect
import re

In [None]:
# === CONFIG ===
MAX_LEN = 32            # rows per sentence matrix: longer sentences are truncated, shorter ones zero-padded
EMBED_DIM = 300         # width of every token vector; presumably matches the 300-d word vectors loaded later
MAX_SENTENCES = 2000    # upper bound on how many sentences are read from INPUT_FILE
INPUT_FILE = "data/Sentences_AllAgree.txt"
# NOTE(review): `/` is true division, so for MAX_SENTENCES = 2000 this renders as
# "data/embeddings2.0k.bin". The path is left untouched here because downstream
# readers may already depend on it; confirm before renaming.
OUTPUT_BIN = f"data/embeddings{MAX_SENTENCES/1000}k.bin"
SEED = 42               # fixes the random fallback vector so re-runs produce identical output

# === INIT ===
inf = inflect.engine()
# Shared fallback vector for number-like tokens that have no entry in the word-vector model.
# Drawn from a seeded, local generator (not the global NumPy RNG state) so that
# Restart & Run All yields a byte-identical embeddings file.
NUM_VEC = np.random.default_rng(SEED).normal(scale=0.1, size=(EMBED_DIM,)).astype(np.float32)

In [5]:
print("Loading Word2Vec model (Google News 300)...")
model = api.load("word2vec-google-news-300")

# === Load Sentences ===
print(f" Reading up to {MAX_SENTENCES} sentences from: {INPUT_FILE}")
sentences = []
with open(INPUT_FILE, encoding='latin1') as src:
    for raw_line in src:
        # Only lines containing '@' are kept. Everything before the LAST '@' is the
        # sentence; whatever follows it (presumably a label) is discarded.
        if '@' in raw_line:
            sentence_text = raw_line.strip().rpartition('@')[0]
            sentences.append(sentence_text.strip())
        # The cap is evaluated after every line, whether or not it was kept.
        if len(sentences) >= MAX_SENTENCES:
            break

print(f" Loaded {len(sentences)} sentences.")

# === Tokenization with Number-to-Word Expansion ===
def number_to_words(tok):
    """Spell a numeric token out as a list of single lowercase English words.

    inflect renders a number as one phrase, e.g. "two thousand", "twenty-one" or
    "one thousand, two hundred and thirty-four". Returned as a single token, such
    a phrase can never match a one-word vocabulary entry, so every multi-word
    number would end up as an all-zero vector. The phrase is therefore broken
    into its individual words (spaces, commas and hyphens are dropped).

    Args:
        tok: A string of digits, optionally containing one decimal point
            (e.g. "20" or "3.14").

    Returns:
        list[str]: For an integer, the words of its spelled-out form. For a
        decimal, the words of the integer part, then 'point', then one
        spelled-out word per fractional digit.
    """
    def _spell(digits):
        # Split inflect's phrase into bare alphabetic words.
        return re.findall(r'[a-z]+', inf.number_to_words(digits).lower())

    if '.' in tok:
        parts = tok.split('.')
        words = _spell(parts[0]) + ['point']
        # Fractional digits are read out one at a time ("14" -> "one", "four").
        for digit in parts[1]:
            words.extend(_spell(digit))
        return words
    return _spell(tok)

def tokenize_numberized(sentence):
    """Lower-case a sentence and split it into tokens, spelling numbers out.

    Tokens are decimals ("3.5"), digit runs ("20") or word-character runs.
    A digit run is tried before the generic word pattern, so digits at the
    start of a mixed token are split off ("123abc" -> "123", "abc").
    Purely numeric tokens are replaced by the output of number_to_words();
    all other tokens are kept unchanged.

    Args:
        sentence: Raw sentence text.

    Returns:
        list[str]: The token sequence with numeric tokens expanded.
    """
    expanded = []
    for piece in re.findall(r'\b\d+\.\d+|\d+|\w+\b', sentence.lower()):
        is_numeric = re.fullmatch(r'\d+\.\d+|\d+', piece) is not None
        expanded += number_to_words(piece) if is_numeric else [piece]
    return expanded

# === Sentence to Matrix ===
def sentence_to_matrix(sentence):
    """Turn one sentence into a fixed-size stack of token vectors.

    The sentence is tokenized with tokenize_numberized() and cut to the first
    MAX_LEN tokens. Each token becomes one row:
      * its vector from `model`, when the token is found there;
      * otherwise NUM_VEC, when the token is all digits or one of
        'point', 'percent', 'million', 'billion';
      * otherwise a zero vector.
    Zero rows are appended until there are MAX_LEN rows.

    Args:
        sentence: Raw sentence text.

    Returns:
        numpy.ndarray: The rows stacked with np.stack; MAX_LEN rows of
        EMBED_DIM-length vectors, assuming the model's vectors have length
        EMBED_DIM.
    """
    numeric_words = ['point', 'percent', 'million', 'billion']
    rows = []
    for word in tokenize_numberized(sentence)[:MAX_LEN]:
        if word in model:
            row = model[word]
        elif word.isdigit() or word in numeric_words:
            # Out-of-vocabulary number-like token: use the shared fallback vector.
            row = NUM_VEC
        else:
            row = np.zeros(EMBED_DIM, dtype=np.float32)
        rows.append(row)

    # Right-pad short sentences with zero rows (no-op when already MAX_LEN long).
    missing = MAX_LEN - len(rows)
    rows.extend(np.zeros(EMBED_DIM, dtype=np.float32) for _ in range(missing))
    return np.stack(rows)

# === Embed All Sentences ===
print(" Generating embeddings...")
# One matrix per sentence, stacked along a new leading axis.
all_embeddings = np.stack(list(map(sentence_to_matrix, sentences)))
print(f" Final shape: {all_embeddings.shape} (sentences × tokens × dim)")

# === Save to .bin ===
# tofile() writes only the raw array values (no shape or dtype header), so any
# reader must already know the dimensions printed above and that values are float32.
print(f" Saving to: {OUTPUT_BIN}")
embeddings_f32 = all_embeddings.astype(np.float32)
embeddings_f32.tofile(OUTPUT_BIN)

Loading Word2Vec model (Google News 300)...
 Reading up to 2000 sentences from: D:\ML_DL\Custom-CUDA-self-attention\Sentences_AllAgree.txt
 Loaded 2000 sentences.
 Generating embeddings...
 Final shape: (2000, 32, 300) (sentences × tokens × dim)
 Saving to: embeddings.bin
