Creating **Word2Vec from scratch (without using `gensim`)** is absolutely possible and a great learning exercise to understand how word embeddings work under the hood.

Here’s a simple, conceptual **implementation of Skip-Gram Word2Vec** from scratch using just **NumPy**, trained on your own text data.

---

## 🧠 Build Word2Vec (Skip-Gram) from Scratch using NumPy

---

### 🔹 1. Sample Dataset (Your Own Sentences)

```python
# Toy training corpus: each string is one sentence about a phone, its camera
# or its battery. Swap in your own sentences to train on different text.
corpus = [
    "I love this phone",
    "This camera is amazing",
    "Battery life is great",
    "The phone has excellent battery",
    "I love the camera and battery",
    "This phone is awful",
    "I hate this battery",
    "The camera is bad"
]
```

---

### 🔹 2. Preprocess: Tokenization & Vocabulary

```python
import numpy as np
from collections import defaultdict
import re

# Clean and tokenize
def tokenize(sentences):
    """Lower-case every sentence and split it into word tokens.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        lower-cased runs of word characters found in that sentence, so
        punctuation and whitespace are dropped.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [word_pattern.findall(text.lower()) for text in sentences]

tokenized_corpus = tokenize(corpus)

# Build vocabulary
# The unique words are sorted before indices are assigned. Iterating a bare
# set of strings can produce a different order on every interpreter run (string
# hashing is randomised), which would silently change which row of the weight
# matrices belongs to which word between runs.
vocab = sorted({word for sent in tokenized_corpus for word in sent})

# Two-way lookup tables: word -> row index and row index -> word.
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}

vocab_size = len(vocab)
print("Vocabulary:", word2idx)
```

---

### 🔹 3. Generate Skip-Gram Pairs

```python
def generate_skip_grams(tokenized, window_size=2):
    """Build (center, context) word pairs for Skip-Gram training.

    Args:
        tokenized: List of sentences, each a list of word tokens.
        window_size: How many positions to the left and to the right of the
            center word count as its context.

    Returns:
        A list of ``(center_word, context_word)`` tuples. Pairs never cross a
        sentence boundary and a position is never paired with itself.
    """
    pairs = []
    for sent in tokenized:
        sent_len = len(sent)
        for center_pos, center_word in enumerate(sent):
            # Clamp the window to the sentence instead of testing every offset.
            first = max(0, center_pos - window_size)
            stop = min(sent_len, center_pos + window_size + 1)
            pairs.extend(
                (center_word, sent[ctx_pos])
                for ctx_pos in range(first, stop)
                if ctx_pos != center_pos
            )
    return pairs

# Build the (center, context) training pairs for the whole corpus, using the
# function's default window of 2 words on each side.
skip_gram_pairs = generate_skip_grams(tokenized_corpus)
```

---

### 🔹 4. One-Hot Encoding

```python
def one_hot_vector(word, vocab_size, word2idx):
    """Return the one-hot encoding of *word*.

    Args:
        word: Token to encode; it is looked up in ``word2idx``.
        vocab_size: Length of the returned vector.
        word2idx: Mapping from token to its position in the vector.

    Returns:
        A float array of ``vocab_size`` zeros with a single 1 at position
        ``word2idx[word]``.
    """
    encoded = np.zeros(vocab_size)
    position = word2idx[word]
    encoded[position] = 1
    return encoded
```

---

### 🔹 5. Build the Word2Vec Model (Forward + Backward Pass)

```python
# Model size: every word is represented by a vector of this many numbers.
embedding_dim = 10

# Both weight matrices start as uniform random samples from [0, 1).
W1 = np.random.random_sample((vocab_size, embedding_dim))  # Input -> Hidden (one row per word)
W2 = np.random.random_sample((embedding_dim, vocab_size))  # Hidden -> Output (one column per word)

def softmax(x):
    """Turn a vector of scores into probabilities that sum to 1.

    The largest score is subtracted before exponentiating so that big scores
    do not overflow ``np.exp``; the shift cancels out in the ratio.

    Args:
        x: Array of raw scores. Normalisation runs over all of its elements.

    Returns:
        An array of the same shape holding the normalised exponentials.
    """
    shifted = x - np.max(x)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores)

# Training
learning_rate = 0.01
epochs = 1000

# The word -> index lookup is identical in every epoch, so do it once up front
# instead of rebuilding two one-hot vectors per pair per epoch.
indexed_pairs = [(word2idx[center], word2idx[context])
                 for center, context in skip_gram_pairs]

for epoch in range(epochs):
    loss = 0  # summed cross-entropy over all pairs in this epoch
    for center_idx, context_idx in indexed_pairs:
        # Forward
        # Multiplying W1.T by a one-hot vector only selects one row of W1, so
        # read that row directly. Copy it because W1 is updated in place below.
        h = W1[center_idx].copy()
        u = np.dot(W2.T, h)          # one score per vocabulary word
        y_pred = softmax(u)

        # Loss (cross-entropy): negative log-probability of the true context word
        loss -= np.log(y_pred[context_idx])

        # Backprop
        # Prediction minus the one-hot target, without materialising the target.
        e = y_pred.copy()
        e[context_idx] -= 1.0
        dW2 = np.outer(h, e)
        dh = np.dot(W2, e)           # gradient for the center word's embedding

        # Update weights
        # Only the center word's row of W1 has a non-zero gradient, so only
        # that row is touched. Both gradients were computed before either update.
        W1[center_idx] -= learning_rate * dh
        W2 -= learning_rate * dW2

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")
```

---

### 🔹 6. Get Word Vectors

```python
def get_embedding(word):
    """Return the learned vector for *word*.

    The vector is the word's row of the input weight matrix ``W1``; the row
    number comes from the ``word2idx`` lookup table.
    """
    row = word2idx[word]
    return W1[row]

print("Vector for 'battery':", get_embedding("battery"))
```

---

### 🔹 7. Find Similar Words (Cosine Similarity)

```python
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors *v1* and *v2*.

    Args:
        v1: First vector.
        v2: Second vector, same length as *v1*.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        When either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of dividing by zero.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # Dividing would yield NaN, and NaN compares false against everything,
        # which would corrupt any ranking built from these scores.
        return 0.0
    return np.dot(v1, v2) / norm_product

def most_similar(word, top_n=3):
    """Rank the other vocabulary words by cosine similarity to *word*.

    Args:
        word: Query word; it is looked up via ``get_embedding``.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(other_word, similarity)`` tuples, most similar first.
        The query word itself is never included.
    """
    query_vec = get_embedding(word)
    scores = {
        candidate: cosine_similarity(query_vec, get_embedding(candidate))
        for candidate in vocab
        if candidate != word
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

print("Most similar to 'battery':", most_similar("battery"))
```

---

## ✅ Summary

| Step           | Description                          |
| -------------- | ------------------------------------ |
| Tokenize       | Clean and split your own data        |
| Generate Pairs | Use Skip-Gram technique              |
| Model          | Simple 1-hidden-layer NN using NumPy |
| Train          | Use forward and backward pass        |
| Use            | Get word vectors, find similar words |

---

Would you like this code in a downloadable Jupyter Notebook format?


Perfect! Let's now build a **Word2Vec model from scratch using an Artificial Neural Network (ANN)** — still using **only NumPy**, no libraries like `gensim` or `TensorFlow`. This ANN-based model is a **Skip-Gram Word2Vec**, where:

* **Input**: One-hot vector for center word
* **Hidden Layer**: Learns word embeddings
* **Output**: Predicts context words via softmax

---

## 🧠 Word2Vec Using ANN (from Scratch with NumPy)

---

### 🔹 Step 1: Dataset & Tokenization

```python
import numpy as np
import re

# Sample data (you can replace this with your own sentences)
# Each string is one training sentence; words are only paired with other
# words from the same sentence.
corpus = [
    "I love this phone",
    "This camera is amazing",
    "Battery life is great",
    "I love the camera and battery",
    "This phone is awful",
    "I hate this battery",
    "The camera is bad"
]

# Clean & tokenize
def tokenize(corpus):
    """Split every sentence of *corpus* into lower-cased word tokens.

    Args:
        corpus: Iterable of raw sentence strings.

    Returns:
        A list of token lists, one per sentence, in the original order.
        Punctuation and whitespace are discarded.
    """
    return [re.findall(r'\b\w+\b', sentence.lower()) for sentence in corpus]

# One list of lower-cased word tokens per sentence of the corpus.
tokenized_corpus = tokenize(corpus)
```

---

### 🔹 Step 2: Vocabulary & Encoding

```python
# Vocabulary: the unique words of the corpus in alphabetical order, so each
# word's index is the same on every run.
vocab = sorted({token for sentence in tokenized_corpus for token in sentence})
vocab_size = len(vocab)

# Two-way lookup tables between a word and its position in `vocab`.
word2idx = dict(zip(vocab, range(vocab_size)))
idx2word = dict(enumerate(vocab))
```

---

### 🔹 Step 3: Generate Skip-Gram Training Data

```python
def generate_training_data(tokenized, window_size=2):
    """Build (center_index, context_index) pairs for Skip-Gram training.

    Args:
        tokenized: List of sentences, each a list of word tokens.
        window_size: How many positions to the left and to the right of the
            center word count as its context.

    Returns:
        A list of ``(center, context)`` tuples of vocabulary indices taken
        from the module-level ``word2idx`` table. Pairs never cross a
        sentence boundary and a position is never paired with itself.
    """
    training_data = []
    for sentence in tokenized:
        n_tokens = len(sentence)
        for pos, token in enumerate(sentence):
            # Clamp the window to the sentence instead of testing every offset.
            lo = max(0, pos - window_size)
            hi = min(n_tokens, pos + window_size + 1)
            for neighbour_pos in range(lo, hi):
                if neighbour_pos == pos:
                    continue
                pair = (word2idx[token], word2idx[sentence[neighbour_pos]])
                training_data.append(pair)
    return training_data

# All (center index, context index) pairs for the corpus, using the
# function's default window of 2 words on each side.
training_pairs = generate_training_data(tokenized_corpus)
```

---

### 🔹 Step 4: One-Hot Encoding

```python
def one_hot(index, size):
    """Return a float array of *size* zeros with a single 1 at *index*.

    Args:
        index: Position to set to 1.
        size: Length of the returned vector.
    """
    encoded = np.zeros(size)
    encoded[index] = 1
    return encoded
```

---

### 🔹 Step 5: ANN Model (1 hidden layer)

```python
# Hyperparameters
embedding_dim = 10     # length of each word vector
learning_rate = 0.01   # step size of each gradient update
epochs = 1000          # full passes over the training pairs

# Weight matrices, both filled with uniform random samples from [0, 1).
W1 = np.random.random_sample((vocab_size, embedding_dim))  # input -> hidden (one row per word)
W2 = np.random.random_sample((embedding_dim, vocab_size))  # hidden -> output (one column per word)

def softmax(x):
    """Normalise a vector of scores into probabilities that sum to 1.

    Subtracting the largest score first keeps ``np.exp`` from overflowing on
    big inputs and does not change the result.

    Args:
        x: Array of raw scores. Normalisation runs over all of its elements.

    Returns:
        An array of the same shape holding the normalised exponentials.
    """
    peak = np.max(x)
    weights = np.exp(x - peak)
    total = weights.sum()
    return weights / total
```

---

### 🔹 Step 6: Training Loop (Forward + Backpropagation)

```python
for epoch in range(epochs):
    total_loss = 0  # summed cross-entropy over all pairs in this epoch
    for center_idx, context_idx in training_pairs:
        # Forward pass
        # Multiplying W1.T by a one-hot vector only selects one row of W1, so
        # read that row directly. Copy it because W1 is updated in place below.
        h = W1[center_idx].copy()        # hidden layer
        u = np.dot(W2.T, h)              # output layer: one score per word
        y_pred = softmax(u)

        # Loss (cross-entropy): negative log-probability of the true context word
        loss = -np.log(y_pred[context_idx])
        total_loss += loss

        # Backpropagation
        # Prediction minus the one-hot target, without materialising the target.
        e = y_pred.copy()
        e[context_idx] -= 1.0
        dW2 = np.outer(h, e)
        dh = np.dot(W2, e)               # gradient for the center word's embedding

        # Update weights
        # Only the center word's row of W1 has a non-zero gradient, so only
        # that row is touched. Both gradients were computed before either update.
        W1[center_idx] -= learning_rate * dh
        W2 -= learning_rate * dW2

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")
```

---

### 🔹 Step 7: Word Embeddings & Similarity

```python
def get_embedding(word):
    """Return the learned vector for *word*.

    The vector is the word's row of the input weight matrix ``W1``; the row
    number comes from the ``word2idx`` lookup table.
    """
    row_index = word2idx[word]
    return W1[row_index]

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors *v1* and *v2*.

    Args:
        v1: First vector.
        v2: Second vector, same length as *v1*.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        When either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of dividing by zero.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Dividing would yield NaN, and NaN compares false against everything,
        # which would corrupt any ranking built from these scores.
        return 0.0
    return np.dot(v1, v2) / denominator

def most_similar(word, top_n=3):
    """Return the *top_n* vocabulary words closest to *word*.

    Closeness is the cosine similarity between embedding vectors.

    Args:
        word: Query word; it is looked up via ``get_embedding``.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(other_word, similarity)`` tuples sorted from most to
        least similar. The query word itself is excluded.
    """
    anchor = get_embedding(word)
    sims = {}
    for other in vocab:
        if other != word:
            sims[other] = cosine_similarity(anchor, get_embedding(other))
    ranking = sorted(sims.items(), key=lambda kv: kv[1], reverse=True)
    return ranking[:top_n]

print("Most similar to 'battery':", most_similar("battery"))
```

---

## ✅ Summary

| Component | Description                          |
| --------- | ------------------------------------ |
| Input     | One-hot encoded center word          |
| Model     | ANN with one hidden layer            |
| Output    | Softmax over context word prediction |
| Embedding | Learned in weights `W1`              |
| Use       | Get vector, find similar words       |

---

Would you like this in a Jupyter Notebook or with plots for embedding visualization (e.g., t-SNE)?
