In [1]:
# ======================================
# ✅ WORKING SOLUTION: spaCy-only Tokenization (No torchtext needed)
# ======================================

# STEP 1: INSTALL ONLY SPACY (avoid torchtext dependency conflicts)
!pip install -U spacy -q
!python -m spacy download en_core_web_sm

# 🔁 If the import or model load below fails right after installation, restart the runtime and re-run this cell

# ======================================
# MAIN SOLUTION: Using spaCy directly
# ======================================

import spacy

# Example sentences that the rest of the cell tokenizes, pads and indexes.
lines = [
    "IBM taught me tokenization",
    "Special tokenizers are ready and they will blow your mind",
]

# English pipeline used for tokenization; the model package must already be
# installed (see the download step at the top of the cell).
nlp = spacy.load("en_core_web_sm")

# Tokenize using spaCy directly
# nlp.make_doc() runs only the tokenizer. Calling nlp(line) would also execute
# every other pipeline component, which is wasted work here because only
# token.text is read from the resulting Doc.
tokens_alt = []
max_length_alt = 0

for line in lines:
    doc = nlp.make_doc(line)
    # Wrap each sentence in begin-/end-of-sentence markers.
    tokenized_line = ['<bos>', *(token.text for token in doc), '<eos>']
    tokens_alt.append(tokenized_line)
    # Track the longest wrapped sentence; it is the target length for padding.
    max_length_alt = max(max_length_alt, len(tokenized_line))

# Right-pad every sentence in place with '<pad>' until all of them are
# max_length_alt tokens long. Sentences already at that length get an empty
# extension and are left unchanged.
for sentence in tokens_alt:
    missing = max_length_alt - len(sentence)
    sentence.extend(['<pad>'] * missing)

# Show the padded result, one sentence per line.
print("\n📌 Alternative: Padded Tokenized Sentences (spaCy only):\n")
for sentence in tokens_alt:
    print(sentence)

# Build the token -> index vocabulary by hand.
from collections import Counter

# Flatten the padded sentences into one token stream and count occurrences.
all_tokens = []
for padded_sentence in tokens_alt:
    all_tokens.extend(padded_sentence)
token_counts = Counter(all_tokens)

# The four special symbols own indices 0-3. Every other token receives the
# next free index, in order of first appearance (Counter preserves insertion
# order).
vocab_alt = {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3}
for token in token_counts:
    # setdefault leaves an existing entry untouched, so the special symbols
    # keep their reserved indices.
    vocab_alt.setdefault(token, len(vocab_alt))

print("\n📘 Alternative Vocabulary (token → index):\n")
for token, idx in vocab_alt.items():
    print(f"{idx}: {token}")

# Convert to indices
# Tokens missing from the vocabulary map to the reserved '<unk>' index instead
# of raising KeyError. With the vocabulary built from these same sentences the
# result is unchanged, but the lookup now also works for unseen text.
unk_index = vocab_alt['<unk>']
indexed_sentences_alt = [
    [vocab_alt.get(token, unk_index) for token in sentence]
    for sentence in tokens_alt
]

print("\n🔢 Alternative: Tokenized Sentences as Indices:\n")
# start=1 gives human-friendly sentence numbers without manual arithmetic.
for sentence_number, indices in enumerate(indexed_sentences_alt, start=1):
    print(f"Sentence {sentence_number}: {indices}")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

📌 Alternative: Padded Tokenized Sentences (spaCy only):

['<bos>', 'IBM', 'taught', 'me', 'tokenization', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'Special', 'tokenizers', 'are', 'ready', 'and', 'they', 'will', 'blow', 'your', 'mind', '<eos>']

📘 Alternative Vocabulary (token → index):

0: <unk>
1: <p