In [29]:
import nltk
import json, os
import random
from nltk.corpus import brown

# Fetch the Brown corpus and the universal tagset mapping
nltk.download('brown')
nltk.download('universal_tagset')

# Get POS-tagged sentences with the 'universal' tagset.
# Copy them into a plain list so the shuffle below can reorder them in place.
tagged_sentences = list(brown.tagged_sents(tagset='universal'))

print(f"{'Number of sentences: ':>25}", len(tagged_sentences))
print(f"{'Example random sentence: ':>25}", random.choice(tagged_sentences))

# Shuffle data for randomness.
# random.shuffle works in place and returns None, so it must be given the
# list that is sliced below; shuffling a throwaway `list(...)` copy would
# leave the splits in original corpus order.
random.shuffle(tagged_sentences)

# Split sizes: 85% train, 10% validation, remainder (~5%) test
n_total = len(tagged_sentences)
n_train = int(0.85 * n_total)
n_val = int(0.10 * n_total)
n_test = n_total - n_train - n_val  # to cover rounding

# Contiguous, non-overlapping slices of the shuffled list
train_set = tagged_sentences[:n_train]
val_set = tagged_sentences[n_train:n_train+n_val]
test_set = tagged_sentences[n_train+n_val:]

print(f"Train size: {len(train_set)}, Val size: {len(val_set)}, Test size: {len(test_set)}")

# Convert to serializable format
def convert(data):
    """Turn tagged sentences into a JSON-serializable nested structure.

    Each sentence in *data* is an iterable of (word, tag) pairs. The result
    is a list with one entry per sentence, where every entry is a list of
    single-key dicts mapping the word to its tag, in the original order.
    """
    serializable = []
    for sentence in data:
        pairs = []
        for word, tag in sentence:
            pairs.append({word: tag})
        serializable.append(pairs)
    return serializable

# Map each output filename to its converted split, then write them out
splits = {}
splits["train-data.json"] = convert(train_set)
splits["val-data.json"] = convert(val_set)
splits["test-data.json"] = convert(test_set)

for path in splits:
    data = splits[path]
    with open(path, "w") as f:
        # Two-space indentation keeps the files human-readable
        f.write(json.dumps(data, indent=2))
    print(f"Saved {path}")

[nltk_data] Downloading package brown to /home/sidsr/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/sidsr/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


    Number of sentences:  57340
Example random sentence:  [('Then', 'ADV'), ('Mel', 'NOUN'), ('Chandler', 'NOUN'), ('started', 'VERB'), ('up', 'ADP'), ('the', 'DET'), ('hill', 'NOUN'), ('.', '.')]
Train size: 48739, Val size: 5734, Test size: 2867
Saved train-data.json
Saved val-data.json
Saved test-data.json
