In [3]:
import nltk, ssl, os
from nltk.corpus import brown, inaugural, reuters, udhr
from nltk.probability import ConditionalFreqDist
from nltk.tag import UnigramTagger
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer

print("\nBrown categories:", brown.categories()[:10])
print("Brown sample words:", brown.words()[:15])
print("Brown sample sentence:", brown.sents()[0])

print("\nInaugural files:", inaugural.fileids()[:3])
print("Inaugural 2009 sample:", inaugural.words('2009-Obama.txt')[:15])

print("\nReuters categories:", reuters.categories()[:8])
print("Reuters sample words:", reuters.words('training/9865')[:10])

print("\nUDHR languages:", udhr.fileids()[:5])
print("UDHR English sample:", udhr.words('English-Latin1')[:12])

print("\n===== CUSTOM CORPUS =====")

root = "mycorpus"
os.makedirs(root + "/sports", exist_ok=True)
os.makedirs(root + "/tech", exist_ok=True)

with open(root + "/sports/s1.txt", "w") as f:
    f.write("The team won the match. Players performed well.")
with open(root + "/tech/t1.txt", "w") as f:
    f.write("Artificial intelligence drives modern innovation.")

tokenizer = WordPunctTokenizer()
mycorpus = PlaintextCorpusReader(root, r".*\.txt", word_tokenizer=tokenizer)

print("Corpus files:", mycorpus.fileids())

# Conditional Frequency Distribution
cfd = ConditionalFreqDist(
    (file.split('/')[0], word.lower())
    for file in mycorpus.fileids()
    for word in mycorpus.words(file)
)

print("\nSports freq:", cfd["sports"].most_common(5))
print("Tech freq:", cfd["tech"].most_common(5))

print("\n===== TAGGED CORPUS =====")

sample = mycorpus.words()[:30]
tagged_words = nltk.pos_tag(sample, lang="eng")
print("Tagged words:", tagged_words)

tagged_sents = [nltk.pos_tag(sent, lang="eng") for sent in mycorpus.sents()]
print("Tagged sentence sample:", tagged_sents[0])

# Most frequent noun tags
noun_tags = ["NN", "NNS", "NNP", "NNPS"]
noun_freq = nltk.FreqDist(tag for (_, tag) in tagged_words if tag in noun_tags)

print("\nMost frequent nouns:", noun_freq.most_common())

print("\n===== TAGGERS =====")

# Rule-based Tagger
default_tagger = nltk.DefaultTagger("NN")
print("Rule-based:", default_tagger.tag(["This", "is", "a", "test"]))

# Unigram Tagger
train_size = int(len(tagged_sents) * 0.8)
train_sents = tagged_sents[:train_size]
test_sents = tagged_sents[train_size:]

unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)

if test_sents:
    print("Unigram accuracy:", unigram_tagger.evaluate(test_sents))

print("Unigram tagging example:",
      unigram_tagger.tag("AI is changing the world".split()))



Brown categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery']
Brown sample words: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced']
Brown sample sentence: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']

Inaugural files: ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt']
Inaugural 2009 sample: ['My', 'fellow', 'citizens', ':', 'I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task', 'before', 'us', ',']

Reuters categories: ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut']
Reuters sample words: ['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', 'BIDS', 'DETAILED', 'French', 'operator

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Unigram accuracy:", unigram_tagger.evaluate(test_sents))
