In [6]:
import nltk
from nltk.corpus import gutenberg, brown, reuters, movie_reviews, stopwords
from collections import Counter

In [7]:
# Download required data: the five corpora explored below, plus the Punkt
# sentence-tokenizer models that nltk.word_tokenize() relies on in the
# quick-analysis cell. Older NLTK releases look up 'punkt'; recent releases
# look up 'punkt_tab' instead, so both are requested to keep
# "Restart Kernel -> Run All" working on a fresh environment.
nltk.download(
    ['gutenberg', 'brown', 'reuters', 'movie_reviews', 'stopwords',
     'punkt', 'punkt_tab'],
    quiet=True,
)

True

In [9]:
def explore_nltk_corpora():
    """Print a short summary of five NLTK corpora to stdout.

    Sections, in order:

    1. Gutenberg     - number of texts and the first three file ids.
    2. Brown         - number of categories and the first five of them.
    3. Reuters       - number of documents and the three most frequent
                       categories. Each document contributes one count to
                       every category it is tagged with.
    4. Movie reviews - the category labels and the number of reviews in each.
    5. Stopwords     - number of language lists and the first ten English
                       stopwords.

    The corpora are read through the module-level ``nltk.corpus`` readers
    (``gutenberg``, ``brown``, ``reuters``, ``movie_reviews``, ``stopwords``),
    so their data must already be available locally (see the download cell).

    Returns:
        None. All results are printed.
    """
    print("NLTK Corpora Quick Explorer")
    print("=" * 40)

    # Gutenberg Corpus
    gutenberg_files = gutenberg.fileids()
    print(f"\n1. GUTENBERG: {len(gutenberg_files)} classic texts")
    print(f"   Sample: {gutenberg_files[:3]}")

    # Brown Corpus
    brown_categories = brown.categories()
    print(f"\n2. BROWN: {len(brown_categories)} categories")
    print(f"   Categories: {brown_categories[:5]}...")

    # Reuters Corpus
    reuters_files = reuters.fileids()
    # The tally is built outside the f-string: a replacement field that spans
    # several lines inside a single-quoted f-string only parses on
    # Python 3.12+ (PEP 701) and is a SyntaxError on earlier versions.
    category_counts = Counter(
        cat
        for doc in reuters_files
        for cat in reuters.categories(doc)
    )
    print(f"\n3. REUTERS: {len(reuters_files)} documents")
    print(f"   Top categories: {category_counts.most_common(3)}")

    # Movie Reviews
    review_categories = movie_reviews.categories()
    print(f"\n4. MOVIE REVIEWS: {review_categories}")
    for cat in review_categories:
        print(f"   {cat}: {len(movie_reviews.fileids(cat))} reviews")

    # Stopwords
    print(f"\n5. STOPWORDS: {len(stopwords.fileids())} languages")
    print(f"   English stopwords (first 10): {stopwords.words('english')[:10]}")

In [10]:
# Run the explorer
explore_nltk_corpora()

# Quick analysis example: tokenize the opening of Austen's "Emma" and report
# the most frequent tokens.
print("\nQuick Analysis Example:")
print("=" * 25)
sample_text = gutenberg.raw('austen-emma.txt')[:500]  # First 500 chars
# nltk.word_tokenize needs the Punkt tokenizer data ('punkt' / 'punkt_tab').
# The tokens are lower-cased and still include punctuation marks, so the
# "word" figures below are really token counts.
words = nltk.word_tokenize(sample_text.lower())
token_counts = Counter(words)
print(f"Sample text length: {len(sample_text)} characters")
print(f"Word count: {len(words)}")
print(f"Most common words: {token_counts.most_common(5)}")

NLTK Corpora Quick Explorer
========================================

1. GUTENBERG: 18 classic texts
   Sample: ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt']

2. BROWN: 15 categories
   Categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government']...

3. REUTERS: 10788 documents
   Top categories: [('earn', 3964), ('acq', 2369), ('money-fx', 717)]

4. MOVIE REVIEWS: ['neg', 'pos']
   neg: 1000 reviews
   pos: 1000 reviews

5. STOPWORDS: 32 languages
   English stopwords (first 10): ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

Quick Analysis Example:
=========================
Sample text length: 500 characters
Word count: 101
Most common words: [(',', 8), ('of', 6), ('and', 4), ('the', 4), ('a', 3)]
