# Working with Brown Corpus

In [1]:
import nltk
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Categories

In [2]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

# Tokens

In [4]:
import pandas as pd
from google.colab import files
uploaded=files.upload()
data = pd.read_csv('brown.csv')
for i in range(len('tokenized_text')):
  words = nltk.word_tokenize(data['tokenized_text'][i])
  print(words)

Saving brown.csv to brown.csv
['Furthermore', ',', 'as', 'an', 'encouragement', 'to', 'revisionist', 'thinking', ',', 'it', 'manifestly', 'is', 'fair', 'to', 'admit', 'that', 'any', 'fraternity', 'has', 'a', 'constitutional', 'right', 'to', 'refuse', 'to', 'accept', 'persons', 'it', 'dislikes', '.']
['The', 'Unitarian', 'clergy', 'were', 'an', 'exclusive', 'club', 'of', 'cultivated', 'gentlemen', '--', 'as', 'the', 'term', 'was', 'then', 'understood', 'in', 'the', 'Back', 'Bay', '--', 'and', 'Parker', 'was', 'definitely', 'not', 'a', 'gentleman', ',', 'either', 'in', 'theology', 'or', 'in', 'manners', '.']
['Ezra', 'Stiles', 'Gannett', ',', 'an', 'honorable', 'representative', 'of', 'the', 'sanhedrin', ',', 'addressed', 'himself', 'frankly', 'to', 'the', 'issue', 'in', '1845', ',', 'insisting', 'that', 'Parker', 'should', 'not', 'be', 'persecuted', 'or', 'calumniated', 'and', 'that', 'in', 'this', 'republic', 'no', 'power', 'to', 'restrain', 'him', 'by', 'force', 'could', 'exist', '.']

# Size

In [5]:
data.count().sum()

401380

# Size of word tokens

In [6]:
data['tokenized_text'].count()

57340

# Word types

In [7]:
from nltk import pos_tag
pos_words = data['tokenized_text'].str.split().map(pos_tag)
pos_words.head()

0    [(Furthermore, RB), (,, ,), (as, IN), (an, DT)...
1    [(The, DT), (Unitarian, JJ), (clergy, NN), (we...
2    [(Ezra, NNP), (Stiles, NNP), (Gannett, NNP), (...
3    [(Even, RB), (so, RB), (,, ,), (Gannett, NNP),...
4    [(We, PRP), (today, NN), (are, VBP), (not, RB)...
Name: tokenized_text, dtype: object

# Size of category 'government'

In [8]:
size_of_cat_gov = 0
for i in range(len(data['label'])):
  if data['label'][i] == 'government':
    size_of_cat_gov += 1
print(size_of_cat_gov)

3032


# Frequency of tokens

In [9]:
frequency = {}
for i in range(len('tokenized_text')):
  words = nltk.word_tokenize(data['tokenized_text'][i])
# iterating over the list
  for item in words:
    # checking the element in dictionary
    if item in frequency:
        # incrementing the counr
        frequency[item] += 1
    else:
        # initializing the count
        frequency[item] = 1

# printing the frequency
print(frequency)

{'Furthermore': 1, ',': 29, 'as': 6, 'an': 5, 'encouragement': 1, 'to': 21, 'revisionist': 1, 'thinking': 1, 'it': 10, 'manifestly': 1, 'is': 5, 'fair': 1, 'admit': 1, 'that': 11, 'any': 1, 'fraternity': 2, 'has': 1, 'a': 8, 'constitutional': 1, 'right': 1, 'refuse': 1, 'accept': 1, 'persons': 1, 'dislikes': 1, '.': 12, 'The': 2, 'Unitarian': 1, 'clergy': 1, 'were': 1, 'exclusive': 1, 'club': 1, 'of': 13, 'cultivated': 1, 'gentlemen': 1, '--': 7, 'the': 30, 'term': 1, 'was': 2, 'then': 2, 'understood': 1, 'in': 9, 'Back': 1, 'Bay': 1, 'and': 11, 'Parker': 14, 'definitely': 1, 'not': 5, 'gentleman': 1, 'either': 1, 'theology': 1, 'or': 3, 'manners': 1, 'Ezra': 1, 'Stiles': 1, 'Gannett': 4, 'honorable': 1, 'representative': 1, 'sanhedrin': 1, 'addressed': 1, 'himself': 1, 'frankly': 1, 'issue': 1, '1845': 1, 'insisting': 1, 'should': 3, 'be': 3, 'persecuted': 1, 'calumniated': 1, 'this': 3, 'republic': 1, 'no': 2, 'power': 1, 'restrain': 1, 'him': 2, 'by': 3, 'force': 1, 'could': 3, 'exi

# Most frequent token

In [10]:
print(max(frequency))

would


# Number of sentences

In [11]:
data['tokenized_text'].count()

57340

# Exploring other NLTK Corpora

## Plaintext corpora

In [12]:
 nltk.download('abc')
 nltk.download('genesis')
 nltk.download('gutenberg')
 nltk.download('inaugural')
 nltk.download('state_union')
 nltk.download('webtext')

print(nltk.corpus.abc.words())
print(nltk.corpus.genesis.words())
print(nltk.corpus.gutenberg.words(fileids='austen-emma.txt'))
print(nltk.corpus.inaugural.words())
print(nltk.corpus.state_union.words())
print(nltk.corpus.webtext.words())

[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Unzipping corpora/abc.zip.
[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.
[nltk_data] Downloading package webtext to /root/nltk_data...


['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', ...]
['In', 'the', 'beginning', 'God', 'created', 'the', ...]
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]
['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ...]
['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", 'S', ...]
['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...]


[nltk_data]   Unzipping corpora/webtext.zip.


## Tagged corpora

In [None]:
from nltk.corpus import brown
print(brown.words())
print(brown.tagged_words())
print(brown.sents())
print(brown.tagged_sents())
print(brown.paras(categories='reviews'))
print(brown.tagged_paras(categories='reviews'))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), (

## Chunked corpora

In [13]:
 nltk.download('conll2000')
 nltk.download('conll2002')

from nltk.corpus import conll2000, conll2002
print(conll2000.sents())
for tree in conll2000.chunked_sents()[:2]:
  print(tree)
print(conll2002.sents())
for tree in conll2002.chunked_sents()[:2]:
  print(tree)

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package conll2002 to /root/nltk_data...


[['Confidence', 'in', 'the', 'pound', 'is', 'widely', 'expected', 'to', 'take', 'another', 'sharp', 'dive', 'if', 'trade', 'figures', 'for', 'September', ',', 'due', 'for', 'release', 'tomorrow', ',', 'fail', 'to', 'show', 'a', 'substantial', 'improvement', 'from', 'July', 'and', 'August', "'s", 'near-record', 'deficits', '.'], ['Chancellor', 'of', 'the', 'Exchequer', 'Nigel', 'Lawson', "'s", 'restated', 'commitment', 'to', 'a', 'firm', 'monetary', 'policy', 'has', 'helped', 'to', 'prevent', 'a', 'freefall', 'in', 'sterling', 'over', 'the', 'past', 'week', '.'], ...]
(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near

[nltk_data]   Unzipping corpora/conll2002.zip.
