# N-Gram Counter

In [1]:
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources this notebook relies on: the Reuters corpus,
# the English stopword list, the Punkt tokenizer models and WordNet.
# Packages that are already installed are reported as up to date, so
# re-running this cell is cheap.
# NOTE(review): newer NLTK releases may also need 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\nlpza\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nlpza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nlpza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nlpza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Single shared lemmatizer instance; process_text() reads this module-level
# name, so this cell must run before process_text() is called.
lemmatizer = WordNetLemmatizer()

In [3]:
# Load the raw text of the third file (index 2) listed under the Reuters
# 'cpi' category and print it for inspection.
article = reuters.raw(reuters.fileids(categories='cpi')[2])
print(article)

HUNGARY RAISES PRICES IN EFFORT TO CURB DEFICIT
  Hungary has announced sharp price
  increases for a range of food and consumer products as part of
  its efforts to curb a soaring budget deficit.
      The official MTI news agency said the government decided
  consumer price subsidies had to be cut to reduce state
  spending. From today the price of meat will rise by an average
  18 pct and that of beer and spirits by 10 pct, MTI said.
      MTI said consumer goods will also become more expensive,
  with the price of refrigerators rising some five pct. It also
  announced a number of measures to ease hardship, including
  higher pensions and family allowances.
      Statistics indicate the budget deficit tripled in 1986 to
  47 billion forints. Central banker Janos Fekete has said the
  Finance Ministry is trying to cut the 1987 shortfall to between
  30 and 35 billion from a planned 43.8 billion.
      A major tax reform, including the introduction of a
  Western-style valued added t

In [4]:
def process_text(doc):
    """Clean, tokenize and lemmatize ``doc`` into a list of lowercase words.

    Steps, in order:
      1. Delete every character that is neither an ASCII letter nor
         whitespace. Digits and punctuation disappear without leaving a
         gap, so "Western-style" becomes "Westernstyle".
      2. Tokenize the cleaned text with ``word_tokenize``.
      3. Lemmatize each token with the module-level ``lemmatizer``.
      4. Lowercase each lemma and drop English stopwords.

    Parameters
    ----------
    doc : str
        Raw text to process.

    Returns
    -------
    list of str
        Lowercased, lemmatized tokens in document order, stopwords removed.
        An empty string yields an empty list.
    """
    sw = set(stopwords.words('english'))
    # Whitespace (\s) is preserved rather than only the literal space, so
    # words separated solely by a newline or tab are not fused together.
    non_letters = re.compile(r"[^a-zA-Z\s]")
    # Clean the text that was passed in, not the notebook-global `article`.
    cleaned = non_letters.sub('', doc)
    tokens = word_tokenize(cleaned)
    # NOTE(review): lemmatization runs before lowercasing, and upper-case
    # tokens appear to pass through unlemmatized (the headline's "PRICES"
    # ends up as 'prices', not 'price') -- confirm whether that is intended.
    lowered = (lemmatizer.lemmatize(token).lower() for token in tokens)
    return [word for word in lowered if word not in sw]

In [5]:
# Run the cleaning pipeline on the sample article and print the resulting
# token list.
processed = process_text(article)
print(processed)

['hungary', 'raises', 'prices', 'effort', 'curb', 'deficit', 'hungary', 'ha', 'announced', 'sharp', 'price', 'increase', 'range', 'food', 'consumer', 'product', 'part', 'effort', 'curb', 'soaring', 'budget', 'deficit', 'official', 'mti', 'news', 'agency', 'said', 'government', 'decided', 'consumer', 'price', 'subsidy', 'cut', 'reduce', 'state', 'spending', 'today', 'price', 'meat', 'rise', 'average', 'pct', 'beer', 'spirit', 'pct', 'mti', 'said', 'mti', 'said', 'consumer', 'good', 'also', 'become', 'expensive', 'price', 'refrigerator', 'rising', 'five', 'pct', 'also', 'announced', 'number', 'measure', 'ease', 'hardship', 'including', 'higher', 'pension', 'family', 'allowance', 'statistics', 'indicate', 'budget', 'deficit', 'tripled', 'billion', 'forint', 'central', 'banker', 'janos', 'fekete', 'ha', 'said', 'finance', 'ministry', 'trying', 'cut', 'shortfall', 'billion', 'planned', 'billion', 'major', 'tax', 'reform', 'including', 'introduction', 'westernstyle', 'valued', 'added', 'tax'

## Frequency Analysis: Word Counts

In [6]:
from collections import Counter

In [7]:
# Count how many times each processed token occurs; printed as a plain dict
# mapping token -> count.
word_counts = Counter(processed)
print(dict(word_counts))

{'hungary': 2, 'raises': 1, 'prices': 1, 'effort': 3, 'curb': 2, 'deficit': 4, 'ha': 2, 'announced': 2, 'sharp': 2, 'price': 5, 'increase': 1, 'range': 1, 'food': 1, 'consumer': 3, 'product': 1, 'part': 1, 'soaring': 1, 'budget': 2, 'official': 1, 'mti': 3, 'news': 1, 'agency': 1, 'said': 7, 'government': 1, 'decided': 1, 'subsidy': 1, 'cut': 2, 'reduce': 1, 'state': 3, 'spending': 2, 'today': 1, 'meat': 1, 'rise': 2, 'average': 1, 'pct': 3, 'beer': 1, 'spirit': 1, 'good': 2, 'also': 4, 'become': 1, 'expensive': 2, 'refrigerator': 1, 'rising': 1, 'five': 1, 'number': 1, 'measure': 2, 'ease': 1, 'hardship': 1, 'including': 2, 'higher': 1, 'pension': 1, 'family': 1, 'allowance': 1, 'statistics': 1, 'indicate': 1, 'tripled': 1, 'billion': 3, 'forint': 1, 'central': 1, 'banker': 1, 'janos': 1, 'fekete': 1, 'finance': 1, 'ministry': 1, 'trying': 1, 'shortfall': 1, 'planned': 2, 'major': 1, 'tax': 2, 'reform': 1, 'introduction': 1, 'westernstyle': 1, 'valued': 1, 'added': 1, 'january': 1, 'c

In [8]:
# Show the 20 most frequent tokens. most_common() returns (token, count)
# pairs ordered from most to least frequent, and dict() keeps that order.
print(dict(word_counts.most_common(20)))

{'said': 7, 'price': 5, 'deficit': 4, 'also': 4, 'effort': 3, 'consumer': 3, 'mti': 3, 'state': 3, 'pct': 3, 'billion': 3, 'diplomat': 3, 'hungary': 2, 'curb': 2, 'ha': 2, 'announced': 2, 'sharp': 2, 'budget': 2, 'cut': 2, 'spending': 2, 'rise': 2}


## Frequency Analysis: N-gram Counts

In [9]:
# Count bigrams: ngrams(processed, n=2) yields each pair of adjacent tokens
# as a tuple. Pairs are built from the already-filtered token list, so a
# bigram may bridge words that were separated by removed stopwords
# (e.g. "PRICES IN EFFORT" gives ('prices', 'effort')).
bigram_counts = Counter(ngrams(processed, n=2))
print(dict(bigram_counts))

{('hungary', 'raises'): 1, ('raises', 'prices'): 1, ('prices', 'effort'): 1, ('effort', 'curb'): 2, ('curb', 'deficit'): 1, ('deficit', 'hungary'): 1, ('hungary', 'ha'): 1, ('ha', 'announced'): 1, ('announced', 'sharp'): 1, ('sharp', 'price'): 2, ('price', 'increase'): 1, ('increase', 'range'): 1, ('range', 'food'): 1, ('food', 'consumer'): 1, ('consumer', 'product'): 1, ('product', 'part'): 1, ('part', 'effort'): 1, ('curb', 'soaring'): 1, ('soaring', 'budget'): 1, ('budget', 'deficit'): 2, ('deficit', 'official'): 1, ('official', 'mti'): 1, ('mti', 'news'): 1, ('news', 'agency'): 1, ('agency', 'said'): 1, ('said', 'government'): 1, ('government', 'decided'): 1, ('decided', 'consumer'): 1, ('consumer', 'price'): 1, ('price', 'subsidy'): 1, ('subsidy', 'cut'): 1, ('cut', 'reduce'): 1, ('reduce', 'state'): 1, ('state', 'spending'): 2, ('spending', 'today'): 1, ('today', 'price'): 1, ('price', 'meat'): 1, ('meat', 'rise'): 1, ('rise', 'average'): 1, ('average', 'pct'): 1, ('pct', 'beer')

In [10]:
# Show the 10 most frequent bigrams, most frequent first.
print(dict(bigram_counts.most_common(10)))

{('effort', 'curb'): 2, ('sharp', 'price'): 2, ('budget', 'deficit'): 2, ('state', 'spending'): 2, ('mti', 'said'): 2, ('diplomat', 'said'): 2, ('hungary', 'raises'): 1, ('raises', 'prices'): 1, ('prices', 'effort'): 1, ('curb', 'deficit'): 1}
