# Counter
In this activity, you will create a function that preprocesses and outputs a list of the most common words in a corpus.

In [10]:
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from collections import Counter

In [2]:
lemmatizer = WordNetLemmatizer()

In [7]:
# Corpus - list of articles about grains
ids = reuters.fileids(categories='grain')
corpus = [reuters.raw(i) for i in ids]

In [14]:
# Define preprocess function
def process_text(doc):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return output

In [30]:
# Define the counter function
def word_counter(corpus): 
    # Combine all articles in corpus into one large string
    big_string = ' '.join(corpus)
    processed = process_text(big_string)
    top_10 = dict(Counter(processed).most_common(10))
    return pd.DataFrame(list(top_10.items()), columns=['word', 'count'])

In [31]:
word_counter(corpus)

Unnamed: 0,word,count
0,said,1803
1,tonne,1224
2,mln,1080
3,wheat,998
4,us,842
5,grain,704
6,corn,579
7,year,549
8,pct,472
9,export,443


In [32]:
def bigram_counter(corpus): 
    # Combine all articles in corpus into one large string
    big_string = ' '.join(corpus)
    processed = process_text(big_string)
    bigrams = ngrams(processed, n=2)
    top_10 = dict(Counter(bigrams).most_common(10))
    return pd.DataFrame(list(top_10.items()), columns=['bigram', 'count'])

In [33]:
bigram_counter(corpus)

Unnamed: 0,bigram,count
0,"(mln, tonne)",482
1,"(nil, nil)",204
2,"(us, agriculture)",161
3,"(department, said)",147
4,"(agriculture, department)",140
5,"(source, said)",129
6,"(soviet, union)",119
7,"(official, said)",117
8,"(mln, dlrs)",115
9,"(last, year)",111
