## This is a Jupyter notebook with examples of using NLP for basic text mining.

In [1]:
# Import all libraries and functions

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

engstopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Utility function to convert pos_tag part-of-speech tags 
    to WordNet tags for use with lemmatizer. 
    """
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None # for easy if-statement 

def tokenize(text):
    """Takes raw text, tokenizes it, removes stopwords 
    and words that are one-character long (like 'a' and punctuation.
    Returns a list of tokens.
    """
    
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [word.lower() for word in tokens if len(word) > 1]
    tokens = [word for word in tokens if word not in engstopwords]
    return tokens

def get_pos(tokens):
    """Takes a token list and gets the part-of-speech for each.
    Returns a list of tuples ('word', 'POS')
    """
    
    tagged = nltk.pos_tag(tokens)
    return tagged
  
def lemmatize(tagged_words):
    """Lemmatizes words. That is, it normalizes words to their most
    basic form. For example, 'is', 'am' and 'are' are merged into 'be'.
    
    Returns a list of lemmarized tokens.
    
    Lemmatization is confined to parts of speech. Verb variations are lemmatized
    to the root verb, and same with nouns.
    
    Example: 'continues' and 'continuing' are merged to 'continue'
    but 'continuation' stays the same.
    """
    
    lemma_list = []
    for word, tag in tagged_words:
        wntag = get_wordnet_pos(tag)
        if wntag is None:
            lemma = lemmatizer.lemmatize(word) 
        else:
            lemma = lemmatizer.lemmatize(word, pos=wntag)
        lemma_list.append(lemma)
    return lemma_list
  
def get_distfreq(tokens, top_n):
    """ Finds the most common words in a token list.
    Returns a list of tuples ('word', frequency)
    """
    
    fdist = nltk.FreqDist(tokens)
    return fdist.most_common(top_n)

In [2]:
# Load raw text file. This is the campaign platform for a municipal
# party gunning for the Montreal mayorship
# It has been converted to simple text from PDF using CometDocs.

with open('A_STRONG_PLAN_FOR_A_BOLD_CITY.txt', encoding='utf-8') as f1:
    coderre = f1.read()

In [5]:
# Tokenize Coderre text and print first 40 tokens

coderre_tokens = tokenize(coderre)
coderre_tokens[:40]

['\ufeffcontinuing',
 'progress',
 'together',
 'strong',
 'plan',
 'sustainable',
 'city',
 'electoral',
 'platform',
 '2017-2021',
 'dear',
 'friends',
 'last',
 'four',
 'years',
 'proven',
 'anything',
 'it’s',
 'working',
 'together',
 'nothing',
 'impossible',
 'since',
 'arrival',
 'city',
 'hall',
 'many',
 'achievements',
 'result',
 'strong',
 'commitment',
 'elected',
 'officials',
 'teamed',
 'community',
 'leaders',
 'montréal',
 'civil',
 'society',
 'everyone']

In [7]:
# Get the part of speech for each token

coderre_pos = get_pos(coderre_tokens)
coderre_pos[:20]

[('\ufeffcontinuing', 'VBG'),
 ('progress', 'NN'),
 ('together', 'RB'),
 ('strong', 'JJ'),
 ('plan', 'NN'),
 ('sustainable', 'JJ'),
 ('city', 'NN'),
 ('electoral', 'JJ'),
 ('platform', 'NN'),
 ('2017-2021', 'JJ'),
 ('dear', 'JJ'),
 ('friends', 'NNS'),
 ('last', 'JJ'),
 ('four', 'CD'),
 ('years', 'NNS'),
 ('proven', 'RB'),
 ('anything', 'NN'),
 ('it’s', 'JJ'),
 ('working', 'VBG'),
 ('together', 'RB')]

In [9]:
# Lemmatize the tokens using the wordnet POS converter function and get 10 most common words

coderre_lemma = lemmatize(coderre_pos)
get_distfreq(coderre_lemma, 10)

[('montréal', 84),
 ('city', 75),
 ('continue', 53),
 ('together', 43),
 ('development', 36),
 ('new', 33),
 ('project', 28),
 ('develop', 28),
 ('work', 27),
 ('social', 26)]