In [3]:
import nltk
import re
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from py_thesaurus import WordAnalyzer
import pandas as pd

In [4]:
def initialize():
    '''
    Load the reference word list and return it as an nltk.Text.

    Reads 'dictionary.txt' (relative path), replaces every character that is
    not an ASCII letter with a space, tokenizes the result with
    word_tokenize and wraps the tokens in nltk.Text.
    '''
    with open('dictionary.txt','r') as handle:
        contents = handle.read()

    # Digits, punctuation and any non-ASCII characters all become separators.
    letters_only = re.sub("[^a-zA-Z]", " ", contents)
    return nltk.Text(word_tokenize(letters_only))

In [5]:
def synonym_wn(input, output, form=None):
    '''
    Function to provide synonyms for words from the Wordnet corpus.

    Input should be a string. (The parameter name shadows the builtin
    ``input``; it is kept so existing keyword callers continue to work.)

    Form refers to the parts of speech, which by default is None. Options include:
    'n'    NOUN
    'v'    VERB
    'a'    ADJECTIVE
    's'    ADJECTIVE SATELLITE
    'r'    ADVERB

    output takes an integer denoting the maximum number of synonyms to output.

    Returns a list of at most ``output`` distinct lemma names, in the order
    they are first encountered while walking the synsets returned by
    wn.synsets(input, form). A non-positive ``output`` yields an empty list.
    '''
    synon = []
    seen = set()  # O(1) duplicate check; ``synon`` keeps first-seen order
    for synset in wn.synsets(input, form):
        for lemma in synset.lemma_names():
            # Stop as soon as the limit is hit instead of scanning the
            # remaining synsets for lemmas that would be discarded anyway.
            if len(synon) >= output:
                return synon
            if lemma not in seen:
                seen.add(lemma)
                synon.append(lemma)
    return synon

In [6]:
def synonym_the(input, output):
    '''
    Function to provide synonyms for words from Thesaurus.com.

    Input should be a string; it is passed to WordAnalyzer and the result of
    get_synonym() is used as the candidate list.
    Output takes an integer denoting the maximum number of synonyms to output.

    Returns a list of at most ``output`` distinct synonyms, in the order the
    source provides them. Duplicates are skipped before counting towards the
    limit, so a repeated entry does not reduce the number of words returned,
    and the order no longer depends on set iteration order.
    '''
    synon = []
    seen = set()  # O(1) duplicate check; ``synon`` keeps source order
    for item in WordAnalyzer(input).get_synonym():
        if len(synon) >= output:
            break
        if item not in seen:
            seen.add(item)
            synon.append(item)
    return synon

In [7]:
def synonym(input, output=20):
    '''
    Build a pandas table comparing Wordnet and Thesaurus synonyms for a word.

    Input should be a string.
    Output is the maximum number of synonyms requested from each source.
    Should be an int.

    The returned DataFrame has the columns 'Thesaurus', 'The_count',
    'Wordnet' and 'WN_count', in that order. Each count column holds
    text.count(...) of the synonym on the same row, where text is the
    value returned by initialize().
    '''
    text = initialize()
    wordnet_words = pd.Series(synonym_wn(input, output))
    thesaurus_words = pd.Series(synonym_the(input, output))

    # Count columns start at 0 and are filled in row by row below.
    table = pd.DataFrame({'The_count': 0,
                          'Wordnet': wordnet_words,
                          'Thesaurus': thesaurus_words,
                          'WN_count': 0})

    rows = zip(table.index, table['Thesaurus'], table['Wordnet'])
    for label, thesaurus_word, wordnet_word in rows:
        table.at[label, 'The_count'] = text.count(thesaurus_word)
        table.at[label, 'WN_count'] = text.count(wordnet_word)

    column_order = ['Thesaurus', 'The_count', 'Wordnet', 'WN_count']
    return table[column_order]