In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import string

### Step 1: Identify Text to Use

* I'm using a book, but long term goal would be easy-to-consume articles

In [2]:
file = open('test_article.txt', 'r') 

In [3]:
data = file.readlines()

In [4]:
data[1]

"May is quiet in government meetings, too. \\'93She sits, you talk. She sits. She looks at you, and then you leave,\\'94 a former Cabinet colleague told me recently. May\\'92s preferred method of communicating with the public is in the form of long speeches, which she delivers with a certain steel. She can land a joke, if she has time to prepare. But when she is forced to speak off the cuff, in Parliament or to the press, her body stiffens and she takes deep breaths. She has a wide, expressive mouth that cracks into grimaces and betrays an inner tumult, while the sentences that emerge are frequently circular and devoid of clear meaning.\\\n"

In [5]:
sent_tokens = []
for i in data:
    sent_tokens.append(sent_tokenize(i))

In [6]:
tokens = [sent for sent in map(word_tokenize, data)]

In [7]:
tokens_lower = [[word.lower() for word in sent]
                 for sent in tokens]

In [8]:
punctuation_ = set(string.punctuation)

def filter_tokens(sent):
    return([w for w in sent if not w in punctuation_])

In [9]:
tokens_filtered = list(map(filter_tokens, tokens_lower))
tokens_filtered[0]

['the',
 'british',
 'prime',
 'minister',
 'theresa',
 'may',
 'often',
 'strikes',
 'people',
 'as',
 'cautious',
 'but',
 'her',
 'political',
 'career',
 'has',
 'been',
 'defined',
 'by',
 'acts',
 'of',
 'boldness',
 'often',
 'on',
 'behalf',
 'of',
 'unfashionable',
 'causes',
 'or',
 'in',
 'the',
 'face',
 'of',
 'seemingly',
 'impossible',
 'circumstances',
 'the',
 'misconception',
 'arises',
 'in',
 'part',
 'because',
 'she',
 'is',
 'an',
 'awkward',
 'person',
 'may',
 'who',
 'is',
 'sixty-one',
 'is',
 'tall',
 'and',
 'stooped',
 'serious',
 'and',
 'shy',
 'since',
 'she',
 'was',
 'elected',
 'to',
 'parliament',
 'in',
 'the',
 'late',
 'nineteen-nineties',
 'she',
 'has',
 'dressed',
 'in',
 'sharp',
 'eye-catching',
 'clothes',
 'as',
 'if',
 'to',
 'offset',
 'the',
 'fact',
 'that',
 'she',
 'is',
 'not',
 'personally',
 'vivacious',
 'but',
 'the',
 'effect',
 'is',
 'often',
 'to',
 'accentuate',
 'what',
 'is',
 'not',
 'there',
 'may',
 "doesn\\'92t",
 'sa

### Step 2: Identify vocab words to substitute

* Identify words, research word2vec and how to evaluate the word
* Look at synonyms and antonyms
* https://www.geeksforgeeks.org/get-synonymsantonyms-nltk-wordnet-python/
* http://www.nltk.org/howto/wordnet.html

In [10]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [80]:
def close_words(my_word):

    synonyms = [] 
    antonyms = [] 

    for syn in wordnet.synsets(str(my_word)): 
        for l in syn.lemmas(): 
            synonyms.append(l.name()) 
            if l.antonyms(): 
                antonyms.append(l.antonyms()[0].name()) 
    return set(synonyms)

In [82]:
synonyms = close_words("nice")
synonyms

{'Nice',
 'courteous',
 'dainty',
 'decent',
 'gracious',
 'nice',
 'overnice',
 'prissy',
 'skillful',
 'squeamish'}

In [83]:
def measured_words(my_word, other_word):
    synonyms = close_words(my_word)
    w2 = wordnet.synsets(other_word)
    for word in synonyms:
        w1 = wordnet.synsets(word)
        for one in w1:
            for two in w2:
                print( "%s\t %s\t : %s" % (one.name, two.name, wordnet.path_similarity(one, two)))

In [84]:
measured_words("nice", "bad")

<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.n.01')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.a.01')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.02')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.03')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.04')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('regretful.a.01')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.06')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.07')>	 : None
<bound method Synset.name of Synset('dainty.s.04')>	 <bound method Synset.name of Synset('bad.s.08')>	 : None
<bou