In [1]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
from nltk.tokenize.treebank import TreebankWordDetokenizer

# from transformers import BertForQuestionAnswering, AutoModelForQuestionAnswering, AutoTokenizer
# from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering


import torch
import torch.nn as nn
import json
from copy import copy

In [2]:
import nltk
from nltk import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Use BERT or RoBERTa models

In [3]:
model_type = 'bert'

if model_type == 'bert':
    # Download (Using cased to maintain case in output)
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForMaskedLM.from_pretrained('bert-large-cased')
    model_mask = '[MASK]'
    
elif model_type == 'roberta':
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    model_mask = '<mask>'
    

In [4]:
# from nltk.tokenize.treebank import TreebankWordDetokenizer

def bert_named_entity(sentence):
    global model_type
    
    sentence_token_ids = tokenizer.encode(sentence, return_tensors='pt')
    tonized_sentence = tokenizer.tokenize(sentence, return_tensors='pt')

    # Get and format positions of work masks
    mask_positions_2d = (sentence_token_ids.squeeze() == tokenizer.mask_token_id).nonzero()
    mask_positions = [mask.item() for mask in mask_positions_2d ]


    with torch.no_grad():
        output = model(sentence_token_ids)

    last_hidden_state = output[0].squeeze()

    
    token_predictions_list =[]
    for mask_index in mask_positions:
        mask_hidden_state = last_hidden_state[mask_index]
        # This isn't really required unless we want > 1 predicted word per mask. 
        idx = torch.topk(mask_hidden_state, k=5, dim=0)[1]
        
        words = None
        if model_type == 'roberta':
            words = [tokenizer.decode(i.item()).strip() for i in idx]
        elif model_type == 'bert':
            words = tokenizer.decode(idx).split(' ')

        token_predictions_list.append(words)
        
    # Make sure we have a list of predictions for each mask. 
    assert len(mask_positions) == len(token_predictions_list)

    # Replace masks with predicted words
    #   Make a copy so we can calculate shape differences, required for answer_start in QnA.
    predicted_sentence = copy(tonized_sentence) 
    for pos, new_word in zip(mask_positions, token_predictions_list):
        
        # Add the weird G at the front of predicted words, so they look like recognized tokens for detokenization. 
        if model_type == 'roberta':
            predicted_word = 'Ġ'+new_word[0]
        else:
            predicted_word = new_word[0]
        
        predicted_sentence[pos-1] = predicted_word

    return detokenize(predicted_sentence)

def detokenize(tokenized_text):
    global model_type
    punctuation="?:!.,;"
    
    detokenized_sentence=''
    for idx, token in enumerate(tokenized_text):
        if idx == 0:
            detokenized_sentence = token
        elif model_type == 'roberta' and token[0:1] != 'Ġ':
            detokenized_sentence += token
        elif token[0:2] == '##':
            detokenized_sentence += token[2:]
        elif token in punctuation:
            detokenized_sentence += token
        elif detokenized_sentence[-1:] == "'" or token[0:1] == "'":
            detokenized_sentence += token
        else:
            detokenized_sentence += ' '+token
        
    if model_type == 'roberta':
        detokenized_sentence = detokenized_sentence.replace('Ġ', '')
    
    return detokenized_sentence


# CC coordinating conjunction
# CD cardinal digit
# DT determiner
# EX existential there (like: "there is" ... think of it like "there exists")
# FW foreign word
# IN preposition/subordinating conjunction
# JJ adjective 'big'
# JJR adjective, comparative 'bigger'
# JJS adjective, superlative 'biggest'
# LS list marker 1)
# MD modal could, will
# NN noun, singular 'desk'
# NNS noun plural 'desks'
# NNP proper noun, singular 'Harrison'
# NNPS proper noun, plural 'Americans'
# PDT predeterminer 'all the kids'
# POS possessive ending parent's
# PRP personal pronoun I, he, she
# PRP$ possessive pronoun my, his, hers
# RB adverb very, silently,
# RBR adverb, comparative better
# RBS adverb, superlative best
# RP particle give up
# TO to go 'to' the store.
# UH interjection errrrrrrrm
# VB verb, base form take
# VBD verb, past tense took
# VBG verb, gerund/present participle taking
# VBN verb, past participle taken
# VBP verb, sing. present, non-3d take
# VBZ verb, 3rd person sing. present takes
# WDT wh-determiner which
# WP wh-pronoun who, what
# WP$ possessive wh-pronoun whose
# WRB wh-abverb where, when

# Pass sentence and list of parts of speach to be masked.
def mask_sentence_by_part(sentence, part):
    tokenized_sentence = word_tokenize(sentence)
    word_tags = nltk.pos_tag(tokenized_sentence)
    
    for idx, word_tag in enumerate(word_tags):
        if (word_tag[1] in part):
            tokenized_sentence[idx] = model_mask

    return TreebankWordDetokenizer().detokenize(tokenized_sentence)

def augment_language(sentence, part):
    masked_sentence = mask_sentence_by_part(sentence, part)
    return bert_named_entity(masked_sentence)

## Run It

In [5]:

paragraph = "It's a very nice holder - not too big and not too small. It fits any lipstick, lip gloss, chapstick, etc nicely. I love that I'm able to see what I have and not have to dig through a makeup bag anymore. I would highly recommend."
paragraph += " This product takes 10 minutes to setup 4. It is super easy. Very easy on the eyes. It makes the room look like you have a sound system. I used my simple SONY HTIB.The product is stong on the bottom so it's not going to tilt over. The extension part is a bit weak. I have it fully extended as I guess most people would like them. The product because to lose strength and stand a bit off centered. It leans to a side. I wouldn't return them! Just keep in mind that they are a great value and look great. Just has a clumsy feel to it. I tighten them every week when i see them leaning too much."
paragraph += " I have always kept a dustbuster in my kitchen for quick clean ups. They are always charged. Not with this new 18 volt model. Not only does is not have much suction power, you can't keep the battery plugged in so it is always charged and ready to go. If you don't use it for 4-5 days, the charge depletes! So then you have to recharge it for 14-16 hours. Not convenient at all. So I returned the dustbuster. I then ordered a Eureka that works great and can be left plugged in, permanently. The model number is H96. It is not as large as it looks in the photos and is a great buy; not to mention it costs less than the inefficient new dustbusters. The new dustbuster models are now just cheap knock offs of the original great machines. Black and Decker in their attempt to make a higher profit by cheapening a previously great product, destroyed a really good thing.I strongly recommend the Eureka below:Eureka Quick Up Cordless 2 in 1, 96Hproduct."
sentences = nltk.tokenize.sent_tokenize(paragraph)

for sentence in sentences:
     print("before: {}\n after: {}\n".format(sentence, augment_language(sentence, ['JJ', 'VB'])))

before: It's a very nice holder - not too big and not too small.
 after: It's a very good holder - not too big and not too small.

before: It fits any lipstick, lip gloss, chapstick, etc nicely.
 after: It fits any lipstick, lip gloss, chapstick, etc nicely.

before: I love that I'm able to see what I have and not have to dig through a makeup bag anymore.
 after: I love that I'm able to enjoy what I have and not have to go through a makeup bag anymore.

before: I would highly recommend.
 after: I would highly disagree.

before: This product takes 10 minutes to setup 4.
 after: This product takes 10 minutes to $. 4.

before: It is super easy.
 after: It is very rare.

before: Very easy on the eyes.
 after: Very easy on the eyes.

before: It makes the room look like you have a sound system.
 after: It makes the room look like you have a sound system.

before: I used my simple SONY HTIB.The product is stong on the bottom so it's not going to tilt over.
 after: I used my favorite SONY HTIB

## Take a look at tag types

In [6]:
tokenized_sentence = word_tokenize(paragraph)
word_tags = nltk.pos_tag(tokenized_sentence)

for foo in word_tags:
    print(foo)

('It', 'PRP')
("'s", 'VBZ')
('a', 'DT')
('very', 'RB')
('nice', 'JJ')
('holder', 'NN')
('-', ':')
('not', 'RB')
('too', 'RB')
('big', 'JJ')
('and', 'CC')
('not', 'RB')
('too', 'RB')
('small', 'JJ')
('.', '.')
('It', 'PRP')
('fits', 'VBZ')
('any', 'DT')
('lipstick', 'NN')
(',', ',')
('lip', 'NN')
('gloss', 'NN')
(',', ',')
('chapstick', 'NN')
(',', ',')
('etc', 'FW')
('nicely', 'RB')
('.', '.')
('I', 'PRP')
('love', 'VBP')
('that', 'IN')
('I', 'PRP')
("'m", 'VBP')
('able', 'JJ')
('to', 'TO')
('see', 'VB')
('what', 'WP')
('I', 'PRP')
('have', 'VBP')
('and', 'CC')
('not', 'RB')
('have', 'VB')
('to', 'TO')
('dig', 'VB')
('through', 'IN')
('a', 'DT')
('makeup', 'NN')
('bag', 'NN')
('anymore', 'RB')
('.', '.')
('I', 'PRP')
('would', 'MD')
('highly', 'RB')
('recommend', 'VB')
('.', '.')
('This', 'DT')
('product', 'NN')
('takes', 'VBZ')
('10', 'CD')
('minutes', 'NNS')
('to', 'TO')
('setup', 'VB')
('4', 'CD')
('.', '.')
('It', 'PRP')
('is', 'VBZ')
('super', 'JJ')
('easy', 'JJ')
('.', '.')
('Ver