In [1]:
import pickle
import os
import sys
from datetime import datetime
import threading
import math
import json
import torch
import re
from nltk.corpus import stopwords
import urllib

### Embedding computation for Chinese text

In [None]:
def create_bert_emb(all_sents, tok_pooling='mean', get_cls_emb=False):
    """Embed a batch of sentences with the module-level BERT model.

    Relies on the globals ``emb_tokenizer`` and ``emb_model`` that are set up
    in the model-loading cell of this notebook.

    Args:
        all_sents: Sequence of sentence strings to embed.
        tok_pooling: How per-token vectors are reduced to one vector per
            sentence; ``'mean'`` or ``'sum'``. Ignored when ``get_cls_emb``
            is true.
        get_cls_emb: When true, return the model's second output (for a
            ``BertModel`` this is the pooled [CLS] representation) instead of
            pooling token vectors.

    Returns:
        A CPU float tensor with one row per input sentence and one column per
        hidden dimension. An empty input yields a ``(0, hidden_size)`` tensor
        so results can still be concatenated with ``torch.cat``.

    Raises:
        ValueError: If token pooling is requested with an unsupported
            ``tok_pooling`` value.
    """
    # Validate before doing any expensive work; previously a bad value was
    # only detected after the forward pass and reported as a returned string.
    if not get_cls_emb and tok_pooling not in ('mean', 'sum'):
        raise ValueError(
            "invalid tok pooling: %r (expected 'mean' or 'sum')" % (tok_pooling,)
        )

    if len(all_sents) == 0:
        return torch.empty((0, emb_model.config.hidden_size))

    # Run on whatever device the model lives on instead of hard-coding 'cuda'.
    device = emb_model.device

    # Let the tokenizer truncate to the 512-token cap so that the trailing
    # special token is kept and the attention mask stays aligned with the ids.
    all_toks = emb_tokenizer.batch_encode_plus(
        list(all_sents),
        padding='longest',
        truncation=True,
        max_length=512,
        add_special_tokens=True,
        return_tensors='pt',
    )
    tok_tensor = all_toks['input_ids'].to(device)
    attn_mask = all_toks['attention_mask'].to(device)

    with torch.no_grad():
        # Passing the mask stops real tokens from attending to padding, which
        # would otherwise make an embedding depend on its batch neighbours.
        model_out = emb_model(tok_tensor, attention_mask=attn_mask)

    if get_cls_emb:
        return model_out[1].to('cpu')

    token_vecs = model_out[0]
    # Zero out padded positions, then reduce over the sequence axis.
    weights = attn_mask.unsqueeze(-1).to(token_vecs.dtype)
    pooled = (token_vecs * weights).sum(dim=1)
    if tok_pooling == 'mean':
        # clamp guards against division by zero for an all-padding row.
        pooled = pooled / weights.sum(dim=1).clamp(min=1)
    return pooled.float().to('cpu')

In [None]:
def batchify(all_sents, batch_size=100):
    """Split a sequence into consecutive chunks of at most ``batch_size`` items.

    Args:
        all_sents: Sliceable sequence (e.g. a list of sentences).
        batch_size: Maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``all_sents`` in original order. The last chunk
        may be shorter; an empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive (the previous
            implementation looped forever in that case).
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    return [
        all_sents[start:start + batch_size]
        for start in range(0, len(all_sents), batch_size)
    ]

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder come from the same 'hfl/chinese-bert-wwm' checkpoint.
emb_tokenizer_class = BertTokenizer
emb_tokenizer = emb_tokenizer_class.from_pretrained('hfl/chinese-bert-wwm')

# Load the encoder with GPU 0 selected and gradients disabled, switch it to
# eval mode and move it onto the GPU.
with torch.cuda.device(0), torch.no_grad():
    emb_model = BertModel.from_pretrained(
        'hfl/chinese-bert-wwm',
        output_hidden_states=False,
        output_attentions=False,
    )
    emb_model.eval()
    emb_model.to('cuda')

### Loading data

In [2]:
# Base directory holding the mpdd/ data and the Argos model file. It can be
# overridden with the MPDD_BASE_DIR environment variable so the notebook runs
# on other machines. Joining with '' guarantees a trailing separator, which
# the `dir_path + 'mpdd/...'` concatenations in later cells rely on.
dir_path = os.path.join(
    os.environ.get('MPDD_BASE_DIR', '/homes/rpujari/scratch1_fortytwo/DARPA/'),
    '',
)

In [3]:
# Read both MPDD JSON files with context managers so the handles are closed,
# and decode explicitly as UTF-8 rather than relying on the locale default
# (the utterances are Chinese text).
with open(dir_path + 'mpdd/metadata.json', encoding='utf-8') as metadata_file:
    metadata = json.load(metadata_file)
with open(dir_path + 'mpdd/dialogue.json', encoding='utf-8') as dialogue_file:
    dialogue = json.load(dialogue_file)

In [4]:
# Flatten every conversation into two parallel lists: the utterance text and
# an id of the form '<conversation id>-<turn index>'.
all_utterances = []
utterance_ids = []
for conv_id, conv in dialogue.items():
    for turn_idx, turn in enumerate(conv):
        all_utterances.append(turn['utterance'])
        utterance_ids.append(f'{conv_id}-{turn_idx}')
print(len(utterance_ids), len(all_utterances))

25548 25548


In [None]:
# Embed the utterances batch by batch, printing progress and the elapsed
# wall-clock time after each batch.
utterance_batches = batchify(all_utterances)
n_batches = len(utterance_batches)
batch_utterance_embs = []
t1 = datetime.now()
for batch_no, batch in enumerate(utterance_batches, start=1):
    batch_utterance_embs.append(create_bert_emb(batch))
    print(batch_no, '/', n_batches, 'done,', datetime.now() - t1)

In [None]:
# Stack the per-batch embeddings along the first axis and pickle them
# together with the matching utterance ids.
utterance_embs = torch.cat(batch_utterance_embs, 0)
print(utterance_embs.shape)
emb_payload = (utterance_embs, utterance_ids)
with open(dir_path + 'mpdd/bert-base-utterance-embs.pkl', 'wb') as outfile:
    pickle.dump(emb_payload, outfile)

### Translating to English
#### Argostranslate: https://www.argosopentech.com/argospm/index/
#### Hugging Face Transformers

In [5]:
# Three sample Chinese utterances reused by the translation cells below;
# sent2 is also the input to the Stanza tokenization demo.
sent1 = '那個憨女人有什麼值得送的，正鵬這個人也真是的！'
sent2 = '哎喲，老婆子，你怎麼盡講那些不利於團結的話呢！他去送送他的同學也在情理之中嘛！'
sent3 = '爸、媽，我回來啦！'

In [6]:
from argostranslate import package, translate

# Install the offline zh->en Argos package from the data directory.
package.install_from_path(dir_path + 'translate-zh_en-1_1.argosmodel')
installed_languages = translate.get_installed_languages()
print([str(lang) for lang in installed_languages])

# Pick the language pair by language code instead of by list position: the
# order (and length) of the installed-languages list changes as soon as any
# other Argos package is installed. A missing language raises KeyError here
# rather than silently selecting the wrong pair.
languages_by_code = {lang.code: lang for lang in installed_languages}
translation_zh_en = languages_by_code['zh'].get_translation(languages_by_code['en'])

['English', 'Chinese']


### Original translations given in the paper

##### What is Zheng-Peng thinking? He has no need to send the silly woman home.
##### Hey. My old woman. How can you say such uncoordinated words? It’s reasonable for him to send his classmate home.
##### Dad, Mom, I'm back!

#### Google Translate webpage (the API is billed and should be used via Google Cloud Platform)

##### What is there for that silly woman to give, and Zhengpeng is the real one!
##### Alas, old lady, how can you say all those things that are not good for unity! It makes sense for him to send off his classmates!
##### Dad, Mom, I'm back!

In [7]:
# Translate each sample sentence with the offline Argos Translate model.
for sample_sent in (sent1, sent2, sent3):
    print(translation_zh_en.translate(sample_sent))

It was true that the female stereotyped had suffice, and that the perpetrators were.
Alexandre, How you can impose a boycott that is negative. He was sent to his fellows.
raz, I return!


#### Translations using a Hugging Face translation model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model from the same
# "Helsinki-NLP/opus-mt-zh-en" checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en"),
    AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en"),
)

In [None]:
def translate_hf(chinese_sent):
    """Translate one sentence with the module-level ``tokenizer`` and ``model``.

    The sentence is encoded as a batch of one, passed to ``model.generate``,
    and the first decoded sequence is returned with special tokens removed.
    """
    encoded = tokenizer([chinese_sent], return_tensors="pt")
    output_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
# Translate each sample sentence with the Hugging Face model.
for sample_sent in (sent1, sent2, sent3):
    print(translate_hf(sample_sent))

### Tokenizing into sentences (Stanza: https://stanfordnlp.github.io/stanza/tokenize.html)

In [None]:
import stanza

In [None]:
# Download the Stanza resources for the 'zh-hant' language code, the same
# code the Pipeline in the next cell is constructed with.
stanza.download('zh-hant')

In [None]:
# Build a tokenize-only Stanza pipeline and run it on sent2, then list the
# tokens of each detected sentence.
stanza_nlp = stanza.Pipeline(lang='zh-hant', processors='tokenize')
doc = stanza_nlp(sent2)
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))