In [7]:
from pathlib import Path

import pandas as pd

from daseg import DialogActCorpus
from daseg.data import to_transformers_ner_dataset

# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
# Build the corpus via `from_swda_path` from the relative directory `deps/swda/swda`
# (resolved against the notebook's working directory).
dataset = DialogActCorpus.from_swda_path('deps/swda/swda')

In [None]:
# Render the call at index 182; the trailing `;` suppresses the cell's output value.
dataset.calls[182].render();

In [None]:
# Keep handles to the corpus' call ids and calls; later cells reuse `calls` and `call`.
call_ids = dataset.call_ids
calls = dataset.calls
call = dataset.calls[0]
# Preview the first 10 items of the first call.
call[:10]

In [None]:
# Used below as a mapping keyed by dialog act (via `.keys()`, `.items()`, `['Hedge']`);
# presumably act label -> example texts — TODO confirm against `acts_with_examples`.
texts_by_act = dataset.acts_with_examples()

In [None]:
# Number of dialog-act keys, followed by the keys themselves.
len(texts_by_act.keys()), texts_by_act.keys()

In [None]:
# Dialog-act keys found in the loaded corpus, as a set for the comparisons that follow.
acts = set(texts_by_act.keys())

In [None]:
# Reference list of dialog acts, one label per line.
# Blank lines are filtered out instead of blindly dropping the last element, so the
# final label is kept even when the file does not end with a newline.
# NOTE(review): absolute, user-specific path — the file must exist at this location.
original_acts = {
    line
    for line in Path('/Users/pzelasko/jhu/da/swda-dialog-act-list').read_text().splitlines()
    if line
}

In [None]:
# Number of labels in the reference list.
len(original_acts)

In [None]:
# Labels present in the corpus but missing from the reference list.
acts - original_acts

In [None]:
# Number of distinct labels found in the corpus.
len(acts)

In [None]:
# Labels in the reference list that never occur in the corpus.
original_acts - acts

In [None]:
# Number of labels shared by the reference list and the corpus.
len(original_acts & acts)

In [None]:
# Horizontal bar chart (log-scaled x axis) of the number of entries stored under
# each dialog act, sorted in ascending order.
(
    pd.Series({label: len(examples) for label, examples in texts_by_act.items()})
    .sort_values()
    .plot.barh(figsize=(10, 12), logx=True)
)

In [None]:
# Inspect the entries stored under the 'Hedge' act.
texts_by_act['Hedge']

## Number of turns distribution

In [None]:
# Total number of entries summed over all dialog acts.
sum(len(examples) for examples in texts_by_act.values())

In [None]:
# Histogram of `len(call)` over all calls.
pd.Series(list(map(len, calls))).hist()

## Words-per-call distribution

In [None]:
# Special symbols reported by the corpus; reused below by `to_transformers_ner_dataset`.
special_symbols = dataset.special_symbols()
len(special_symbols)

In [None]:
# Whitespace-separated word count per call. Every item of a call unpacks into four
# fields, of which only the first (the text) is used here.
words_len_dist = pd.Series([
    sum(len(text.split()) for text, _, _, _ in segments)
    for segments in calls
])

In [None]:
# Histogram of the per-call word counts.
words_len_dist.hist()

In [None]:
# Preview the first 20 items of the transformers-NER conversion of the call at index 1073.
to_transformers_ner_dataset(calls[1073], special_symbols)[:20]

In [None]:
# Set to True to write every train/dev/test split to disk in the transformers NER
# format. Off by default, so running the notebook does not touch the files under
# deps/transformers/examples/ner/.
WRITE_DATASETS = False

if WRITE_DATASETS:
    for split_name, split_dataset in dataset.train_dev_test_split().items():
        split_dataset.dump_for_transformers_ner(f'deps/transformers/examples/ner/{split_name}.txt.tmp')
else:
    print("DATASETS NOT WRITTEN TO DISK")

# Visualize

In [None]:
# First 20 items of `call` (bound earlier to `dataset.calls[0]`).
call[:20]

In [None]:
# Render `call`, passing max_turns=20 to limit the rendering.
call.render(max_turns=20)

# Train the model / Predict

Refer to `run_da.sh` for training the model and generating predictions.

# Read model predictions

In [None]:
# Path to the predictions file written by the transformers NER example.
#preds_path = '/home/pzelasko/transformers/examples/ner/swda-xlmroberta-kosher-split-t43/test_predictions.txt'
preds_path = '/home/pzelasko/daseg/deps/transformers/examples/ner/xlnet-v1/test_predictions.txt'
# `SwdaDataset` is neither imported nor defined in this notebook (NameError on a fresh
# kernel); use the corpus class that is imported at the top instead.
# NOTE(review): assumes `DialogActCorpus` provides `from_transformers_predictions` — confirm.
# NOTE: this rebinds `calls` from the list of calls to a corpus object (see `calls.calls[idx]` below).
calls = DialogActCorpus.from_transformers_predictions(preds_path)

## Render model predictions

In [None]:
# Index of the predicted call to render in the next cell.
idx = 7

In [None]:
# Render the selected call from the predictions corpus, passing max_turns=None.
calls.calls[idx].render(max_turns=None)

# Inference

In [None]:
# Take the 'test' entry of the corpus' train/dev/test split.
eval_dset = dataset.train_dev_test_split()['test']

In [1]:
# Location of the model to load. Earlier candidates are kept commented out; the active
# value points at a `.ckpt` checkpoint file.
# NOTE(review): absolute, user-specific path.
#model_dir = 'deps/transformers/examples/ner/xlnet-v1/'
#model_dir = '/Users/pzelasko/jhu/da/xlnet-v1/'
#model_dir = '/Users/pzelasko/jhu/da/xlnet-t46-textnorm/'
#model_dir = '/Users/pzelasko/jhu/da/longformer-t42-submission'
model_dir = '/Users/pzelasko/jhu/da/09-01-2020-tacl-submission-models/longformer_swda_dialog_nolower_basic_42/checkpointepoch=8.ckpt'

In [2]:
# Load the model from `model_dir`; `model` is used below for `model.predict(...)`.
from daseg import TransformerModel
model = TransformerModel.from_path(model_dir)

  from tqdm.autonotebook import tqdm


In [None]:
# Placeholder; `eval_dset_vl` is assigned a real corpus in a later cell.
eval_dset_vl = None

In [3]:
# Directory that is searched for `*.txt` transcript files below.
# NOTE(review): absolute, user-specific path.
vl_root = '/Users/pzelasko/jhu/voicelab/bncdata/'

In [11]:
# Map each `*.txt` file name directly under `vl_root` to the list of its lines.
texts = {
    txt_path.name: txt_path.read_text().splitlines()
    for txt_path in Path(vl_root).glob('*.txt')
}
    

In [17]:
# Peek at the first (file name, lines) pair.
next(iter(texts.items()))

('20200808-013921_7048451178-all.txt',
 ["hello. hello. hello good morning my name is Marzena and I'm calling from milano company. I am contacting because our company is _y new on the market and we want to promote ourselves by sending you free package consisting of a razor one pair of black pressure free socks and one pack_trunc for if you're washing washing dishes. so three products very good quality you have for free from us. would you like to receive that? yeah sure. okay would like to give me your address details and I will send the package to you. address? { _y; yes} post office box three six seven. clarks beach. mhm. clarks beach. okay {and; and} what is _unclear. A O A one W O. one W O and the _y street's name please. street _y we don't need street don't used it. so just post office box three six seven. yeah clarks {beach; okay}. A O A one W O Newfoundland and Labrador N F. okay and your name is? Glen _unclear. perfect for number to contact you is the one I am calling right now?

In [10]:
# Explicit imports instead of `from daseg import *`: these are the daseg names the
# following cells use, and listing them keeps the notebook namespace predictable.
from daseg import Call, DialogActCorpus, FunctionalSegment

In [23]:
# Wrap every transcript file into a call with a single segment: the file's lines are
# joined with the '<TURN>' marker and tagged with dialog act 'O' and speaker 'A'.
eval_dset_vl = DialogActCorpus(
    dialogues={
        filename: Call([
            FunctionalSegment(text='<TURN>'.join(lines), dialog_act='O', speaker='A')
        ])
        for filename, lines in texts.items()
    }
)

In [24]:
# Run the model over the transcript corpus, one item per batch.
results = model.predict(eval_dset_vl, batch_size=1)

In [25]:
# List which entries the prediction result provides.
results.keys()

dict_keys(['losses', 'loss', 'predictions', 'logits', 'true_labels', 'true_dataset', 'sklearn_metrics', 'seqeval_metrics', 'ORIGINAL_zhao_kawahara_metrics', 'dataset', 'zhao_kawahara_metrics'])

In [27]:
# The 'dataset' entry of the results; used below as a corpus (`.calls`, `.dialogues`).
pred_corp = results['dataset']

In [28]:
# Render the first call of the predicted corpus.
pred_corp.calls[0].render()

[None]

In [30]:
# Write one file per dialogue of `pred_corp`, one `<text>\t<dialog_act>` line per segment.
# NOTE(review): absolute, user-specific output path.
outdir = Path('/Users/pzelasko/jhu/voicelab/bncdata-pred-swda-lformer-upper')
outdir.mkdir(exist_ok=True, parents=True)
# Loop variables at cell level are notebook globals: the name `pred_call` avoids
# rebinding `call`, which earlier cells (`call[:20]`, `call.render(...)`) rely on.
for cid, pred_call in pred_corp.dialogues.items():
    contents = '\n'.join(
        f'{fs.text}\t{fs.dialog_act}' for fs in pred_call
    )
    # Dialogue ids are the source file names; `with_suffix` forces a `.txt` extension.
    (outdir / cid).with_suffix('.txt').write_text(contents)

In [None]:
# Display the 'sklearn_metrics' entry of the prediction results.
results['sklearn_metrics']

In [None]:
# Display the 'ORIGINAL_zhao_kawahara_metrics' entry of the prediction results.
results['ORIGINAL_zhao_kawahara_metrics']

In [None]:
from seqeval.metrics import classification_report

# The `results.keys()` output earlier in this notebook lists no top-level
# 'accuracy' / 'f1' / 'precision' / 'recall' entries, so indexing them directly raises
# KeyError. Print them when they exist (older result layout), otherwise fall back to
# the 'seqeval_metrics' entry.
# NOTE(review): presumably 'seqeval_metrics' holds these summary scores — confirm.
metric_names = 'accuracy f1 precision recall'.split()
if all(metric_name in results for metric_name in metric_names):
    for metric_name in metric_names:
        print(results[metric_name])
else:
    print(results['seqeval_metrics'])

# Per-label report computed from the reference and predicted label sequences.
print(classification_report(results['true_labels'], results['predictions']))

In [None]:
# Render the first call of the corpus stored under results['dataset'].
results['dataset'].calls[0].render()