## Import packages

In [1]:
import sys
import os
import argparse
import logging
import datetime
import codecs
import math
import glob
from pathlib2 import Path
import collections
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
# Notebook-wide plotting defaults: a large default figure size and seaborn's
# "ticks" style.
rcParams["figure.figsize"] = [20.0, 10.0]
# NOTE(review): the IPAexGothic font must be installed locally for this setting
# to take effect - presumably chosen for Japanese glyph support; confirm.
sns.set(style="ticks", font="IPAexGothic")
from see import see

In [2]:
# Repository root: the parent of the directory the kernel is running in.
repo_dir = Path.cwd().parent
repo_dir

PosixPath('/home/skato/Dropbox/ghq/github.com/sosuke-k/stc3-baseline')

In [3]:
# Put the repository root on the import path so that modules located there can
# be imported from this notebook (presumably where the `data` module imported
# in the next cell lives - confirm).
sys.path.append(str(repo_dir))

In [4]:
from data import parse_labels

## Set path variables

In [5]:
# Locations of the train/test JSON files, resolved relative to the repository
# root (the `_en` suffix presumably denotes the English dataset - confirm).
raw_train_path = repo_dir / "stc3dataset/data/train_data_en.json"
raw_test_path = repo_dir / "stc3dataset/data/test_data_en.json"

## Load dataset

In [6]:
# Read both JSON files inside `with` blocks so the file handles are closed as
# soon as parsing finishes instead of being left open until garbage collection.
with raw_train_path.open() as train_file:
    raw_train = json.load(train_file)
with raw_test_path.open() as test_file:
    raw_test = json.load(test_file)

# Number of entries in the train and test sets.
len(raw_train), len(raw_test)

(1672, 390)

## Define parse generator

In [7]:
def data_gen(raw_data):
    """Yield one tuple of parsed labels per dialogue in ``raw_data``.

    Each dialogue must provide the keys ``"id"``, ``"turns"`` (an iterable of
    dicts with a ``"sender"`` string) and ``"annotations"``.

    Yields:
        ``(dialogue_id, senders, customer_nugget_label,
        helpdesk_nugget_label, quality_label)``, where ``senders`` holds one
        int per turn and the three labels are whatever ``parse_labels``
        returns for the dialogue's annotations.
    """
    for dialogue in raw_data:
        # 1 when the sender string starts with "c", otherwise 0
        # (presumably customer vs. helpdesk - confirm against the dataset).
        senders = [
            1 if turn["sender"].startswith("c") else 0
            for turn in dialogue["turns"]
        ]
        parsed = parse_labels(dialogue["annotations"], senders)
        customer_nugget_label, helpdesk_nugget_label, quality_label = parsed
        yield (
            dialogue["id"],
            senders,
            customer_nugget_label,
            helpdesk_nugget_label,
            quality_label,
        )

## Iterate and parse

In [8]:
# One column store per quality type; each maps a "p_<scale>" column name to
# the list of values collected across all training dialogues.
labels = {q_type: collections.defaultdict(list) for q_type in ("A", "E", "S")}

for train_i, train_sample in enumerate(data_gen(raw_train)):
    # Lightweight progress indicator.
    if not train_i % 1000:
        print(train_i)

    # Element 4 of each sample is the quality label yielded by data_gen.
    q_dist_list = train_sample[4]
    for type_i, q_type in enumerate(("A", "E", "S")):
        type_columns = labels[q_type]
        q_dist = q_dist_list[type_i]
        # Position 0 of a distribution is stored under scale 2, position 4
        # under scale -2.
        for scale_i, scale in enumerate((2, 1, 0, -1, -2)):
            type_columns["p_%d" % scale].append(q_dist[scale_i])

adf, edf, sdf = (pd.DataFrame(labels[key]) for key in ("A", "E", "S"))
len(adf), len(edf), len(sdf)

0
1000


(1672, 1672, 1672)

## Classify into 16 types (2^4 combinations of adjacent-score comparisons)

In [9]:
def add_flag(_df):
    """Add four boolean comparison columns to ``_df`` in place.

    For each pair of adjacent scales (2 vs 1, 1 vs 0, 0 vs -1, -1 vs -2) a
    column named ``"<hi>><lo>"`` is added that is True where ``p_<hi>`` is
    strictly greater than ``p_<lo>``. Ties therefore yield False.

    ``_df`` must contain the columns ``p_2``, ``p_1``, ``p_0``, ``p_-1`` and
    ``p_-2``. The frame is mutated and nothing is returned.
    """
    scale_names = ("2", "1", "0", "-1", "-2")
    # Walk adjacent (higher, lower) pairs in descending order so the flag
    # columns are appended as "2>1", "1>0", "0>-1", "-1>-2".
    for higher, lower in zip(scale_names, scale_names[1:]):
        _df[higher + ">" + lower] = _df["p_" + higher] > _df["p_" + lower]

In [10]:
# Annotate each per-score frame in place with its comparison flags.
for score_df in (adf, edf, sdf):
    add_flag(score_df)

## Group by flag combination and count

### A score

In [11]:
# For every observed combination of the four comparison flags, count the
# non-null entries of each p_* column in the A-score frame.
adf.groupby(["2>1", "1>0", "0>-1", "-1>-2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p_2,p_1,p_0,p_-1,p_-2
2>1,1>0,0>-1,-1>-2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,False,False,58,58,58,58,58
False,False,False,True,111,111,111,111,111
False,False,True,False,542,542,542,542,542
False,False,True,True,545,545,545,545,545
False,True,False,False,10,10,10,10,10
False,True,False,True,41,41,41,41,41
False,True,True,False,107,107,107,107,107
False,True,True,True,45,45,45,45,45
True,False,False,False,2,2,2,2,2
True,False,False,True,10,10,10,10,10


### E score

In [12]:
# For every observed combination of the four comparison flags, count the
# non-null entries of each p_* column in the E-score frame.
edf.groupby(["2>1", "1>0", "0>-1", "-1>-2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p_2,p_1,p_0,p_-1,p_-2
2>1,1>0,0>-1,-1>-2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,False,False,86,86,86,86,86
False,False,False,True,86,86,86,86,86
False,False,True,False,480,480,480,480,480
False,False,True,True,208,208,208,208,208
False,True,False,False,79,79,79,79,79
False,True,False,True,131,131,131,131,131
False,True,True,False,308,308,308,308,308
False,True,True,True,158,158,158,158,158
True,False,False,True,1,1,1,1,1
True,False,True,False,46,46,46,46,46


### S score

In [13]:
# For every observed combination of the four comparison flags, count the
# non-null entries of each p_* column in the S-score frame.
sdf.groupby(["2>1", "1>0", "0>-1", "-1>-2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,p_2,p_1,p_0,p_-1,p_-2
2>1,1>0,0>-1,-1>-2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,False,False,100,100,100,100,100
False,False,False,True,149,149,149,149,149
False,False,True,False,486,486,486,486,486
False,False,True,True,589,589,589,589,589
False,True,False,False,15,15,15,15,15
False,True,False,True,25,25,25,25,25
False,True,True,False,92,92,92,92,92
False,True,True,True,45,45,45,45,45
True,False,False,False,1,1,1,1,1
True,False,False,True,7,7,7,7,7
