# Initialization and helper functions

In [41]:
import json
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import numpy as np
from numpy import exp
import collections
from scipy import stats
import math
from statistics import stdev, mean
from fractions import Fraction as fr
import pandas as pd
from statistics import median

%matplotlib inline

INTENTS = ["quirky", "factoid", "music", "query", "set", "remove", "sendemail", "negate", "repeat", "explain"]
NUM_INTENTS = len(INTENTS)
NLU_NAMES = ['watson', 'luis', 'snips', 'rasa-sklearn', 'rasa-diet']
COLORS = ['lightcoral', 'yellowgreen', 'deepskyblue', 'purple', 'mediumturquoise', 'mediumorchid', 'khaki', 'salmon', 'darkturquoise', 'gold']
VERSION = 8
NUM_SPLITS = 10

def normalize(vector):
    normalized_vector = [v/sum(vector) for v in vector]
    return normalized_vector

def load_json(path):
    with open(path) as json_file:
        data = json.load(json_file)
    return data

def get_nlu_result_file(nlu, version, split_num):
    nlu_result_file = '../results/' + nlu + '/v' + str(version) + '/' + nlu + '_split_' + str(split_num) + '_results_v' + str(version) + '.json'
    return nlu_result_file

def get_confidence_score(test, rank=0, nlu="watson", level="rank"):
    """
    the output of this function differs depending on the level.
    On rank level, the output is a single float that represents the confidence score of a rank 
    On model level, the output is a list of floats that includes the confidence scores of all ranks
    """
    if level == "rank":
        predicted_intent_conf = test['intent_ranking'][rank]['confidence'] # rasa
        return predicted_intent_conf 
    
    else: # level == "model"
        predicted_confidences = [rank['confidence'] for rank in test['intent_ranking']]
        return predicted_confidences
    
def get_accuracy(test, rank=0, nlu="watson", level="rank"):
    """
    the output of this function differs depending on the level.
    On rank level, the output is a single int that represents the instance-level accuracy of a rank
    On model level, the output is a list of ints that includes the instance-level accuracies of all ranks
    """
    correct_intent = test['correct_intent']
    if level == "rank":
        predicted_intent = test['intent_ranking'][rank]['name']
        accuracy = int(correct_intent == predicted_intent)
        return accuracy
    else:
        test_accuracies = []
        test_accuracies = [rank['name'] == correct_intent for rank in test['intent_ranking']]
        return test_accuracies
    
def normalize_test(nlu, test, normalized_scores):
    """
    Updates scores in a test with normalized scores
    Input: test (json object) with non-normalized scores + list of normalized scores 
    Output: new test with normalized scores
    """
    for rank, score in enumerate(normalized_scores):
        test['intent_ranking'][rank]['confidence'] = score
    return test

def normalize_data(nlu, data):
    """ 
    This function normalizes all confidence scores in the json file
    Output: list of json objects (treated as list of dictionaries)
    """
    new_data = []
    for test in data:
        try:
            original_scores = get_confidence_score(test, nlu=nlu, level="model")
            normalized_scores = normalize(original_scores)
        except:
            normalized_scores = [0.1]*10
        if test['text'] != '':
            new_test = normalize_test(nlu, test, normalized_scores)
            new_data.append(new_test)
    return new_data

def remove_empty_preds(data):
    new_data = []
    for test in data:
        if test['text'] != '':
            new_data.append(test)
    return new_data

# Non-contextual performance

In [None]:
from sklearn import metrics

In [4]:
all_nlus_f1_scores = []
all_nlus_accuracies = []

for i, nlu in enumerate(NLU_NAMES): 
    nlu_f1s = []
    nlu_accs = []
    for iteration in range(NUM_SPLITS):
        split_num = iteration + 1
        # load nlu results
        nlu_result_file = get_nlu_result_file(nlu, VERSION, split_num)
        data = remove_empty_preds(load_json(nlu_result_file))

        predicted_intents = []
        actual_intents = []
        for test in data:
            predicted = test['intent_ranking'][0]['name']
            actual = test['correct_intent']

            predicted_intents.append(predicted)
            actual_intents.append(actual)


        f1_score = metrics.f1_score(actual_intents, predicted_intents, average='macro')
        accuracy_score = metrics.accuracy_score(actual_intents, predicted_intents)
        nlu_f1s.append(f1_score)
        nlu_accs.append(accuracy_score)
    
    
    averaged_f1s = sum(nlu_f1s) / len(nlu_f1s)
    sd_f1 = stdev(nlu_f1s)
    median_f1 = np.median(nlu_f1s)
    averaged_accs = sum(nlu_accs) / len(nlu_accs)
    sd_acc = stdev(nlu_accs)
    median_acc = np.median(nlu_accs)
    all_nlus_f1_scores.append({'nlu': nlu, 'Mean': round(f1_score, 8), 'Median': round(median_f1, 8), 'standard deviation' : round(sd_f1, 8)})
    all_nlus_accuracies.append({'nlu': nlu, 'Mean': round(accuracy_score, 8), 'Median': round(median_acc, 8), 'standard deviation' : round(sd_acc, 8)})
        
all_nlus_f1_scores

[{'nlu': 'watson',
  'Mean': 0.9214412,
  'Median': 0.9197166,
  'standard deviation': 0.00233822},
 {'nlu': 'luis',
  'Mean': 0.88889647,
  'Median': 0.89300483,
  'standard deviation': 0.00407424},
 {'nlu': 'snips',
  'Mean': 0.89028629,
  'Median': 0.89166467,
  'standard deviation': 0.00217631},
 {'nlu': 'rasa-sklearn',
  'Mean': 0.79020273,
  'Median': 0.79561177,
  'standard deviation': 0.00357872},
 {'nlu': 'rasa-diet',
  'Mean': 0.81890462,
  'Median': 0.81716341,
  'standard deviation': 0.00330622}]

In [5]:
all_nlus_accuracies

[{'nlu': 'watson',
  'Mean': 0.92287342,
  'Median': 0.91996738,
  'standard deviation': 0.00225174},
 {'nlu': 'luis',
  'Mean': 0.88726089,
  'Median': 0.890405,
  'standard deviation': 0.0041371},
 {'nlu': 'snips',
  'Mean': 0.88990639,
  'Median': 0.89059627,
  'standard deviation': 0.00233196},
 {'nlu': 'rasa-sklearn',
  'Mean': 0.87263479,
  'Median': 0.87866153,
  'standard deviation': 0.00385567},
 {'nlu': 'rasa-diet',
  'Mean': 0.90376399,
  'Median': 0.89972545,
  'standard deviation': 0.00385903}]

# Contextual performance

In [None]:
import random

CONTEXT_STRENGTH = 0.8
NUM_SAMPLES = 1000

def random_context():
    x = [random.uniform(1 - CONTEXT_STRENGTH, 1 + CONTEXT_STRENGTH) for _n in range(NUM_INTENTS)]
    return normalize(x)

def sample_from_context(context):
    try:
        drawn_sample = np.random.multinomial(1, context, size=1)[0]
    except ValueError:
        print('Exception when trying to draw sample from context', context)
        raise
    intent_index = np.where(drawn_sample == 1)[0][0]
    return INTENTS[intent_index]

def select_test(data, correct_intent):
    tests = [
        test
        for test in data
        if test['correct_intent'] == correct_intent
    ]
    return random.choice(tests)

def get_confidence(intent, test):
    for intent_ranking in test['intent_ranking']:
        if intent_ranking['name'] == intent:
            return intent_ranking['confidence']

def predict_intent(data, context, correct_intent):
    test = select_test(data, correct_intent)
    linguistic_prediction = [get_confidence(intent, test) for intent in INTENTS]
    contextualized_prediction = np.multiply(context, linguistic_prediction)
    predicted_intent_index = np.argmax(contextualized_prediction)
    predicted_intent = INTENTS[predicted_intent_index]
    return predicted_intent
    
def measure_performance_for_nlu_and_split(nlu, split_num, contexts, correct_intents):
    nlu_result_file = get_nlu_result_file(nlu, VERSION, split_num)
    data = remove_empty_preds(load_json(nlu_result_file))
    predicted_intents = [
        predict_intent(data, context, correct_intent)
        for context, correct_intent in zip(contexts, correct_intents)
    ]
    return {'accuracy': metrics.accuracy_score(correct_intents, predicted_intents),
            'f1': metrics.f1_score(correct_intents, predicted_intents, average='macro')}

contexts = [random_context() for _ in range(NUM_SAMPLES)]
correct_intents = [sample_from_context(context) for context in contexts]

performances_per_nlu = {
    nlu: [
        measure_performance_for_nlu_and_split(nlu, split_num, contexts, correct_intents)
        for split_num in range(1, NUM_SPLITS + 1)
    ]
    for nlu in NLU_NAMES
}

In [None]:
accuracy_per_nlu = {
    nlu: mean([
        split_performance['accuracy']
        for split_performance in performances_per_nlu[nlu]
    ])
    for nlu in NLU_NAMES
}

accuracy_per_nlu

In [None]:
f1_score_per_nlu = {
    nlu: mean([
        split_performance['f1']
        for split_performance in performances_per_nlu[nlu]
    ])
    for nlu in NLU_NAMES
}

f1_score_per_nlu