In [None]:
import argparse
import datetime
import logging
import random
import pprint
pp = pprint.PrettyPrinter(indent=2)

import pandas as pd
import numpy as np
import sys
import xlsxwriter

In [None]:
# Path of the evaluation table; the following cell reads it with pd.read_csv(sep=" ", index_col=0).
# NOTE(review): <PROJECT_DIR> and <DATE> look like unfilled placeholders — substitute real values before running.
evaluation_file = "<PROJECT_DIR>/08_propagation_evaluation/<DATE>/main.txt"

In [None]:
# Load the space-separated evaluation table; the first column becomes the index.
df = pd.read_csv(evaluation_file, sep=" ", index_col=0)

# Sanity check: flag rows where y_train, y_test and y_pred are all -1.
# NOTE(review): -1 presumably means "no label" — confirm against the producer of this file.
all_minus_one = (df.y_train == -1) & (df.y_test == -1) & (df.y_pred == -1)
if all_minus_one.any():
    print("WARNING")
else:
    print("All right")

# Keep only the columns used downstream, ordered by index label.
df_evaluation = df.sort_index()[["y_train", "y_pred", "y_conf"]]
print(df_evaluation.shape)
df_evaluation.head()

In [None]:
# Number of rows per synset in the training labels (rows with y_train == -1 are
# excluded) and in the predictions.
# The column names are set explicitly: since pandas 2.0 `value_counts()` names its
# result "count" instead of reusing the source column name, which would make both
# frames carry a "count" column and break the join and the later
# `counts.y_train` / `counts.y_pred` accesses.
train_counts = (
    df_evaluation.y_train[df_evaluation.y_train != -1]
    .value_counts()
    .to_frame(name="y_train")
)
pred_counts = df_evaluation.y_pred.value_counts().to_frame(name="y_pred")

# Left join on the synset id: synsets that never occur in y_pred get NaN in `y_pred`.
counts = train_counts.join(pred_counts)
print(counts.shape)
counts.head()

In [None]:
# Quantile levels that delimit the "low", "mid" and "high" frequency bands.
low_quantil = 0.10
mid_quantilA = 0.45
mid_quantilB = 0.55
high_quantil = 0.90

# Cut-off values for the training counts.
train_low_cut = counts.y_train.quantile(q=low_quantil)
train_mid_lower = counts.y_train.quantile(q=mid_quantilA)
train_mid_upper = counts.y_train.quantile(q=mid_quantilB)
train_high_cut = counts.y_train.quantile(q=high_quantil)

# Cut-off values for the prediction counts.
pred_low_cut = counts.y_pred.quantile(q=low_quantil)
pred_mid_lower = counts.y_pred.quantile(q=mid_quantilA)
pred_mid_upper = counts.y_pred.quantile(q=mid_quantilB)
pred_high_cut = counts.y_pred.quantile(q=high_quantil)

# Boolean masks over `counts` (bounds are inclusive on both sides).
y_train_low = counts.y_train <= train_low_cut
y_train_mid = (counts.y_train >= train_mid_lower) & (counts.y_train <= train_mid_upper)
y_train_high = counts.y_train >= train_high_cut

y_pred_low = counts.y_pred <= pred_low_cut
y_pred_mid = (counts.y_pred >= pred_mid_lower) & (counts.y_pred <= pred_mid_upper)
y_pred_high = counts.y_pred >= pred_high_cut

# Report how many synsets fall into each band.
for label, mask in (
    ("y_train_low", y_train_low),
    ("y_train_mid", y_train_mid),
    ("y_train_high", y_train_high),
):
    print(label, np.sum(mask))

print()

for label, mask in (
    ("y_pred_low", y_pred_low),
    ("y_pred_mid", y_pred_mid),
    ("y_pred_high", y_pred_high),
):
    print(label, np.sum(mask))

In [None]:
# Build the nine (y_train band, y_pred band) groups; a synset may land in more than one.
# Numbering runs with the y_train band as the outer loop and the y_pred band as the inner:
#   g1: low/low    g2: low/mid    g3: low/high
#   g4: mid/low    g5: mid/mid    g6: mid/high
#   g7: high/low   g8: high/mid   g9: high/high
train_bands = (y_train_low, y_train_mid, y_train_high)
pred_bands = (y_pred_low, y_pred_mid, y_pred_high)

# Each mask is a two-column boolean frame (y_train, y_pred).
band_masks = [
    pd.concat((train_band, pred_band), axis=1)
    for train_band in train_bands
    for pred_band in pred_bands
]

# Indexing `counts` with a boolean frame blanks the False cells to NaN, so the
# following dropna() keeps only the rows where BOTH band conditions hold.
band_groups = [counts[band_mask].dropna() for band_mask in band_masks]

for band_group in band_groups:
    print(band_group.shape)

(g1_mask, g2_mask, g3_mask,
 g4_mask, g5_mask, g6_mask,
 g7_mask, g8_mask, g9_mask) = band_masks

g1, g2, g3, g4, g5, g6, g7, g8, g9 = band_groups

In [None]:
# Draw SAMPLES_PER_GROUP synsets from every group and repeat the whole draw until
# no synset has been picked by more than one group.
SAMPLES_PER_GROUP = 3
MAX_TRIES = 1000  # upper bound so an unsatisfiable draw cannot loop forever

groups_to_sample = [g1, g2, g3, g4, g5, g6, g7, g8, g9]

# Fail early with a message that names the offending group instead of the generic
# pandas "larger sample than population" error.
for group_no, group in enumerate(groups_to_sample, start=1):
    if len(group) < SAMPLES_PER_GROUP:
        raise ValueError(
            "group g{} has only {} synset(s); {} are required".format(
                group_no, len(group), SAMPLES_PER_GROUP))

num_try = 0
while True:
    print("Try:", num_try)
    samples = [group.sample(n=SAMPLES_PER_GROUP) for group in groups_to_sample]
    pp.pprint([sample.index.tolist() for sample in samples])

    # All drawn synsets are distinct iff de-duplicating does not shrink the list.
    flat_list = [item for sample in samples for item in sample.index.tolist()]
    if len(set(flat_list)) == len(flat_list):
        print("Found distinct samples")
        pp.pprint([sample.index.tolist() for sample in samples])
        break

    print("A synset occurred multiple times! ",len(set(flat_list)),", Trying again...")
    num_try += 1
    if num_try >= MAX_TRIES:
        raise RuntimeError(
            "no distinct sample found after {} tries; "
            "the groups probably overlap too much".format(MAX_TRIES))

In [None]:
def create_cell(_df, _synset_row, _conf_order):
    """Collect the training words and up to ten suggested words for one synset.

    Parameters
    ----------
    _df : pandas.DataFrame
        Evaluation table with the columns ``y_train``, ``y_pred`` and ``y_conf``.
        Its index labels are reported as the words.
    _synset_row : namedtuple
        Row as produced by ``DataFrame.itertuples()``; ``Index`` is the synset id,
        ``y_train`` and ``y_pred`` are only echoed in the remark.
    _conf_order : str
        ``"high_first"`` (descending confidence), ``"low_first"`` (ascending
        confidence) or ``"random"`` (shuffled).

    Returns
    -------
    dict
        ``synset``: the synset id; ``training``: index labels whose ``y_train``
        equals the synset; ``suggestions``: at most ten ``{"word", "conf"}`` dicts
        for rows predicted as this synset whose ``y_train`` differs from ``y_pred``;
        ``remark``: text describing the ordering and the row's counts.

    Raises
    ------
    NotImplementedError
        If ``_conf_order`` is not one of the supported values.
    """
    synset = _synset_row.Index

    # Use the frame that was passed in rather than a notebook-level global.
    training = _df[_df.y_train == synset].index.tolist()

    # Rows predicted as this synset where the prediction differs from the training label.
    is_suggestion = (_df.y_train != _df.y_pred) & (_df.y_pred == synset)
    confidences = _df.loc[is_suggestion, "y_conf"]

    if _conf_order == "high_first":
        confidences = confidences.sort_values(ascending=False)
    elif _conf_order == "low_first":
        confidences = confidences.sort_values(ascending=True)
    elif _conf_order == "random":
        confidences = confidences.sample(frac=1)
    else:
        raise NotImplementedError("unsupported conf_order: {!r}".format(_conf_order))

    # Words come straight from the index labels, so the index does not need to be
    # named "word".
    suggestions = [
        {"word": word, "conf": conf}
        for word, conf in confidences.head(10).items()
    ]

    return {
        "synset": synset,
        "training": training,
        "suggestions": suggestions,
        "remark": "conf_order: {}, y_train: {}, y_pred: {}".format(
            _conf_order, _synset_row.y_train, _synset_row.y_pred),
    }

# One ordering per sampled synset of a group: the first synset is listed by
# descending confidence, the second by ascending confidence, the third shuffled.
conf_orders = ["high_first", "low_first", "random"]

# Cells to display in Excel, in group order.
cells = [
    create_cell(df_evaluation, synset_row, conf_order)
    for sample in samples
    for synset_row, conf_order in zip(sample.itertuples(), conf_orders)
]

pp.pprint(cells[0])

In [None]:
# Write the sampled synsets and their suggestions into an Excel file for manual scoring.
# The timestamp is taken once so the file name and the "generation time" cell agree.
generation_time = datetime.datetime.today().strftime('%Y%m%d-%H%M%S')
eval_scoring_file = "thesaurus_suggestor_" + generation_time + ".xlsx"
print("Evaluation Scoring File: ", eval_scoring_file)

# The context manager closes (and thereby saves) the workbook even if a write fails.
with xlsxwriter.Workbook(eval_scoring_file) as workbook:
    worksheet = workbook.add_worksheet()

    italic = workbook.add_format({'italic': True})
    bold = workbook.add_format({'bold': True})
    gray_bg = workbook.add_format({'bg_color': '#D8D8D8'})
    two_dec_dig = workbook.add_format()
    two_dec_dig.set_num_format('0.00')

    # Provenance header.
    worksheet.write('A3', 'generation time')
    worksheet.write('B3', generation_time)
    worksheet.write('A4', 'evaluation_file')
    worksheet.write('B4', evaluation_file)

    row = 6
    col = 1

    # Column headings and widths.
    worksheet.write(row, col, "Existing Synset", bold)
    worksheet.set_column(col, col, 22)
    worksheet.write(row, col + 3, "Suggestion", bold)
    worksheet.set_column(col + 3, col + 3, 22)
    worksheet.write(row, col + 4, "Confidence", bold)
    worksheet.set_column(col + 4, col + 4, 10)
    worksheet.write(row, col + 5, "Score (worst: 0, best: 2)", bold)

    row = row + 1

    for idx, case in enumerate(cells):
        row_start = row

        # Synset id, with group number and sampling remark attached as a cell comment.
        # The group number assumes three consecutive cells per group.
        worksheet.write(row_start, col - 1, case["synset"])
        worksheet.write_comment(
            row_start, col - 1, "g{}, ".format((idx // 3) + 1) + case["remark"])

        # Training words run down one column ...
        for offset, word in enumerate(case["training"]):
            worksheet.write(row_start + offset, col, word)

        # ... and the ranked suggestions run down beside them, starting on the same row.
        for rank, suggestion in enumerate(case["suggestions"], start=1):
            suggestion_row = row_start + rank - 1
            worksheet.write(suggestion_row, col + 2, rank)
            worksheet.write(suggestion_row, col + 3, suggestion["word"])
            worksheet.write(suggestion_row, col + 4, suggestion["conf"], two_dec_dig)
            worksheet.write(suggestion_row, col + 5, '', gray_bg)

        # Advance past the LONGER of the two lists (plus a gap) so the next synset
        # cannot overwrite training words that extend below the suggestions.
        row = row_start + max(len(case["training"]), len(case["suggestions"])) + 2