In [2]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [openpyxl]1/2[0m [openpyxl]
[1A[2KSuccessfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


# Imports

In [3]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score as meteor_score_func
import nltk

# NLTK's METEOR implementation matches synonyms through WordNet, so the corpus must be available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Evaluate Translation - Code

In [4]:
def calc_meteor_score(references, candidates):
    """Return the mean sentence-level METEOR score over paired samples.

    Args:
        references: Sequence whose i-th item is the reference argument handed to
            ``meteor_score_func`` for sample i.
        candidates: Sequence whose i-th item is the hypothesis for sample i.
            It is indexed by position, so it must be at least as long as
            ``references``.

    Returns:
        ``np.mean`` of the per-sample METEOR scores.
    """
    per_sample = [
        meteor_score_func(references[idx], candidates[idx])
        for idx in range(len(references))
    ]
    return np.mean(per_sample)

In [5]:
def bleu_n(references, candidates, n, w=None, s=None):
    """Score samples with BLEU, treating samples too short for n-grams separately.

    A sample is "scorable" when both its first reference and its candidate have
    at least ``n`` tokens. Scorable samples get sentence-level and corpus-level
    BLEU. The remaining samples are judged by exact match between the first
    reference and the candidate.

    Args:
        references: List where item i is a list of tokenized references for
            sample i. Only the first reference is used for the length check
            and for the exact-match comparison.
        candidates: List where item i is the tokenized candidate for sample i.
        n: N-gram order that decides whether a sample is long enough to score.
        w: BLEU weights forwarded to NLTK. Defaults to uniform weights ``1/n``
            over orders 1..n.
        s: Smoothing function forwarded unchanged to NLTK.

    Returns:
        Tuple ``(mean_sentence_bleu, short_exact_match_rate, corpus_bleu)``.
        The first two entries are ``nan`` when their group of samples is empty.

    Raises:
        ValueError: If ``references`` and ``candidates`` differ in length.
    """
    if len(references) != len(candidates):
        raise ValueError('references and candidates must have the same length')

    if w is None:
        # NLTK cannot handle weights=None; fall back to uniform BLEU-n weights.
        w = [1 / n] * n

    sentence_scores = []
    short_exact_matches = []
    scorable_refs = []
    scorable_cands = []
    for refs, cand in zip(references, candidates):
        if len(refs[0]) < n or len(cand) < n:
            # Too short to contain a single n-gram: fall back to exact match.
            short_exact_matches.append(' '.join(refs[0]) == ' '.join(cand))
        else:
            scorable_refs.append(refs)
            scorable_cands.append(cand)
            sentence_scores.append(sentence_bleu(refs, cand, weights=w, smoothing_function=s))

    corpus_score = corpus_bleu(scorable_refs, scorable_cands, weights=w, smoothing_function=s)

    # np.mean([]) would return nan with a RuntimeWarning; return nan quietly instead.
    mean_sentence = np.mean(sentence_scores) if sentence_scores else np.nan
    short_accuracy = np.mean(short_exact_matches) if short_exact_matches else np.nan
    return mean_sentence, short_accuracy, corpus_score

In [6]:
def calculate_eval(original, fixed, n_grams=(2, 3, 4, 6), smoother=SmoothingFunction().method4, logging_bleu=False):
    """Compare original translations with their manually fixed versions.

    The fixed text is used as the reference and the original text as the
    candidate for both BLEU and METEOR.

    Args:
        original: DataFrame with text columns ``question`` and
            ``answer_1``..``answer_4``.
        fixed: DataFrame row-aligned with ``original`` and holding the same text
            columns, plus ``Semantic_Inequality`` and ``Gender_mismatch``
            (presumably 0/1 flags per row -- confirm against the annotation sheet).
        n_grams: BLEU orders to report; order ``k`` uses uniform weights ``1/k``.
        smoother: NLTK smoothing function used for every BLEU call.
        logging_bleu: When True, also print corpus-level BLEU and the
            exact-match rate of answers too short for each order.

    Returns:
        Dict with the scalar entries ``'Semantic maintaining'``,
        ``'Gender maintaining'``, ``'untouched acc question'`` and
        ``'untouched acc answers'``, plus nested ``'BLEU'`` and ``'METEOR'``
        dicts keyed by ``questions_*`` / ``answers_*``.
    """
    n_grams = list(n_grams)
    eval_dct = {}
    cols = ['answer_1', 'answer_2', 'answer_3', 'answer_4']

    eval_dct['Semantic maintaining'] = 1 - fixed['Semantic_Inequality'].mean()
    eval_dct['Gender maintaining'] = 1 - fixed['Gender_mismatch'].mean()

    # Share of questions left exactly as they were
    eval_dct['untouched acc question'] = (original['question'] == fixed['question']).mean()
    # Share of answers (all four columns pooled) left exactly as they were
    answers_original = pd.Series([i for c in cols for i in original[c].values])
    answers_fixed = pd.Series([i for c in cols for i in fixed[c].values])
    eval_dct['untouched acc answers'] = (answers_original == answers_fixed).mean()

    # Calculate BLEU score: one uniform weight vector per requested order
    bleu_weights = [[1 / i] * i for i in n_grams]
    n_gram_str = [f'BLEU-{i}' for i in n_grams]

    # BLEU score on questions
    references_question = [[i.split()] for i in fixed['question'].values]
    candidates_question = [i.split() for i in original['question'].values]
    score = [sentence_bleu(x, y, weights=bleu_weights, smoothing_function=smoother) for x, y in zip(references_question, candidates_question)]
    # NLTK returns a bare float instead of a list when given a single weight set;
    # atleast_1d keeps one entry per n-gram order in both cases.
    score = np.atleast_1d(np.array(score).mean(axis=0))
    corpus_score = np.atleast_1d(
        corpus_bleu(references_question, candidates_question, weights=bleu_weights, smoothing_function=smoother)
    )

    # METEOR score on questions
    meteor_questions = calc_meteor_score(references_question, candidates_question)

    # BLEU score on answers (short answers are handled inside bleu_n)
    references_answers = [[i.split()] for c in cols for i in fixed[c].values]
    candidates_answers = [i.split() for c in cols for i in original[c].values]
    score_answers = []
    acc_small = []
    corpus_score_answers = []
    for order, weights in zip(n_grams, bleu_weights):
        mean_bleu, small_acc, corpus_bleu_answers = bleu_n(references_answers, candidates_answers, order, weights, smoother)
        score_answers.append(mean_bleu)
        acc_small.append(small_acc)
        corpus_score_answers.append(corpus_bleu_answers)

    # METEOR score on answers
    meteor_answers = calc_meteor_score(references_answers, candidates_answers)

    # Enter results to output
    eval_dct['BLEU'] = {}
    for n, s, sa in zip(n_grams, score, score_answers):
        eval_dct['BLEU'][f'questions_BLEU_{n}'] = s
        eval_dct['BLEU'][f'answers_BLEU_{n}'] = sa

    eval_dct['METEOR'] = {}
    eval_dct['METEOR']['questions_METEOR'] = meteor_questions
    eval_dct['METEOR']['answers_METEOR'] = meteor_answers

    if logging_bleu:
        print(f'Questions: Corpus  [{", ".join(n_gram_str)}] score: {np.round(corpus_score, 4)}')
        print(f'Questions: Average [{", ".join(n_gram_str)}] score: {np.round(score, 4)}')
        print()
        print(f'Answers: Average   [{", ".join(n_gram_str)}] score: {np.round(score_answers, 4)}')
        print(f'Answers: ACC@small [{", ".join(n_gram_str)}] score: {np.round(acc_small, 4)}')
        print(f'Answers: Corpus    [{", ".join(n_gram_str)}] score: {np.round(corpus_score_answers, 4)}')

    return eval_dct

In [7]:
def color_gradient(val):
    """Map a score to a red-to-green CSS background colour.

    The green channel is ``255 * val ** 2`` and red is its complement, so 0 is
    pure red and 1 is pure green.

    Args:
        val: Numeric score. Values outside [0, 1] are clamped to that range so
            the colour channels stay within 0..255.

    Returns:
        A ``background-color: rgb(...)`` CSS declaration, or an empty string
        (no styling) when ``val`` is missing.
    """
    if pd.isna(val):
        # int(nan) raises; leave missing cells unstyled instead.
        return ''
    clamped = min(max(val, 0), 1)
    g = int(255 * clamped ** 2)
    r = 255 - g
    return f'background-color: rgb({r}, {g}, 0)'


def show_eval_dct(eval_dct):
    """Display the evaluation results as a single-row, colour-coded table.

    Args:
        eval_dct: Result dict holding scalar metrics plus nested ``'BLEU'`` and
            ``'METEOR'`` dicts, which are flattened into extra columns.
    """
    row_label = ['value']

    # Scalar metrics first; the nested groups are expanded into their own columns.
    scalar_part = pd.DataFrame(eval_dct, index=row_label).drop(columns=['BLEU', 'METEOR'])
    nested_parts = [
        pd.DataFrame(eval_dct[group], index=row_label)
        for group in ('BLEU', 'METEOR')
    ]
    table = pd.concat([scalar_part, *nested_parts], axis=1)

    # Three-decimal formatting plus a per-cell background colour.
    display(table.style.format('{:.3f}').map(color_gradient))


def print_eval_dct(eval_dct):
    """Print the evaluation results as aligned plain text.

    Non-dict entries are printed as ``name | value``. The ``'BLEU'`` and
    ``'METEOR'`` entries are printed as a header line followed by one
    tab-indented line per nested metric.

    Args:
        eval_dct: Result dict with scalar metrics and nested metric dicts.
    """
    section_headers = {'BLEU': 'BLEU scores:', 'METEOR': 'METEOR scores:'}

    for key in eval_dct:
        entry = eval_dct[key]
        if type(entry) is not dict:
            print(f'{key:26} | {entry:7.3f}')
        if key in section_headers:
            print(section_headers[key])
            for metric in entry:
                print(f'\t{metric:18} | {entry[metric]:7.3f}')

In [8]:
def get_where_different(original, fixed):
    """Locate the spreadsheet rows whose text differs between the two frames.

    Args:
        original: DataFrame with ``question`` and ``answer_1``..``answer_4``.
        fixed: DataFrame with the same columns, row-aligned with ``original``.

    Returns:
        Dict with ``'questions'`` (an Index of shifted row numbers where the
        question differs) and ``'answers'`` (a sorted list of shifted row
        numbers where at least one answer column differs).
    """
    # Shift from the 0-based DataFrame index to the Excel row number:
    #   + 5 for the few-shot example rows that come first in the sheet,
    #   + 2 because Excel rows start at 1 and the first row holds the column names.
    row_shift = 2 + 5

    def changed_rows(column):
        return original.index[original[column] != fixed[column]] + row_shift

    question_rows = changed_rows('question')

    answer_rows = set()
    for column in ('answer_1', 'answer_2', 'answer_3', 'answer_4'):
        answer_rows.update(changed_rows(column))

    return {
        'questions': question_rows,
        'answers': sorted(answer_rows),
    }

# Evaluate Translation - Claude

In [17]:
# The first five rows of each sheet are the few-shot examples from the prompt;
# drop them and renumber the remaining rows from 0.
original = (
    pd.read_excel('translate_eval/claude_arc_ai2_chall_train_subsample - Original.xlsx', index_col='Unnamed: 0')
    .iloc[5:]
    .reset_index(drop=True)
)
fixed = (
    pd.read_excel('translate_eval/claude_arc_ai2_chall_train_subsample - fixed.xlsx', index_col='Unnamed: 0')
    .iloc[5:]
    .reset_index(drop=True)
)

In [18]:
# Row numbers are shifted by get_where_different to match the Excel sheet
# (header row and the five few-shot rows are accounted for).
dif = get_where_different(original, fixed)
dif

{'questions': Index([ 7, 11, 17, 22, 27, 28, 35, 38, 39, 41, 43, 44, 48, 51, 54, 55, 56, 60,
        63, 66, 69, 71, 73, 74, 75, 77, 78, 80, 81],
       dtype='int64'),
 'answers': [14, 17, 20, 26, 27, 33, 54, 57, 64, 78, 81]}

In [19]:
%%time
# Compute the metrics once, then show them as a colour-coded table and as plain text.
eval_dct = calculate_eval(original, fixed)
show_eval_dct(eval_dct)
print_eval_dct(eval_dct)

Unnamed: 0,Semantic maintaining,Gender maintaining,untouched acc question,untouched acc answers,questions_BLEU_2,answers_BLEU_2,questions_BLEU_3,answers_BLEU_3,questions_BLEU_4,answers_BLEU_4,questions_BLEU_6,answers_BLEU_6,questions_METEOR,answers_METEOR
value,0.973,0.907,0.613,0.94,0.926,0.971,0.906,0.951,0.887,0.928,0.843,0.904,0.949,0.873


Semantic maintaining       |   0.973
Gender maintaining         |   0.907
untouched acc question     |   0.613
untouched acc answers      |   0.940
BLEU scores:
	questions_BLEU_2   |   0.926
	answers_BLEU_2     |   0.971
	questions_BLEU_3   |   0.906
	answers_BLEU_3     |   0.951
	questions_BLEU_4   |   0.887
	answers_BLEU_4     |   0.928
	questions_BLEU_6   |   0.843
	answers_BLEU_6     |   0.904
METEOR scores:
	questions_METEOR   |   0.949
	answers_METEOR     |   0.873
CPU times: user 118 ms, sys: 3.08 ms, total: 121 ms
Wall time: 120 ms


# Evaluate Translation - dictalm

In [12]:
# The first five rows of each sheet are the few-shot examples from the prompt;
# drop them and renumber the remaining rows from 0.
original = (
    pd.read_excel('translate_eval/dicta_arc_ai2_challenge_train_subsample - Original.xlsx', index_col='Unnamed: 0')
    .iloc[5:]
    .reset_index(drop=True)
)
fixed = (
    pd.read_excel('translate_eval/dicta_arc_ai2_challenge_train_subsample - fixed.xlsx', index_col='Unnamed: 0')
    .iloc[5:]
    .reset_index(drop=True)
)

In [13]:
# Row numbers are shifted by get_where_different to match the Excel sheet
# (header row and the five few-shot rows are accounted for).
dif = get_where_different(original, fixed)
dif

{'questions': Index([12, 19, 21, 22, 25, 26, 27, 29, 38, 41, 42, 43, 48, 51, 63, 69, 75, 77,
        78, 81],
       dtype='int64'),
 'answers': [7,
  8,
  18,
  26,
  41,
  45,
  50,
  51,
  56,
  59,
  63,
  66,
  67,
  70,
  71,
  72,
  78,
  81]}

In [15]:
%%time
# Compute the metrics once, then show them as a colour-coded table and as plain text.
eval_dct = calculate_eval(original, fixed)
show_eval_dct(eval_dct)
print_eval_dct(eval_dct)

Unnamed: 0,Semantic maintaining,Gender maintaining,untouched acc question,untouched acc answers,questions_BLEU_2,answers_BLEU_2,questions_BLEU_3,answers_BLEU_3,questions_BLEU_4,answers_BLEU_4,questions_BLEU_6,answers_BLEU_6,questions_METEOR,answers_METEOR
value,0.84,0.92,0.733,0.883,0.955,0.92,0.946,0.915,0.936,0.873,0.91,0.863,0.968,0.83


Semantic maintaining       |   0.840
Gender maintaining         |   0.920
untouched acc question     |   0.733
untouched acc answers      |   0.883
BLEU scores:
	questions_BLEU_2   |   0.955
	answers_BLEU_2     |   0.920
	questions_BLEU_3   |   0.946
	answers_BLEU_3     |   0.915
	questions_BLEU_4   |   0.936
	answers_BLEU_4     |   0.873
	questions_BLEU_6   |   0.910
	answers_BLEU_6     |   0.863
METEOR scores:
	questions_METEOR   |   0.968
	answers_METEOR     |   0.830
CPU times: user 128 ms, sys: 3.51 ms, total: 131 ms
Wall time: 131 ms
