In [6]:
import json
import sys
from collections import defaultdict

import pandas as pd
from rouge_score import rouge_scorer
from tqdm.autonotebook import tqdm

sys.path.append('..')
from config import SUNIL_TASKS

In [7]:
# Compare each task's generated ("augmented") prompts with its original prompts.
# For every task file in SUNIL_TASKS this records:
#   avg_rouge      - mean ROUGE-L F1 of each generated prompt vs. every original prompt
#   avg_rouge_self - mean ROUGE-L F1 of each generated prompt vs. the other generated
#                    prompts (None when there is only one generated prompt)
#   avg_len        - mean character length of the original prompts
#   avg_len_aug    - mean character length of the generated prompts
# exact_matches_rouge[task] collects every (generated, original) pair whose F1 == 1.
#
# NOTE(review): rouge_score's default tokenizer appears to lowercase the text and keep
# only [a-z0-9] tokens, so prompts made of symbols or non-Latin script may collapse to
# the same tokens and be reported as "exact" matches - confirm before trusting those.
agg_stats = {}
exact_matches_rouge = defaultdict(list)
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
for t in tqdm(SUNIL_TASKS):
    print(t)
    df = pd.read_json(f"../training_data/{t}")
    df_aug = pd.read_json(f"../training_data_generated/{t}")
    # Nothing to compare; the averages below would also divide by zero.
    if len(df_aug) == 0 or len(df) == 0:
        continue

    questions = list(df['inputs'])
    aug_questions = list(df_aug['inputs'])

    # compute average rouge (generated vs. original)
    avg_rouge = 0
    for aug_q in aug_questions:
        score_aug_sum = 0
        for q in questions:
            # Score each pair once and reuse the value for both the exact-match
            # check and the running sum (scoring is the dominant cost here).
            f1 = scorer.score(aug_q, q)["rougeL"].fmeasure
            if f1 == 1:
                exact_matches_rouge[t].append((aug_q, q))
            score_aug_sum += f1
        avg_rouge += score_aug_sum / len(questions)
    avg_rouge /= len(aug_questions)

    # compute average rouge wrt self (generated vs. the other generated prompts)
    if len(aug_questions) > 1:
        avg_rouge_self = 0
        for i, aug_q in enumerate(aug_questions):
            score_aug_sum = 0
            for j, q in enumerate(aug_questions):
                if i != j:
                    score_aug_sum += scorer.score(aug_q, q)["rougeL"].fmeasure
            avg_rouge_self += score_aug_sum / (len(aug_questions) - 1)
        avg_rouge_self /= len(aug_questions)
    else:
        # A single generated prompt has no peers to compare against; the
        # original code raised ZeroDivisionError here.
        avg_rouge_self = None

    # compute average length of generation vs true (in characters)
    avg_len = df['inputs'].apply(len).mean()
    avg_len_aug = df_aug['inputs'].apply(len).mean()

    # TODO: label balance distribution (aug data) - not implemented yet

    agg_stats[t] = {'avg_rouge': avg_rouge, 'avg_rouge_self': avg_rouge_self,
                    'avg_len': avg_len, 'avg_len_aug': avg_len_aug}

agg_stats


  2%|▏         | 1/48 [00:00<00:04,  9.77it/s]

kanji_ascii.json
known_unknowns.json


  4%|▍         | 2/48 [00:00<00:20,  2.23it/s]

logic_grid_puzzle.json


  6%|▋         | 3/48 [01:38<33:28, 44.63s/it]

logical_args.json


  8%|▊         | 4/48 [02:06<28:00, 38.19s/it]

logical_deduction.json


 10%|█         | 5/48 [02:14<19:43, 27.53s/it]

logical_fallacy_detection.json


 12%|█▎        | 6/48 [02:15<12:57, 18.52s/it]

minute_mysteries_qa.json
misconceptions.json


 17%|█▋        | 8/48 [02:16<06:23,  9.60s/it]

misconceptions_russian.json
moral_permissibility.json


 21%|██        | 10/48 [02:24<04:32,  7.17s/it]

navigate.json


 23%|██▎       | 11/48 [02:26<03:47,  6.16s/it]

nonsense_words_grammar.json


 25%|██▌       | 12/48 [02:27<02:53,  4.82s/it]

novel_concepts.json


 27%|██▋       | 13/48 [02:29<02:24,  4.12s/it]

odd_one_out.json


 29%|██▉       | 14/48 [02:30<01:45,  3.10s/it]

parsinlu_qa.json
penguins_in_a_table.json


 33%|███▎      | 16/48 [02:37<01:48,  3.40s/it]

periodic_elements.json


 35%|███▌      | 17/48 [02:38<01:23,  2.68s/it]

persian_idioms.json


 38%|███▊      | 18/48 [02:40<01:16,  2.53s/it]

phrase_relatedness.json


 40%|███▉      | 19/48 [02:41<01:05,  2.25s/it]

physics.json


 42%|████▏     | 20/48 [02:51<01:59,  4.26s/it]

play_dialog_same_or_different.json


 44%|████▍     | 21/48 [03:46<08:26, 18.77s/it]

presuppositions_as_nli.json


 46%|████▌     | 22/48 [03:57<07:06, 16.39s/it]

question_selection.json


 48%|████▊     | 23/48 [04:54<11:44, 28.18s/it]

real_or_fake_text.json


 50%|█████     | 24/48 [05:15<10:26, 26.11s/it]

reasoning_about_colored_objects.json


 52%|█████▏    | 25/48 [05:17<07:20, 19.15s/it]

rhyming.json


 54%|█████▍    | 26/48 [05:19<05:03, 13.78s/it]

riddle_sense.json


 56%|█████▋    | 27/48 [05:20<03:35, 10.26s/it]

ruin_names.json


 58%|█████▊    | 28/48 [05:23<02:39,  7.96s/it]

salient_translation_error_detection.json


 60%|██████    | 29/48 [05:56<04:54, 15.48s/it]

similarities_abstraction.json


 62%|██████▎   | 30/48 [05:57<03:18, 11.04s/it]

simple_ethical_questions.json


 65%|██████▍   | 31/48 [06:05<02:50, 10.05s/it]

snarks.json


 67%|██████▋   | 32/48 [06:06<02:01,  7.56s/it]

social_iqa.json


 69%|██████▉   | 33/48 [06:09<01:30,  6.03s/it]

social_support.json


 71%|███████   | 34/48 [06:10<01:05,  4.70s/it]

sports_understanding.json


 73%|███████▎  | 35/48 [06:11<00:46,  3.59s/it]

strange_stories.json


 75%|███████▌  | 36/48 [06:23<01:13,  6.09s/it]

strategyqa.json


 77%|███████▋  | 37/48 [06:24<00:47,  4.35s/it]

suicide_risk.json


 79%|███████▉  | 38/48 [06:34<01:00,  6.06s/it]

swahili_english_proverbs.json


 81%|████████▏ | 39/48 [06:38<00:49,  5.52s/it]

symbol_interpretation.json


 83%|████████▎ | 40/48 [06:55<01:12,  9.03s/it]

temporal_sequences.json


 85%|████████▌ | 41/48 [07:19<01:33, 13.35s/it]

timedial.json


 88%|████████▊ | 42/48 [07:46<01:45, 17.58s/it]

tracking_shuffled_objects.json


 90%|████████▉ | 43/48 [08:03<01:27, 17.48s/it]

understanding_fables.json


 92%|█████████▏| 44/48 [08:51<01:46, 26.64s/it]

unit_conversion.json


 94%|█████████▍| 45/48 [08:52<00:56, 18.78s/it]

vitaminc_fact_verification.json


 96%|█████████▌| 46/48 [09:02<00:32, 16.18s/it]

what_is_the_tao.json


100%|██████████| 48/48 [09:07<00:00, 11.40s/it]

which_wiki_edit.json





{'kanji_ascii.json': {'avg_rouge': 0.46241199770809127,
  'avg_rouge_self': 0.37796599285892596,
  'avg_len': 278.24,
  'avg_len_aug': 315.6},
 'known_unknowns.json': {'avg_rouge': 0.3639358248674737,
  'avg_rouge_self': 0.47828546536644295,
  'avg_len': 102.06,
  'avg_len_aug': 99.20454545454545},
 'logic_grid_puzzle.json': {'avg_rouge': 0.4088896379472601,
  'avg_rouge_self': 0.518364255780056,
  'avg_len': 1543.2,
  'avg_len_aug': 1110.3191489361702},
 'logical_args.json': {'avg_rouge': 0.24279109716467664,
  'avg_rouge_self': 0.43557648860525927,
  'avg_len': 859.66,
  'avg_len_aug': 740.9791666666666},
 'logical_deduction.json': {'avg_rouge': 0.47715326221313453,
  'avg_rouge_self': 0.5084851761177126,
  'avg_len': 414.26,
  'avg_len_aug': 422.609756097561},
 'logical_fallacy_detection.json': {'avg_rouge': 0.45711336533623925,
  'avg_rouge_self': 0.5261453392050055,
  'avg_len': 203.28,
  'avg_len_aug': 178.13793103448276},
 'misconceptions.json': {'avg_rouge': 0.6309821347338199,

"ROUGE-L is based on the longest common subsequence (LCS) between our model output and reference, i.e. the longest sequence of words (not necessarily consecutive, but still in order) that is shared between both. A longer shared sequence should indicate more similarity between the two sequences." 

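Thus ROUGE-L is not a great metric for comparing similarity on tasks defined by a specific character set (e.g., kanji ASCII art). The LCS itself does respect token order; the likely problem is that `rouge_score`'s default tokenizer lowercases the text and keeps only ASCII alphanumeric tokens, so the `#`/`.` grids (and, similarly, non-Latin scripts such as Persian) are discarded before scoring and visibly different prompts can collapse to the same token sequence and score 1.0.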

In [20]:
# Tasks with at least one generated prompt that scored ROUGE-L F1 == 1
# against an original prompt.
tasks_with_exact_matches = exact_matches_rouge.keys()
print(tasks_with_exact_matches)

# Number of flagged (generated, original) pairs for the Persian QA task;
# left as the bare last expression so the notebook displays it.
parsinlu_pairs = exact_matches_rouge['parsinlu_qa.json']
len(parsinlu_pairs)

dict_keys(['kanji_ascii.json', 'parsinlu_qa.json'])


246

In [17]:
# Show how many (generated, original) pairs were flagged for the kanji task,
# then print the first pair with a divider between the two prompts.
kanji_pairs = exact_matches_rouge['kanji_ascii.json']
print(len(kanji_pairs))
aug_q, q = kanji_pairs[0]
print(aug_q, '-'*50, q, sep='\n')

246
Q: ............#..
#############..
#..##.#..#..#..
#..#..##.#..#..
#..#...#.#.##..
#.##.#########.
#.##.........#.
#..#.#########.
#..#.........#.
#..###########.
##.##....#.....
####.#.#.##.##.
#....#.#..#..##
#...##.#...##.#
#...#..#####...
A:
--------------------------------------------------
Q: ..#......#.....
..#.....##.....
..#############
.#..#..##.##...
##..##.#...#...
#....#.#...#...
.......#.......
.#############.
.##....#....##.
.##....#....##.
.#############.
.##....#....##.
.##....#....##.
.#############.
..#.........#..
A:


In [23]:
# Print the generated prompt of every flagged kanji pair, each followed by a
# short divider line.
for aug_q, q in exact_matches_rouge['kanji_ascii.json']:
    print(aug_q, '-'*20, sep='\n')

Q: ............#..
#############..
#..##.#..#..#..
#..#..##.#..#..
#..#...#.#.##..
#.##.#########.
#.##.........#.
#..#.#########.
#..#.........#.
#..###########.
##.##....#.....
####.#.#.##.##.
#....#.#..#..##
#...##.#...##.#
#...#..#####...
A:
--------------------
Q: ............#..
#############..
#..##.#..#..#..
#..#..##.#..#..
#..#...#.#.##..
#.##.#########.
#.##.........#.
#..#.#########.
#..#.........#.
#..###########.
##.##....#.....
####.#.#.##.##.
#....#.#..#..##
#...##.#...##.#
#...#..#####...
A:
--------------------
Q: ............#..
#############..
#..##.#..#..#..
#..#..##.#..#..
#..#...#.#.##..
#.##.#########.
#.##.........#.
#..#.#########.
#..#.........#.
#..###########.
##.##....#.....
####.#.#.##.##.
#....#.#..#..##
#...##.#...##.#
#...#..#####...
A:
--------------------
Q: ............#..
#############..
#..##.#..#..#..
#..#..##.#..#..
#..#...#.#.##..
#.##.#########.
#.##.........#.
#..#.#########.
#..#.........#.
#..###########.
##.##....#.....
####.#.#.##.##.
#...

In [22]:
# Print the generated prompt of every flagged Persian QA pair, each followed
# by a short divider line.
for aug_q, q in exact_matches_rouge['parsinlu_qa.json']:
    print(aug_q, '-'*20, sep='\n')

Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استیو جابز
  choice: مارک زاکربرگ
  choice: لری پیج
A:
--------------------
Q: چه کسی موسس شرکت اپل است؟
  choice: بیل گیتس
  choice: استی

In [24]:
# Persist the per-task statistics to generated_stats.json (relative to the
# notebook's working directory). `json` is imported in the notebook's import cell.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('generated_stats.json', 'w') as f:
    json.dump(agg_stats, f)

# Finding - ChatGPT appears poor at generating Persian QA and kanji ASCII question examples.
# NOTE(review): the exact matches behind this finding may partly be an artifact of the
# ROUGE tokenizer dropping non-alphanumeric / non-Latin characters - confirm by
# comparing the raw strings before relying on it.