In [1]:
import json
import glob
from tqdm.auto import tqdm
import pandas as pd
import random
import itertools
from torch.utils.data import Dataset
import os

In [2]:
train_all_questions = []
for path in tqdm(glob.glob("/net/acadia4a/data/zkhan/gqa/train_all_questions/*.json")):
    with open(path, 'r') as f:
        train_shard = json.load(f)
        questions_for_shard = [
            {"question_id": k, 
             "question_type": v["types"]["detailed"],
             "image_id": v["imageId"],
             "label": v["answer"],
             "question": v["question"]
            }
            for k,v in train_shard.items()
        ]
    train_all_questions.extend(questions_for_shard)

  0%|          | 0/10 [00:00<?, ?it/s]

In [3]:
def read_json(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return data


In [4]:
# question type in test set
test_qtype = set([row['question_type'] for row in read_json('/net/acadia7a/data/fkee/nec-learning-agent-cache/42_evaluate_llama318b_fromepoch3/records.jsonl')])

In [5]:
# exisiting_qid
exisiting_qid_file = read_json('/net/acadia7a/data/fkee/learning_agent/f_script/exisitng_qid.jsonl')

In [6]:
exisiting_qid = [row["question_qid"] for row in exisiting_qid_file if row["file_type"]=='all_files'][0]
exisiting_qid

['16218389',
 '15670264',
 '08755604',
 '181066751',
 '11223529',
 '0165933',
 '17725160',
 '1660435',
 '04209634',
 '13684462',
 '01112117',
 '02952749',
 '19866636',
 '03685711',
 '03905466',
 '16393632',
 '14645779',
 '03831804',
 '14228864',
 '13252152',
 '11636737',
 '0313239',
 '0390333',
 '16762559',
 '16156684',
 '11280880',
 '08581894',
 '0195951',
 '15311493',
 '05447785',
 '01834250',
 '0646153',
 '12430977',
 '16434382',
 '011011111',
 '06848356',
 '02181199',
 '05823581',
 '08913252',
 '1812865',
 '04954306',
 '04245809',
 '14697661',
 '0057582',
 '19137837',
 '05238288',
 '111058007',
 '13200784',
 '05371034',
 '06536956',
 '04612280',
 '12309185',
 '06684951',
 '19163099',
 '15797681',
 '18649238',
 '07753271',
 '08480163',
 '09118265',
 '02108766',
 '12646614',
 '06516274',
 '0122169',
 '11285701',
 '09129346',
 '08276153',
 '17662534',
 '11914868',
 '13913708',
 '02851867',
 '1696351',
 '17347030',
 '02255809',
 '00166820',
 '0477849',
 '03735396',
 '09688396',
 '08957

In [7]:
train_all_questions_df = pd.DataFrame(train_all_questions)

In [8]:
train_all_questions_df.shape

(14305356, 5)

In [9]:
filter_train_questions_df = train_all_questions_df[
    (~train_all_questions_df["question_id"].isin(exisiting_qid)) & 
    (train_all_questions_df["question_type"].isin(test_qtype))
]

In [10]:
filter_train_questions_df.shape

(14288706, 5)

In [11]:
qtype_survey = filter_train_questions_df['question_type'].value_counts().to_frame().reset_index()

In [12]:
qtype_survey

Unnamed: 0,question_type,count
0,existC,1292303
1,existOrC,1210754
2,existRelS,866026
3,existRelSC,761033
4,relVerify,572569
...,...,...
97,typeVerifyC,705
98,dir,256
99,company,244
100,companyVerify,104


In [16]:
read_acc_df = pd.read_csv('/home/mai/fke/fkee/learning_agent/f_script/42_evaluate_lora64_epoch2_check_qtype.csv')
read_acc_df["need_number"] = (read_acc_df["total_number"] - read_acc_df["correct_number"])*20

In [17]:
read_acc_df[read_acc_df["question_type"]=="relVerifyCop"]

Unnamed: 0,question_type,ACC,total_number,correct_number,need_number
33,relVerifyCop,0.6,5,3,40


In [18]:
read_acc_df

Unnamed: 0,question_type,ACC,total_number,correct_number,need_number
0,materialVerify,1.0,5,5,0
1,existThatC,1.0,5,5,0
2,existMaterialC,1.0,5,5,0
3,locationVerify,1.0,5,5,0
4,relVerifyCo,1.0,5,5,0
...,...,...,...,...,...
97,dir,0.0,5,0,100
98,state,0.0,5,0,100
99,sameRelate,0.0,5,0,100
100,diffAnimalsC,0.0,5,0,100


In [19]:
read_acc_df.need_number.sum()

4380

In [20]:
read_acc_df[read_acc_df["question_type"].eq("relS")]["need_number"].values.item()

100

In [21]:
## filter out challengue question
sampled_questions: dict[str, list[str]] = {}
for qtype in tqdm(test_qtype):

    # get dataset
    questions_of_qtype = filter_train_questions_df[filter_train_questions_df['question_type'].eq(qtype)]

    # get need number
    need_number = read_acc_df[read_acc_df["question_type"].eq(qtype)]["need_number"].values.item()

    if questions_of_qtype.shape[0] < need_number:
        sampled_questions[qtype] = list(questions_of_qtype['question_id'])
    else:
        sampled_questions[qtype] = random.sample(list(questions_of_qtype['question_id']),k=need_number)

  0%|          | 0/102 [00:00<?, ?it/s]

In [22]:
all_sampled_qids = set({qid for qid in itertools.chain.from_iterable(sampled_questions.values())})    

In [23]:
len(all_sampled_qids)

4380

In [24]:
train_subset = [_ for _ in train_all_questions if _['question_id'] in all_sampled_qids]

In [25]:
len(train_subset)

4380

In [26]:
train_subset[0]

{'question_id': '01834249',
 'question_type': 'dir',
 'image_id': '2352541',
 'label': 'right',
 'question': 'Where is the bird on the twig facing?'}

In [27]:
for _ in train_subset:
    _['image_id'] = f"{_['image_id']}.jpg"

In [28]:
def save_train_file(file_path, data):
    with open(file_path, "w") as f:
        for item in data:
            json_str = json.dumps(item)
            f.write(json_str + "\n")
save_train_file("/net/acadia7a/data/fkee/post_process_GQA/hard_question_5_20.jsonl", train_subset)

In [29]:
exisiting_qid_file

[{'file_type': 'nb057_gqa_train_subset_balanced_10_per_qtype',
  'question_qid': ['16218389',
   '15670264',
   '08755604',
   '181066751',
   '11223529',
   '0165933',
   '17725160',
   '1660435',
   '04209634',
   '13684462',
   '01112117',
   '02952749',
   '19866636',
   '03685711',
   '03905466',
   '16393632',
   '14645779',
   '03831804',
   '14228864',
   '13252152',
   '11636737',
   '0313239',
   '0390333',
   '16762559',
   '16156684',
   '11280880',
   '08581894',
   '0195951',
   '15311493',
   '05447785',
   '01834250',
   '0646153',
   '12430977',
   '16434382',
   '011011111',
   '06848356',
   '02181199',
   '05823581',
   '08913252',
   '1812865',
   '04954306',
   '04245809',
   '14697661',
   '0057582',
   '19137837',
   '05238288',
   '111058007',
   '13200784',
   '05371034',
   '06536956',
   '04612280',
   '12309185',
   '06684951',
   '19163099',
   '15797681',
   '18649238',
   '07753271',
   '08480163',
   '09118265',
   '02108766',
   '12646614',
   '0651627

In [30]:
exisiting_qid_file.append({"file_type": "hard_question_5_20", "question_qid":list(all_sampled_qids)})

In [31]:
exisiting_qid = list(set(exisiting_qid + list(all_sampled_qids)))

In [34]:
exisiting_qid_file[3]["question_qid"] = exisiting_qid

In [130]:
type(exisiting_qid_file[3]["question_qid"])

list

In [35]:
len(exisiting_qid_file)

5

In [36]:
[row["file_type"] for row in exisiting_qid_file]

['nb057_gqa_train_subset_balanced_10_per_qtype',
 'nb046_gqa_balanced_training_set',
 'test_file',
 'all_files',
 'hard_question_5_20']

In [37]:
def save_train_file(file_path, data):
    with open(file_path, "w") as f:
        for item in data:
            json_str = json.dumps(item)
            
            f.write(json_str + "\n")

save_train_file("/net/acadia7a/data/fkee/learning_agent/f_script/exisiting_qid_w_hard5k.jsonl", exisiting_qid_file)