這個notebook拿來做一些雜務工作用的，譬如將檔案從question_json_all搬移至question_json

In [1]:
import os
import shutil
import json
from dotenv import load_dotenv


# Directory names (relative paths) used as configuration by the cells below
QUESTION_JSON_DIR = "question_json"  # source directory of the move-to-done cell
QUESTION_JSON_DONE_DIR = "question_json_done"  # destination directory of the move-to-done cell
QUESTION_BANK_DIR = "question_bank"  # not referenced by the visible cells; presumably used elsewhere (TODO confirm)
QUESTION_IMAGES_DIR = "question_images"  # not referenced by the visible cells; presumably used elsewhere (TODO confirm)

In [2]:
# MongoDB connection
#
# Reads the credentials from the environment (load_dotenv() is called first so
# a local .env file can supply them) and opens handles to the `freeseed`
# database and its `exams` collection. `collection` is used by the cell that
# marks exams as parsed.

from urllib.parse import quote_plus

from pymongo import MongoClient

load_dotenv()
mongo_id = os.getenv("mongo_id")
mongo_pw = os.getenv("mongo_pw")

# Fail early with a clear message instead of building a URL that literally
# contains "None:None" and only failing later when the server is contacted.
if not mongo_id or not mongo_pw:
    raise RuntimeError(
        "Missing MongoDB credentials: set mongo_id and mongo_pw in the environment or in .env"
    )

# The user name and password must be percent-encoded; otherwise reserved
# characters such as '@', ':' or '/' inside them corrupt the connection string.
# NOTE(review): if the values stored in .env are already percent-encoded this
# would encode them twice — confirm they are stored raw.
connection_url = (
    f"mongodb+srv://{quote_plus(mongo_id)}:{quote_plus(mongo_pw)}"
    "@cluster0.lvdufzc.mongodb.net/"
)

client = MongoClient(connection_url)

db = client["freeseed"]

collection = db["exams"]

# Cursor over every document in `exams`; not consumed by the visible cells,
# kept so that any cell relying on the name still works.
exams = collection.find()

In [None]:
# Move a range of exam files from QUESTION_JSON_DIR to the done directory
# (QUESTION_JSON_DONE_DIR).
# START_ID / STOP_ID select the exam numbers to move; STOP_ID is exclusive.
# NOTE(review): the loop used to read `range(134, )`, which Python evaluates as
# range(0, 134); that effective range is kept here. If 134 was meant to be the
# first number, set START_ID = 134 and STOP_ID to one past the last number.
START_ID = 0
STOP_ID = 134

dst_dir = QUESTION_JSON_DONE_DIR

# Create the destination directory once, up front (no-op if it already exists)
os.makedirs(dst_dir, exist_ok=True)

moved_count = 0
for i in range(START_ID, STOP_ID):
    filename = f"fse{i:08d}.json"
    src_path = os.path.join(QUESTION_JSON_DIR, filename)

    # Skip numbers that have no file instead of aborting the batch half-way
    if not os.path.isfile(src_path):
        print(f"找不到檔案: {src_path}")
        continue

    # Move the file
    shutil.move(src_path, os.path.join(dst_dir, filename))
    moved_count += 1
    print(f"已搬移 {filename} 至 {dst_dir}")

print(f"共搬移 {moved_count} 個檔案")

In [None]:
# Gather the header data of every exam file in question_json_all into a single
# file (fse_all.json) so it can be searched in one place.

folder = os.path.join("question_json_all")

# Full paths of every .json file directly inside the folder
json_files = [
    os.path.join(folder, name)
    for name in os.listdir(folder)
    if name.endswith('.json')
]

all_data = []

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep only the header: drop the question list stored under '題庫'
    if '題庫' in data:
        del data['題庫']
    all_data.append(data)

# Sort a copy by id, leaving all_data in directory-listing order
all_data_sorted = list(all_data)
all_data_sorted.sort(key=lambda header: header['id'])

# Write the merged JSON file
with open('fse_all.json', 'w', encoding='utf-8') as f:
    json.dump(all_data_sorted, f, ensure_ascii=False, indent=2)

print(f"已合併 {len(all_data_sorted)} 個考試資料至 fse_all.json")

In [4]:
# Copy the exams that match the filter from question_json_all into
# question_json so they can be parsed.
# An entry of fse_all.json matches when its "科目全名" contains "醫學", its
# "類科組別" starts with "醫師" and its "考試年度" is "101", "102" or "103".

fse_all_path = os.path.join("./", "fse_all.json")

with open(fse_all_path, 'r', encoding='utf-8') as f:
    all_data = json.load(f)

# Make sure the destination exists. Without it shutil.copy raises
# FileNotFoundError for the missing directory, which the handler below would
# misreport as a missing source file.
os.makedirs("question_json", exist_ok=True)

for fse in all_data:
    # `or ""` also covers fields that are present but null in the JSON
    if (
        "醫學" in (fse.get("科目全名") or "")
        and (fse.get("類科組別") or "").startswith("醫師")
        and fse.get("考試年度", "") in ["101", "102", "103"]
    ):
        specific_json_file = fse["id"] + ".json"
        be_copied_file = os.path.join("question_json_all", specific_json_file)
        try:
            shutil.copy(be_copied_file, os.path.join("question_json", specific_json_file))
        except FileNotFoundError:
            # The header is listed in fse_all.json but its file is no longer in question_json_all
            print(f"找不到檔案: {be_copied_file}")

找不到檔案: question_json_all/fse00000122.json
找不到檔案: question_json_all/fse00000123.json
找不到檔案: question_json_all/fse00000124.json
找不到檔案: question_json_all/fse00000125.json
找不到檔案: question_json_all/fse00000126.json
找不到檔案: question_json_all/fse00000127.json


In [27]:
# For every .json file in the given folder, delete the file of the same name
# from question_json_all (if one exists there).

folder_path = os.path.join("question_json_done")

# Names of all .json files in the folder being processed
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(folder_path)))

# Delete the same-named files from question_json_all
deleted_count = 0
for json_file in json_files:
    target_path = os.path.join("question_json_all", json_file)
    if not os.path.exists(target_path):
        continue  # nothing to delete for this name
    os.remove(target_path)
    deleted_count = deleted_count + 1
    print(f"已移除舊檔: {target_path}")

print(f"\n共移除 {deleted_count} 個重複檔案")


已移除舊檔: question_json_all/fse00000292.json
已移除舊檔: question_json_all/fse00000379.json
已移除舊檔: question_json_all/fse00000181.json
已移除舊檔: question_json_all/fse00000251.json
已移除舊檔: question_json_all/fse00000230.json
已移除舊檔: question_json_all/fse00000358.json
已移除舊檔: question_json_all/fse00000323.json
已移除舊檔: question_json_all/fse00000365.json
已移除舊檔: question_json_all/fse00000277.json
已移除舊檔: question_json_all/fse00000386.json
已移除舊檔: question_json_all/fse00000282.json
已移除舊檔: question_json_all/fse00000216.json
已移除舊檔: question_json_all/fse00000344.json
已移除舊檔: question_json_all/fse00000372.json
已移除舊檔: question_json_all/fse00000237.json
已移除舊檔: question_json_all/fse00000330.json
已移除舊檔: question_json_all/fse00000310.json
已移除舊檔: question_json_all/fse00000202.json
已移除舊檔: question_json_all/fse00000351.json
已移除舊檔: question_json_all/fse00000297.json
已移除舊檔: question_json_all/fse00000393.json
已移除舊檔: question_json_all/fse00000223.json
已移除舊檔: question_json_all/fse00000188.json
已移除舊檔: question_json_all/fse000002

In [40]:
## Take the id of every JSON file in question_json_done and set parsed to true
## on the documents of mongodb.exams that have the same id

folder_path = os.path.join("question_json_done")

# Names of all .json files in the folder being processed
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(folder_path)))
print(f"Before: {len(json_files)}")

# Turn each file name into an id by stripping the extension
json_files = list(map(lambda name: os.path.splitext(name)[0], json_files))
print(f"After: {len(json_files)}")


# One bulk operation updates every matching document.
# The filter matches on the "id" field (not Mongo's "_id").
id_filter = {"id": {"$in": json_files}}
mark_parsed = {"$set": {"parsed": True}}

# Left as the last expression so the notebook displays the UpdateResult
collection.update_many(id_filter, mark_parsed)

Before: 312
After: 312


UpdateResult({'n': 312, 'electionId': ObjectId('7fffffff000000000000027e'), 'opTime': {'ts': Timestamp(1742482936, 20), 't': 638}, 'nModified': 0, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1742482936, 20), 'signature': {'hash': b'\x08\x1a\xb8D\xc3\xb5\xca\xd07\xcfFZ\x88\xf8!\xbf\xce\xc3\x92~', 'keyId': 7440041778071208022}}, 'operationTime': Timestamp(1742482936, 20), 'updatedExisting': True}, acknowledged=True)

In [None]:
# Load fse_all.json and report how many entities it contains
import json

with open('fse_all.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file is expected to hold a list at the top level; anything else is reported
if not isinstance(data, list):
    print(f"fse_all.json 檔案結構不符合預期，頂層類型為 {type(data).__name__}")
else:
    print(f"fse_all.json 中共有 {len(data)} 個實體")
