# 将bdd100k图片标签，提取为cn-clip要求的格式
cn-clip要求的图片标签格式如下：
![cn-clip-image-id](images/cn-clip-image-id.png)

其中text_id表示标签id，text表示标签名称，image_ids表示对应的图片

bdd100k中的图片标签数据是一张图片对应多个标签；在我们这个版本中，将每张图片的标签拆开，每个标签单独生成一条记录，最终呈现的效果是：
![image-single-tag](images/image-single-tag.png)

In [16]:
import os
import json
import shutil


# Load the mapping between tag keys and their tag ids
def get_category_id_map():
    """Return the parsed contents of ``category_map.json``.

    The file is read from the current working directory as UTF-8 and the
    decoded JSON value is returned unchanged.
    """
    with open('category_map.json', mode='r', encoding='utf-8') as mapping_file:
        return json.loads(mapping_file.read())


 # 读取图片字符串名称和数字名称的映射
def get_image_id_map(split):
    """Load the image-name to image-id mapping for one dataset split.

    Args:
        split: Split name used to build the file name
            ``"<split>_image_id_map.json"``, resolved against the current
            working directory.

    Returns:
        The decoded JSON content of that file.

    Raises:
        OSError: If the mapping file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    image_id_map_filename = "{}_image_id_map.json".format(split)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(image_id_map_filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# Build one tag record from the category map
def _tag_for(category_id_map, key):
    """Return ``{"name", "id"}`` copied from ``category_id_map[key]``."""
    entry = category_id_map[key]
    return {"name": entry['name'], "id": entry['id']}


# Extract tags from a bdd100k per-image label file
def extractTags(jsonFile, category_id_map=None):
    """Collect the de-duplicated tags described by one label JSON file.

    Tags come from two places in the file: the ``category`` of every object
    in ``frames[0].objects`` and the image-level ``weather``, ``scene`` and
    ``timeofday`` attributes (looked up as ``"<attribute>/<value>"``).
    Objects whose category is ``traffic sign`` or ``traffic light`` are
    looked up as ``"<category>/<trafficLightColor>"``.

    Args:
        jsonFile: Path of the label JSON file.
        category_id_map: Optional mapping from tag key to a dict holding
            ``name`` and ``id``. When omitted it is loaded with
            ``get_category_id_map()``; passing it lets a caller that
            processes many files avoid re-reading the map each time.

    Returns:
        A list of ``{"name": ..., "id": ...}`` dicts without duplicates, in
        order of first appearance.

    Raises:
        KeyError: If the file lacks an expected field, or a composed key
            (for example ``"weather/<value>"``) is absent from the map.
    """
    # A context manager guarantees the handle is closed even on bad JSON.
    with open(jsonFile, encoding='utf-8') as f:
        info = json.load(f)
    objects = info['frames'][0]['objects']
    attributes = info['attributes']

    if category_id_map is None:
        category_id_map = get_category_id_map()

    tags = []
    for obj in objects:
        category = obj['category']
        if category not in category_id_map:
            print("new category:" + category)
            continue
        if category in ('traffic sign', 'traffic light'):
            key = category + "/" + obj['attributes']['trafficLightColor']
        else:
            key = category
        tags.append(_tag_for(category_id_map, key))

    # Image-level attributes share one lookup scheme: "<attribute>/<value>".
    for attribute in ('weather', 'scene', 'timeofday'):
        if attribute in attributes:
            key = attribute + '/' + attributes[attribute]
            tags.append(_tag_for(category_id_map, key))

    # De-duplicate while keeping first-seen order so output is reproducible.
    seen = set()
    unique_list = []
    for tag in tags:
        marker = (tag['name'], tag['id'])
        if marker not in seen:
            seen.add(marker)
            unique_list.append(tag)

    return unique_list


def save_image_by_tag(
    imageId,
    tag,
    src_dir=r"E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id",
    dst_dir=r"E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images",
):
    """Copy one image into ``dst_dir`` under a file name derived from its tag.

    The source is ``<src_dir>/<imageId>.jpg`` and the destination is
    ``<dst_dir>/<tag>.jpg``. Because the destination is named after the tag
    alone, a later call with the same tag overwrites the earlier copy.

    Args:
        imageId: Image id; converted with ``str()`` to form the source name.
        tag: Tag name used as the destination file name (without extension).
        src_dir: Directory holding the id-named source images. Defaults to
            the previously hard-coded location.
        dst_dir: Directory receiving the tag-named copies. Defaults to the
            previously hard-coded location and is created when missing.

    Raises:
        OSError: If the source cannot be read or the destination written.
    """
    src_image_path = os.path.join(src_dir, str(imageId) + ".jpg")
    dst_image_path = os.path.join(dst_dir, tag + ".jpg")

    print(src_image_path)
    print(dst_image_path)

    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy(src_image_path, dst_image_path)
        

def main(src_dir, dst_dir, split):
    """Convert every label file under ``src_dir`` into tag/image JSONL lines.

    For each ``.json`` file found while walking ``src_dir``, the tags are
    extracted with ``extractTags`` and one JSON line per tag is appended to
    ``<dst_dir>/<split>_texts.jsonl`` with the keys ``text_id``, ``text``
    and ``image_ids``.

    Args:
        src_dir: Root directory containing the per-image label JSON files.
        dst_dir: Directory in which the output JSONL file is written.
        split: Split name; selects the image-id map and the output file name.

    Raises:
        KeyError: If a label file's base name is missing from the image-id map.
    """
    # Mapping from image string name to numeric image id
    image_id_map = get_image_id_map(split)
    texts_jsonl_filepath = os.path.join(dst_dir, split + "_texts.jsonl")

    for dir_path, _, filenames in os.walk(src_dir):
        for i, filename in enumerate(filenames):
            # Split off the extension properly: str.rstrip(".json") strips any
            # trailing '.', 'j', 's', 'o', 'n' characters, so "person.json"
            # would become "per" rather than "person".
            imageName, extension = os.path.splitext(filename)
            if extension != ".json":
                # Skip stray non-label files instead of failing in json.load.
                continue

            print("processing: {}, {}".format(i, filename))

            # Extract the tags of this image
            filepath = os.path.join(dir_path, filename)
            tags = extractTags(str(filepath))

            # Numeric id of the image
            imageId = image_id_map[imageName]

            if not tags:
                continue

            # Save one "tag -> image" record per line
            with open(texts_jsonl_filepath, 'a', encoding='utf-8') as f:
                for tag in tags:
                    imageCNClipJson = {
                        "text_id": tag['id'],
                        "text": tag['name'],
                        "image_ids": [imageId],
                    }

                    # Optional debugging aid: copy the image under its tag name
                    # save_image_by_tag(imageId, tag['name'])

                    json.dump(imageCNClipJson, f)
                    f.write('\n')


if __name__ == '__main__':
    # Script entry point: convert the bdd100k "train" label files into the
    # CN-CLIP text/image JSONL format.
    main(
        src_dir=r'E:\playground\ai\datasets\bdd100k\bdd100k_labels\bdd100k\labels\100k\train',
        dst_dir=r'E:\playground\ai\datasets\bdd100kLabelsToCNClip',
        split='train',
    )


processing: 0, 0000f77c-6257be58.json
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id\1.jpg
E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images\晴朗.jpg
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id\1.jpg
E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images\汽车.jpg
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id\1.jpg
E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images\可行驶区域.jpg
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id\1.jpg
E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images\未知的交通标志.jpg
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train_id\1.jpg
E:\playground\ai\projects\chinese-clip-dataset-transer\custom-dataset\bdd100k-images\白天.jpg
E:\playground\ai\datasets\bdd100k\bdd100k_images\bdd100k\images\100k\train

OSError: [Errno 22] Invalid argument: 'E:\\playground\\ai\\projects\\chinese-clip-dataset-transer\\custom-dataset\\bdd100k-images\\夜晚.jpg'

使用这种格式的数据进行训练，结果并不理想，猜测大概是两方面的原因：
1. 标签使用的是英文
2. 单个batch中，出现相同标签或图片的概率过大

考虑以上两种原因，做了如下调整：
1. 将标签翻译成对应的中文
2. 将图片的多个标签转换为一段话