In [2]:
import json
from collections import Counter

# Read the ID -> tags mapping from its JSON file
with open('id_tags_mapping.json', 'r', encoding='utf-8') as mapping_file:
    id_tags = json.load(mapping_file)

# Accumulate, across every ID, how often each tag occurs
tag_counts = Counter()
for tag_collection in id_tags.values():
    tag_counts.update(tag_collection)



In [6]:
# Headline sizes of the dataset: distinct IDs and distinct tags
print(f"Total number of unique IDs: {len(id_tags)}")
print(f"Total unique tags: {len(tag_counts)}\n")

# The ten most frequent tags, one "tag: count" line each
print("Top 10 most common tags:")
top_ten = tag_counts.most_common(10)
for tag, count in top_ten:
    print(f"{tag}: {count}")

# Aggregate usage: all tag occurrences summed, then the mean per distinct tag
total_tags = sum(tag_counts.values())
avg_tag_usage = total_tags / len(tag_counts)
print(f"\nTotal tag occurrences: {total_tags}")
print(f"Average usage per tag: {avg_tag_usage:.2f}")


Total number of unique IDs: 34558
Total unique tags: 2565

Top 10 most common tags:
黑发: 6971
蓝瞳: 6849
棕发: 5053
金发: 5012
银发: 4832
袜子: 4693
马尾: 4565
红瞳: 4305
黑瞳: 4088
长发: 4074

Total tag occurrences: 325831
Average usage per tag: 127.03


In [9]:
# The 50 most frequent tags, held as a set for membership tests
top_50_tags = {tag for tag, _ in tag_counts.most_common(50)}

# Number of IDs that carry at least one of those tags.
# An ID counts when its tags and the top-50 set share any element.
ids_with_top_50 = sum(
    1 for tags in id_tags.values() if not top_50_tags.isdisjoint(tags)
)

# The saved output of this cell begins with a "Total IDs" line that no
# statement here produced; print it so a fresh run reproduces that output.
print(f"Total IDs: {len(id_tags)}")
print(f"IDs with at least one tag from top 50: {ids_with_top_50}")
print(f"Percentage covered: {(ids_with_top_50 / len(id_tags) * 100):.2f}%")

Total IDs: 34558
IDs with at least one tag from top 50: 33163
Percentage covered: 95.96%


In [10]:
# The 100 most frequent tags, held as a set for membership tests
top_100_tags = {tag for tag, _ in tag_counts.most_common(100)}

# Number of IDs that carry at least one of those tags.
# An ID counts when its tags and the top-100 set share any element.
ids_with_top_100 = sum(
    1 for tags in id_tags.values() if not top_100_tags.isdisjoint(tags)
)

# The saved output of this cell begins with a "Total IDs" line that no
# statement here produced; print it so a fresh run reproduces that output.
print(f"Total IDs: {len(id_tags)}")
print(f"IDs with at least one tag from top 100: {ids_with_top_100}")
print(f"Percentage covered: {(ids_with_top_100 / len(id_tags) * 100):.2f}%")

Total IDs: 34558
IDs with at least one tag from top 100: 33806
Percentage covered: 97.82%


In [15]:
# Names of the 200 most frequent tags, most common first (counts dropped)
print([tag for tag, _ in tag_counts.most_common(200)])

['黑发', '蓝瞳', '棕发', '金发', '银发', '袜子', '马尾', '红瞳', '黑瞳', '长发', '绿瞳', '棕瞳', '金瞳', '学生', '巨乳', '紫瞳', '蓝发', '刘海', '发饰', '萝莉', '长直', '双马尾', '短发', '动物(萌属性)', '过膝袜', '眼镜', '靴子', '紫发', '呆毛', '特殊第一人称', '元气', '红发', 'A型', '傲娇', '粉发', '妹妹', '反差萌', 'O型', '兽耳', '温柔', '高中生', '刀剑', '姐姐', 'B型', '天然呆', '贫乳', '绿发', '兽娘', '手套', '辫子', '连裤袜', '帽子', '腹黑', '尾巴', '御姐', '偶像(萌属性)', 'AB型', '黑长直', '黑色过膝袜', '组织领导人', '麻花辫', '灰瞳', '大小姐', '蝴蝶结', '披风', '卷发', '本名不明', '青梅竹马', '绝对领域', '高马尾', '黑色连裤袜', '橙发', '长鬓角', '粉瞳', '毒舌', '橙瞳', '白色过膝袜', '吃货', '天才', '金发碧眼', '翅膀', '军人', '枪械', '长刘海', '特殊瞳孔', '角', '虚拟UP主', '发箍', '和服', '哥哥', '优等生', '妖怪', '过膝靴', '长靴', '实妹', '机器人', '百合', '神明', '露脐装', '丝带', '乐器(萌属性)', '口癖', '强气', '高跟鞋', '水手服', '齐刘海', '耳饰', '教师', '遮眼发', '人妻', '发夹', '双胞胎', '认真', '忍者', '失忆', '料理达人', '无口', '及膝袜', '吐槽', '挑染', '魔法少女', '国家元首', '君主', '颜艺', '制服', '舰娘', '猫', '机娘', '弟弟', '正太', '治愈系', '虎牙', '遮单眼发', '中分', '第一人称Atashi', '三无', '西装', '实姐', '笨蛋', '歌手(萌属性)', 'S属性', '玩家角色', '第三人称己称', '短裤', '褐色皮肤', '分离袖子', '黑化', '混血儿', '转学生', '下双马

In [27]:
# Tags kept when filtering the ID -> tags mapping. The order follows the
# frequency-ranked printout of the most common tags; some entries from that
# printout (e.g. '袜子', 'A型') are not included here.
selected_tags = [
    '黑发', '蓝瞳', '棕发', '金发', '银发', '马尾', '红瞳', '黑瞳',
    '长发', '绿瞳', '棕瞳', '金瞳', '学生', '巨乳', '紫瞳', '蓝发',
    '刘海', '发饰', '萝莉', '长直', '双马尾', '短发', '动物(萌属性)', '过膝袜',
    '眼镜', '靴子', '紫发', '呆毛', '元气', '红发', '傲娇', '粉发',
    '妹妹', '反差萌', '兽耳', '温柔', '高中生', '刀剑', '姐姐', '天然呆',
    '贫乳', '绿发', '兽娘', '手套', '辫子', '连裤袜', '帽子', '腹黑',
    '尾巴', '御姐', '偶像(萌属性)', '黑长直', '黑色过膝袜', '麻花辫', '灰瞳', '大小姐',
    '蝴蝶结', '披风', '卷发', '本名不明', '青梅竹马', '绝对领域', '高马尾', '黑色连裤袜',
    '橙发', '长鬓角', '粉瞳', '毒舌', '橙瞳', '白色过膝袜', '吃货', '天才',
    '金发碧眼', '翅膀', '军人', '枪械', '长刘海', '特殊瞳孔', '角', '虚拟UP主',
    '发箍', '和服', '哥哥', '优等生', '妖怪', '过膝靴', '长靴', '实妹',
    '机器人', '百合', '神明', '露脐装', '丝带', '乐器(萌属性)', '口癖', '强气',
    '高跟鞋', '水手服', '齐刘海', '耳饰', '教师', '遮眼发', '人妻', '发夹',
    '双胞胎', '认真', '忍者', '失忆', '料理达人', '无口', '及膝袜', '吐槽',
    '挑染', '魔法少女', '颜艺', '制服', '舰娘', '猫', '机娘', '弟弟',
    '正太', '治愈系', '虎牙', '遮单眼发', '中分', '三无', '西装', '实姐',
    '笨蛋', '歌手(萌属性)', 'S属性', '短裤', '褐色皮肤', '分离袖子', '黑化', '混血儿',
    '转学生', '下双马尾', '弱气', '变身', '双角', '孤儿', '佣人', '老好人',
    '短靴', '中长发', '公主', '冰美人', '大叔', '尖耳朵', '美少女', '御宅族',
    '腿环', '大蝴蝶结', '领带', '姬发式', '女仆', '泳装', '裸足', '丸子头',
    '连衣裙', '人外', '魔法师', '商人', 'M形刘海', '妈妈', '小天使', '吊带袜',
    '围巾', '面具', '科学家', '小恶魔系', '猫耳', '超能力者', '异色瞳', '恶魔',
]

# Set view of the same tags, used for membership tests while filtering
selected_tags_set = {*selected_tags}

In [31]:
# Rename table: tags written with a "(萌属性)" suffix are stored under the bare name
tag_substitutions = {
    '偶像(萌属性)': '偶像',
    '动物(萌属性)': '动物',
    '乐器(萌属性)': '乐器',
    '歌手(萌属性)': '歌手',
}

# Re-key the overall counts through the rename table; counts that end up
# under the same final name are added together
new_tag_counts = Counter()
for source_tag, occurrences in tag_counts.items():
    new_tag_counts[tag_substitutions.get(source_tag, source_tag)] += occurrences

# For every ID keep only the selected tags (renamed where the table says so),
# ordered from most to least frequent. IDs left without any tag are skipped.
filtered_id_tags = {}
for id_key, tags in id_tags.items():
    kept_tags = sorted(
        (tag_substitutions.get(tag, tag) for tag in tags if tag in selected_tags_set),
        key=lambda name: new_tag_counts[name],
        reverse=True,
    )
    if kept_tags:
        filtered_id_tags[id_key] = kept_tags

# Report how much of the original mapping survives the filter
print(f"Total IDs: {len(id_tags)}")
print(f"Number of selected tags: {len(selected_tags_set)}")
print(f"IDs in filtered mapping: {len(filtered_id_tags)}")
print(f"Percentage covered: {(len(filtered_id_tags) / len(id_tags) * 100):.2f}%")

# Persist the filtered mapping; ensure_ascii=False keeps the tags readable in the file
with open('filtered_id_tags_mapping.json', 'w', encoding='utf-8') as out_file:
    json.dump(filtered_id_tags, out_file, ensure_ascii=False, indent=2)

Total IDs: 34558
Number of selected tags: 176
IDs in filtered mapping: 33876
Percentage covered: 98.03%


In [32]:
# Rename table: tags written with a "(萌属性)" suffix are stored under the bare name
tag_substitutions = {
    '偶像(萌属性)': '偶像',
    '动物(萌属性)': '动物',
    '乐器(萌属性)': '乐器',
    '歌手(萌属性)': '歌手',
}

# Re-key the overall counts through the rename table; counts that end up
# under the same final name are added together
new_tag_counts = Counter()
for source_tag, occurrences in tag_counts.items():
    new_tag_counts[tag_substitutions.get(source_tag, source_tag)] += occurrences

# Same filtering as the string-keyed variant, but the result is keyed by int(ID)
filtered_id_tags = {}
for id_key, tags in id_tags.items():
    # Converted before filtering, so an ID that int() rejects raises here even
    # when none of its tags would be kept
    int_key = int(id_key)

    # Selected tags only, renamed where the table says so
    renamed_tags = [
        tag_substitutions.get(tag, tag) for tag in tags if tag in selected_tags_set
    ]
    if not renamed_tags:
        continue  # IDs without any selected tag are left out

    # Most frequent tag first, ranked by the re-keyed counts
    filtered_id_tags[int_key] = sorted(
        renamed_tags, key=lambda name: new_tag_counts[name], reverse=True
    )

# Report how much of the original mapping survives the filter
print(f"Total IDs: {len(id_tags)}")
print(f"Number of selected tags: {len(selected_tags_set)}")
print(f"IDs in filtered mapping: {len(filtered_id_tags)}")
print(f"Percentage covered: {(len(filtered_id_tags) / len(id_tags) * 100):.2f}%")


# Persist the filtered mapping. NOTE: json.dump writes dict keys as strings,
# so the integer keys exist only in the in-memory dict, not in the saved file.
with open('filtered_id_tags_mapping.json', 'w', encoding='utf-8') as out_file:
    json.dump(filtered_id_tags, out_file, ensure_ascii=False, indent=2)

Total IDs: 34558
Number of selected tags: 176
IDs in filtered mapping: 33876
Percentage covered: 98.03%
