# 1. 
Scan over the whole original CSV and encode each tag as a tag index, building the lookup dictionaries `tag2index` and `index2tag`.

In [1]:
import pandas as pd
import tqdm

In [78]:
# Load the crawled tag table. Judging by the preview below, each row carries an
# image id, its source URL, a comma-separated tag string and a parallel
# comma-separated tag-type string.
df = pd.read_csv("ori_tags.csv")
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,id,img_src,tags,types
0,1,//safebooru.org//samples/1/sample_e7b3dc281d43...,"1girl,bag,black hair,blush,bob cut,bowieknife,...","tag-type-general,tag-type-general,tag-type-gen..."
1,2,//safebooru.org//samples/1/sample_27ff11b17a2c...,"barding,black,cape,celty sturluson,dress,dulla...","tag-type-general,tag-type-general,tag-type-gen..."
2,3,//safebooru.org//samples/1/sample_ebd16eb1d154...,"blue eyes,blush,brown hair,original,scan,takoy...","tag-type-general,tag-type-general,tag-type-gen..."
3,4,//safebooru.org//samples/1/sample_6fbb9a4b9099...,"game cg,hagall valkyr,mecha musume,shirogane n...","tag-type-general,tag-type-character,tag-type-g..."
4,6,//safebooru.org//samples/1/sample_113cb63dc541...,"blush,idolmaster,kisaragi chihaya,komi zumiko,...","tag-type-general,tag-type-copyright,tag-type-c..."


# 2. 
Count all tags using `Counter`. Since our task only focuses on background removal, we keep just the 512 most popular `general` tags (which I call `attribute` tags).

In [None]:
from collections import Counter

# Occurrence count of every tag whose type is `tag-type-general`.
tags_count = Counter()
# Positional row indices whose tag list and tag-type list have different
# lengths (badly crawled rows); those rows are skipped when counting.
err = []

# Iterate the two columns directly instead of calling `df.iloc[...]` once per
# row: the per-row lookup builds a Series for each row and is far slower on a
# table this size. A named loop variable also avoids clobbering IPython's `_`.
row_pairs = zip(df['tags'], df['types'])
for row_pos, (tags_str, types_str) in enumerate(tqdm.tqdm(row_pairs, total=len(df))):
    tags = tags_str.split(",")
    tags_type = types_str.split(",")
    # Here are some error crawled data whose tags and tags_type can not match
    if len(tags) != len(tags_type):
        err.append(row_pos)
        continue
    tags_count.update(
        tag for tag, tag_type in zip(tags, tags_type) if tag_type == "tag-type-general"
    )

In [42]:
# Bidirectional lookup between a tag string and a dense integer index.
tag2index = {}
index2tag = {}
# Iterating a Counter yields its keys, i.e. the tag strings themselves.
# Taking `k[0]` here would index into the string and keep only the tag's
# first character, collapsing the mapping to single letters.
for i, tag in enumerate(tags_count):
    tag2index[tag] = i
    index2tag[i] = tag

# 3. 
Filter out the images that don't include any of these 512 tags.
# 4. 
Reindex these tags to 0~511

In [80]:
# The 512 most frequent general tags as (tag, count) pairs, most common first.
attribute_tags = tags_count.most_common(512)

# Re-index those tags by popularity rank: rank 0 is the most common tag.
commontag2index = {}
index2commontag = {}
for rank, tag_and_count in enumerate(attribute_tags):
    tag_name = tag_and_count[0]
    commontag2index[tag_name] = rank
    index2commontag[rank] = tag_name
    print(tag_and_count)

# Set of the selected tag names, for fast membership tests.
most_common_tags = set(commontag2index)

('solo', 1146592)
('1girl', 1066555)
('long hair', 1065307)
('highres', 847419)
('smile', 712395)
('short hair', 688384)
('blush', 567867)
('looking at viewer', 529824)
('open mouth', 498766)
('blue eyes', 471414)
('breasts', 467375)
('blonde hair', 464963)
('multiple girls', 447898)
('brown hair', 437928)
('skirt', 435995)
('hat', 419104)
('black hair', 369230)
('red eyes', 364148)
('dress', 337524)
('ribbon', 321530)
('gloves', 309668)
('hair ornament', 301210)
('simple background', 290922)
('thigh-highs', 286132)
('school uniform', 282261)
('bow', 278287)
('twintails', 275824)
('brown eyes', 266425)
('translation request', 252073)
('green eyes', 235082)
('white background', 231191)
('blue hair', 225044)
('monochrome', 223617)
('2girls', 220068)
('bangs', 214385)
('sitting', 211839)
('weapon', 210262)
('comic', 203962)
('cleavage', 202957)
('long sleeves', 200933)
('shirt', 197192)
('animal ears', 195516)
('jewelry', 188254)
('very long hair', 181549)
('closed eyes', 180714)
('hair r

In [None]:
# For every image, keep only the tags that belong to the most common attribute
# tags, translated to their attribute index and sorted ascending.
# NOTE: images with no matching tag are still recorded, with an empty list.
cleaned_img_id = []
cleaned_img_tags_index = []

# Iterate the columns directly rather than doing a `df.iloc[...]` lookup per
# row, which is much slower on a table this size.
id_tag_pairs = zip(df["id"], df["tags"])
for img_id, tags_str in tqdm.tqdm(id_tag_pairs, total=len(df)):
    tags_index = sorted(
        commontag2index[tag] for tag in tags_str.split(",") if tag in most_common_tags
    )
    cleaned_img_id.append(img_id)
    cleaned_img_tags_index.append(tags_index)


In [102]:
# One row per image: its id and the sorted list of attribute indices.
cleaned_columns = {"id": cleaned_img_id, "attr_index": cleaned_img_tags_index}
cleaned_df = pd.DataFrame(cleaned_columns)
cleaned_df.head()

Unnamed: 0,attr_index,id
0,"[0, 1, 5, 6, 14, 16, 20, 70, 81, 92, 115, 119,...",1
1,"[0, 3, 18, 117]",2
2,"[6, 9, 13, 23, 26, 83, 297]",3
3,"[63, 86, 485]",4
4,"[6, 297]",6


# 5. 
Construct a dict that maps `image_id` to `attr_index`. We should be aware of pictures that were not successfully downloaded: they could cause PyTorch dataloader exceptions, so we remove them using `os.path.exists`. Finally, we cache the dict to `imgid2attr.pkl`, which gives us the training set.

In [None]:
import numpy as np

# Map each image id to a NumPy array of its attribute indices.
imgid2attr = {}
id_attr_pairs = zip(cleaned_img_id, cleaned_img_tags_index)
# `img_id` rather than `id`, so the builtin `id()` is not shadowed for the rest
# of the notebook; `total` lets tqdm show a real progress bar.
for img_id, attr_list in tqdm.tqdm(id_attr_pairs, total=len(cleaned_img_id)):
    # Explicit integer dtype: `np.array([])` would otherwise be float64 for an
    # image with no attribute tags, which cannot be used as an index array.
    imgid2attr[img_id] = np.array(attr_list, dtype=np.int64)

In [6]:
import os
import tqdm

# Directory expected to hold the downloaded images, named `<id><extension>`.
path = "../home4/i2v_data/images/"
ids = imgid2attr.keys()

# Collect every id for which no file exists under any candidate extension.
missing_img = []
candidate_exts = [".jpg", ".png", ".jpeg", ".gif"]
for img_id in tqdm.tqdm(ids):
    candidate_paths = (os.path.join(path, str(img_id) + ext) for ext in candidate_exts)
    # `any` stops at the first existing file, like the original early `break`.
    if not any(os.path.exists(p) for p in candidate_paths):
        missing_img.append(img_id)

100%|██████████| 2530261/2530261 [00:19<00:00, 130217.91it/s]


In [None]:
# Report how many images are missing, then drop each one from the mapping,
# echoing every id that is actually removed.
print(len(missing_img))
for img_id in missing_img:
    if img_id not in imgid2attr:
        continue
    print(img_id)
    del imgid2attr[img_id]

In [5]:
import pickle

# Cache the id -> attribute-array mapping on disk.
with open("imgid2attr.pkl", "wb") as out_file:
    pickle.dump(imgid2attr, out_file)