In [None]:
! git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client
! pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir
! rm requirements.txt
! wget https://raw.githubusercontent.com/rvencu/crawlingathome-gpu-hcloud/staged-clients/requirements.txt
! pip3 install -r ./requirements.txt --no-cache-dir
! pip3 install ftfy pandas tfr_image
! pip3 install tensorflow --no-cache-dir
! pip3 install clip-anytorch
! yes | pip3 uninstall pillow
! CC="cc -mavx2" pip3 install -U --force-reinstall pillow-simd

In [None]:
#@title GPU controlled Hetzner Cloud swarm of workers
# Nickname passed to the Crawling@Home client when it registers with the server.
YOUR_NICKNAME_FOR_THE_LEADERBOARD = "Colab-GPU-hcloud" #@param {type:"string"}
# Number of downloaded jobs grouped into one GPU run; 2 * groupsize IO workers are started.
# Declared as an integer form field so the form does not rewrite the value as text.
groupsize = 16 #@param {type:"integer"}
CRAWLINGATHOME_SERVER_URL = "http://cah.io.community/"
# Kept as a safety net in case the value still arrives as a string.
groupsize = int(groupsize)

In [1]:
import os
import re
import sys
import time
import uuid
import clip
import torch
import pickle
import shutil
import threading
import pandas as pd
from glob import glob
from PIL import Image
from pathlib import Path
from statistics import mode
import crawlingathome_client as cah
sys.path.append('./crawlingathome-worker/')
from multiprocessing import JoinableQueue, Process, cpu_count

# Make sure the output folders used by the workers exist.
os.makedirs("./stats/", exist_ok=True)
os.makedirs("./save/", exist_ok=True)

# Matches directory names made of four dash-separated groups of 1-3 digits
# (presumably leftover job folders - confirm against the job naming scheme).
reg_compile = re.compile(r"^\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3}$")


def cleanup_working_files(base="."):
    """Delete leftover working files below ``base``.

    Removes every file whose name starts with ``gpujob.zip_`` and every
    directory whose name matches ``reg_compile``, wherever they sit in the tree.

    Args:
        base: directory to scan; defaults to the current working directory.
    """
    for root, dirnames, filenames in os.walk(base):
        for filename in filenames:
            if filename.startswith("gpujob.zip_"):
                # os.walk yields bare names, so the path must be rebuilt from root.
                os.remove(os.path.join(root, filename))
        stale_dirs = [name for name in dirnames if reg_compile.match(name)]
        for name in stale_dirs:
            shutil.rmtree(os.path.join(root, name))
        # Prune in place so os.walk does not try to descend into removed folders.
        dirnames[:] = [name for name in dirnames if name not in stale_dirs]


# initial cleanup - delete all working files in case of crash recovery
cleanup_working_files(".")

ModuleNotFoundError: No module named 'colorama'

In [None]:
# Joinable queues carry messages between the IO process and the GPU loop.
# Outbound: one queue per IO worker, and twice as many IO workers as the
# group size so the GPU is kept permanently busy.
outbound = [JoinableQueue() for _ in range(2 * groupsize)]

inbound = JoinableQueue()
counter = JoinableQueue()
# Communicates the number of jobs downloading right now.
inpsize = JoinableQueue()
# Flags that the GPU is processing.
gpuflag = JoinableQueue()

# define CLIP class around OpenAI clip model

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
class CLIPDataset(torch.utils.data.Dataset):
    """Torch dataset yielding (preprocessed image, tokenized caption) pairs.

    Rows are read positionally from ``dataframe``; each row must provide a
    ``PATH`` column (image file to open) and a ``TEXT`` column (caption).
    """

    def __init__(self, dataframe, preprocess):
        """Store the frame, the image transform and the CLIP tokenizer.

        Args:
            dataframe: pandas DataFrame with ``PATH`` and ``TEXT`` columns.
            preprocess: callable applied to each opened PIL image.
        """
        self.dataframe = dataframe
        self.image_transform = preprocess
        self.tokenizer = clip.tokenize

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(transformed image, token row)`` for the row at ``index``."""
        row = self.dataframe.iloc[index]
        # Close the file handle as soon as the transform has consumed the image,
        # instead of leaving it to the garbage collector.
        with Image.open(row["PATH"]) as image:
            pixels = self.image_transform(image)
        # The tokenizer returns one row per input text; a single caption is
        # passed, so row 0 is taken.
        tokens = self.tokenizer(row["TEXT"], truncate_text=True)[0]
        return (pixels, tokens)

class CLIP:
    """Wrapper around the CLIP ViT-B/32 model used to score and classify images.

    On construction the model is loaded onto ``device`` and three banks of
    text prompts are encoded once:

    * ``categories`` - content prompts. Entries 0-18 are benign; entries from
      index 19 onward are NSFW prompts. ``df_clipfilter`` compares the top
      indices against 19, so the order and count of entries must not change.
    * ``underaged_categories`` - entries 0-3 describe minors, 4-7 adults
      (``df_clipfilter`` tests ``< 4``).
    * ``animal_categories`` - entries up to index 20 are objects and humans,
      entries above 20 are animals/wildlife (``df_clipfilter`` tests ``> 20``).
    """

    def __init__(self):
        self.model, self.preprocess = clip.load("ViT-B/32", device=device, jit=False)
        self.cosine_similarity = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
        with torch.no_grad():
            # NOTE: a comma was missing between "animal, bird, mammal, insect" and
            # "fashion, clothes"; the implicit string concatenation merged them and
            # moved the first NSFW prompt to index 18, below the threshold of 19.
            self.categories = self.model.encode_text(clip.tokenize(["neutral","selfie", "illustration, drawing", "toys, play, kids, children", "teddy bear, puppet", "animal, bird, mammal, insect", "fashion, clothes", "logo, commercial, ad, advertisement", "drawing, painting","anime, cartoon","comedy, fun","romance, love story","thriller, suspense, crime story","action, action movie", "horror, monster movie", "documentary", "news, journalism", "entertainment", "talk show", "porn, sex, sperm, nipples, breats, tits, boops, penis, dick, cock, clitoris, vagina, fuck, lust, horny, sexual, lick, licking",  "porn, sex, sperm, nipples", "porn, sex, sperm, penis, dick, cock", "nipples, breats, tits, boops, sexy", "penis, dick, cock", "clitoris, vagina", "sex, fuck, lust, horny, sexual, lick, licking", "porn, sex, sexy","sexy, hot","sperm, skin","lust, horny, sexual","lick, licking, body", "anime, hentai, sexy", "cartoon, sexy, sex", "hentai", "anime, sexy, breasts", "hentai"]).to(device))
            self.underaged_categories = self.model.encode_text(clip.tokenize(["teenager, teen", "kid, child, teenager, teen, baby or toddler, underaged, little girl, little boy", "kid, child, little girl, little boy", "baby, toddler","adult, woman, man, grownup, grown person,full-aged of legal age","full-aged, of legal age, adult","woman, man","adult, woman, man, grownup, grown person,full-aged of legal age"]).to(device))
            self.animal_categories = self.model.encode_text(clip.tokenize(["lifeless object, thing", "thing, object", "material", "furniture","wall", "house", "tree", "wood","ground","industry", "table", "bed", "tool", "dress, clothes", "door", "chair", "rock, stone", "human", "man", "woman", "man, woman", "animal","cat","dog", "cow", "pig", "goat", "sheep", "elephant", "horse", "horse, elephant, pig, dog, cat, sheep, goat, animal", "life", "wildlife"]).to(device))

    def similarity_imgalt(self, image_tensor, text_tokens):
        """Encode one batch and return its image features and image/text similarity.

        Args:
            image_tensor: batch of preprocessed images.
            text_tokens: batch of tokenized captions, aligned with the images.

        Returns:
            ``(image_features, similarity)``: image features as a numpy array on
            the CPU and a list with one cosine similarity per pair.
        """
        with torch.no_grad():
            image_features = self.model.encode_image(image_tensor.to(device)).float()
            text_features = self.model.encode_text(text_tokens.to(device)).float()
            similarity = self.cosine_similarity(image_features, text_features).tolist()

        image_features = image_features.detach().cpu().numpy()
        return image_features, similarity

    def preprocess_images(self, df):
        """Run every row of ``df`` through CLIP in batches.

        Args:
            df: DataFrame with ``PATH`` and ``TEXT`` columns (see ``CLIPDataset``).

        Returns:
            ``(image_features, similarities)``: two lists in row order, one image
            embedding and one image/caption cosine similarity per row.
        """
        ret_image_features = []
        ret_similarity = []
        # Larger batches on the GPU, small ones on the CPU.
        batch_size = 256 if device == "cuda" else 8
        dataset = CLIPDataset(df, self.preprocess)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=int(2*cpu_count()/3), pin_memory=True)
        for tensors, tokens in dataloader:
            image_features, similarities = self.similarity_imgalt(tensors, tokens)
            ret_image_features.extend(image_features)
            ret_similarity.extend(similarities)
        return ret_image_features, ret_similarity

    def prob(self, image_features, text_features):
        """Return the indices of the two text prompts closest to an image.

        Args:
            image_features: a single image embedding (array-like or tensor).
            text_features: encoded prompt bank, one row per prompt.

        Returns:
            Tensor with the indices of the two best-matching prompts, best first.
        """
        text_features = text_features.float()
        image_features = torch.as_tensor(image_features).to(device, dtype=torch.float32)
        # Normalise out of place: torch.as_tensor / .to / .float may return a view
        # of the caller's data (always the case on CPU with float32 input), and an
        # in-place division would silently rewrite the embedding that is saved
        # later as well as the cached prompt bank.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        _, indices = similarity.topk(2)
        return indices


# Single shared CLIP wrapper; df_clipfilter reads it as a module-level global.
clip_filter = CLIP()


def df_clipfilter(df):
    """Filter image/caption rows with CLIP and tag the survivors.

    ``df`` must have a 0..n-1 index (rows are addressed with ``df.at[i, ...]``
    using the position of their embedding) and ``PATH`` / ``TEXT`` columns.
    The frame is modified in place: ``dropped``, ``NSFW`` and ``similarity``
    columns are written as rows are processed.

    A row is dropped when
    * its image/caption similarity is below 0.3, or
    * it is not clearly benign and either an underage prompt is among the
      top-2 matches or the caption contains an underage keyword, or
    * it is not clearly benign and the best animal-bank match is an animal.

    Returns:
        ``(embeddings, kept_df)``: the image embeddings of the kept rows, in
        row order, and a re-indexed frame holding only those rows.
    """
    sim_threshold = 0.3
    underaged_text = ["teen", "kid", "child", "baby"]
    first_nsfw_index = 19      # clip_filter.categories: indices >= 19 are NSFW prompts
    first_adult_index = 4      # underaged_categories: indices < 4 describe minors
    last_non_animal_index = 20 # animal_categories: indices > 20 are animals

    img_embedding, similarities = clip_filter.preprocess_images(df)
    tmp_embed = []

    # Create the marker column up front: previously it only came into existence
    # when some row set it, so an empty frame (or one where every row was
    # classified UNLIKELY) failed with KeyError on the final filter.
    df["dropped"] = False

    for i, img_embed in enumerate(img_embedding):
        if similarities[i] < sim_threshold:
            df.at[i, "dropped"] = True
            continue

        # get most similar categories
        nsfw_prob = clip_filter.prob(img_embed, clip_filter.categories)
        df.at[i, "NSFW"] = "UNSURE"
        df.at[i, "similarity"] = similarities[i]
        if nsfw_prob[0] < first_nsfw_index and nsfw_prob[1] < first_nsfw_index:
            # Both top matches are benign: keep without further checks.
            df.at[i, "NSFW"] = "UNLIKELY"
            tmp_embed.append(img_embed)
            continue
        elif nsfw_prob[0] >= first_nsfw_index and nsfw_prob[1] >= first_nsfw_index:
            df.at[i, "NSFW"] = "NSFW"

        underage_prob = clip_filter.prob(img_embed, clip_filter.underaged_categories)
        if (
            underage_prob[0] < first_adult_index
            or underage_prob[1] < first_adult_index
            or any(x in df.at[i, "TEXT"] for x in underaged_text)
        ):
            df.at[i, "dropped"] = True
            continue

        animal_prob = clip_filter.prob(img_embed, clip_filter.animal_categories)
        if animal_prob[0] > last_non_animal_index:
            df.at[i, "dropped"] = True
            continue
        tmp_embed.append(img_embed)

    # Build a fresh, re-indexed frame instead of resetting the index of a slice in place.
    df = df[~df["dropped"].astype(bool)].reset_index(drop=True)
    return tmp_embed, df


def df_tfrecords(df, output_fname):
    """Serialize every row of ``df`` into a single TFRecord file.

    For each row the image file at ``PATH`` is read as raw bytes and stored
    with its ``SAMPLE_ID``, file extension, ``HEIGHT``, ``WIDTH`` and ``TEXT``.

    Args:
        df: DataFrame with ``PATH``, ``SAMPLE_ID``, ``HEIGHT``, ``WIDTH`` and
            ``TEXT`` columns.
        output_fname: path of the TFRecord file to write.
    """
    import tensorflow as tf
    from tfr_image.utils import bytes_feature, int64_feature

    def image_to_tfexample(sample_id, image_data, image_format, height, width, caption):
        # One tf.train.Example per image; the keys are the on-disk schema.
        feature_map = {
            "sampleID": bytes_feature(sample_id),
            "image": bytes_feature(image_data),
            "format": bytes_feature(image_format),
            "label": bytes_feature(caption),
            "height": int64_feature(height),
            "width": int64_feature(width),
        }
        return tf.train.Example(features=tf.train.Features(feature=feature_map))

    with tf.io.TFRecordWriter(output_fname) as tfrecord_writer:
        for position in range(len(df)):
            record = df.iloc[position]
            image_path = record["PATH"]
            # Text after the last dot is stored as the image format.
            extension = image_path.rsplit(".", 1)[-1]
            with tf.io.gfile.GFile(image_path, "rb") as image_file:
                raw_bytes = image_file.read()
            sample_id = str(record["SAMPLE_ID"]).encode("utf_8")
            caption = record["TEXT"].encode("utf_8")
            example = image_to_tfexample(
                sample_id,
                raw_bytes,
                extension.encode("utf_8"),
                record["HEIGHT"],
                record["WIDTH"],
                caption,
            )
            tfrecord_writer.write(example.SerializeToString())


def filter(df, out_fname, output_folder):
    """Run the CLIP filter on ``df`` and write the three result artefacts.

    Writes, all prefixed with ``output_folder`` (which is concatenated as-is
    and therefore must end with a path separator):

    * ``{out_fname}.csv`` - the surviving rows, ``|``-separated;
    * ``image_embedding_dict-{out_fname}.pkl`` - pickled dict mapping
      ``str(SAMPLE_ID)`` to the image embedding;
    * ``crawling_at_home_{out_fname}__00000-of-00001.tfrecord``.

    NOTE: this function shadows the built-in ``filter``; the name is kept
    because ``gpu_worker`` calls it.

    Returns:
        ``(count, results)``: number of surviving rows, and a Series counting
        the surviving rows per shard, where the shard is the second
        ``/``-separated component of ``PATH``.
    """
    img_embeddings, dff = df_clipfilter(df)
    dff.to_csv(f"{output_folder}{out_fname}.csv", index=False, sep="|")

    # count results for each worker from resulting dff
    # Vectorised string split: unlike a row-wise apply this also works on an
    # empty frame, and assign() avoids writing into a filtered slice.
    shard_names = dff["PATH"].astype(str).str.split("/").str[1]
    dff = dff.assign(shard=shard_names)
    results = dff["shard"].value_counts()

    # Embeddings come back in the same order as the surviving rows.
    img_embeds_sampleid = {
        str(sample_id): img_embed
        for sample_id, img_embed in zip(dff["SAMPLE_ID"], img_embeddings)
    }
    with open(f"{output_folder}image_embedding_dict-{out_fname}.pkl", "wb") as f:
        pickle.dump(img_embeds_sampleid, f)

    df_tfrecords(
        dff,
        f"{output_folder}crawling_at_home_{out_fname}__00000-of-00001.tfrecord",
    )
    return len(dff), results


# define workers

In [None]:
def gpu_cah_interface(i:int, incomingqueue: JoinableQueue, outgoingqueue: JoinableQueue, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL):
    """IO worker loop: fetch a job, hand it to the GPU, report the result.

    Runs forever. For each job it creates a local folder named after the
    shard, rsyncs the job files into it, moves the ``*_parsed.csv`` and
    ``*_unfiltered.csv`` files to ``stats/``, puts ``(i, job, upload_address)``
    on ``incomingqueue`` and then blocks until a ``(job, pairs)`` answer
    arrives on ``outgoingqueue``.

    Args:
        i: index of this worker; sent along with the job so the answer can be
            routed back to this worker's queue.
        incomingqueue: queue shared by all IO workers, read by the GPU loop.
        outgoingqueue: this worker's private answer queue.
        YOUR_NICKNAME_FOR_THE_LEADERBOARD: nickname passed to ``cah.init``.
        CRAWLINGATHOME_SERVER_URL: server URL passed to ``cah.init``.
    """
    # initiate and reinitiate a GPU type client if needed
    print (f"   |___ inbound worker started")
    while True:
        client = cah.init(
            url=CRAWLINGATHOME_SERVER_URL, nickname=YOUR_NICKNAME_FOR_THE_LEADERBOARD, type="GPU"
        )
        while client.isAlive():
            while client.jobCount() > 0:
                # each thread gets a new job, passes it to GPU then waits for completion
                try:
                    client.newJob()
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
                    time.sleep(10)
                    continue
                job = client.shard
                # exist_ok: a folder left behind by an earlier failed attempt must not kill the thread.
                os.makedirs("./" + job, exist_ok=True)
                # do not delete the source files just yet
                # NOTE(review): `job` is interpolated into shell commands; it is assumed to be a
                # trusted shard name supplied by the server.
                response = os.system(f"rsync -rzh archiveteam@88.198.2.17::gpujobs/{job}/* {job}")
                if response != 0:
                    client.invalidURL()
                    print (f"[io] invalid job detected: {job}")
                    # Do not leave the folder of a failed download behind.
                    shutil.rmtree("./" + job, ignore_errors=True)
                    continue

                os.system(f"mv {job}/*_parsed.csv stats/")
                os.system(f"mv {job}/*_unfiltered.csv stats/")
                print (f"[io] job sent to GPU: {job}")
                incomingqueue.put((i, job, client.upload_address))

                # wait until the job gets processed: the queue is private to this worker,
                # so a blocking get replaces the former qsize() polling loop.
                outjob, pairs = outgoingqueue.get()
                print (f"[io] received results for: {job}={outjob}")
                outgoingqueue.task_done()
                # pairs can be None when the sender found no surviving rows for this shard.
                if pairs is not None and pairs > 0:
                    print (f"[io] mark job as complete: {job}")
                    client.completeJob(int(pairs))
                shutil.rmtree("./" + job)
                # the worker can now request a new job
            else:
                print (f"[io] no jobs")
                time.sleep(10)
        else:
            print (f"[io] client forgotten")
            time.sleep(10)

def io_worker(incomingqueue: JoinableQueue, outgoingqueue: list, groupsize: int, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL):
    """Start ``2 * groupsize`` ``gpu_cah_interface`` threads.

    Meant to run as the target of a separate process. Thread number ``n`` is
    given ``outgoingqueue[n]`` as its private answer queue; all threads share
    ``incomingqueue``. Any exception raised while starting threads is printed
    and swallowed.
    """
    print (f"[io] inbound workers:")
    try:
        # just launch how many threads we need to group jobs into single output
        worker_count = 2 * groupsize
        for slot in range(worker_count):
            worker_args = (
                slot,
                incomingqueue,
                outgoingqueue[slot],
                YOUR_NICKNAME_FOR_THE_LEADERBOARD,
                CRAWLINGATHOME_SERVER_URL,
            )
            worker_thread = threading.Thread(target=gpu_cah_interface, args=worker_args)
            worker_thread.start()
    except Exception as e:
        print(f"[io] some inbound problem occured: {e}")


def _shard_name_from_csv(csv_path):
    """Return the CSV file's stem without a trailing ``_unfiltered`` / ``_parsed``."""
    name = Path(csv_path).stem
    # str.strip(chars) removes any of the given characters from both ends, not a
    # suffix, so the suffixes are removed explicitly.
    for suffix in ("_unfiltered", "_parsed"):
        if name.endswith(suffix):
            name = name[: -len(suffix)]
    return name.strip(".")


def _rebase_image_path(job, path):
    """Turn a ``save/...`` image path into one below the local ``./{job}/`` folder."""
    prefix = "save/"
    # Remove the literal prefix only; str.strip("save/") would also eat any leading
    # or trailing 's', 'a', 'v', 'e' and '/' characters of the path itself.
    if path.startswith(prefix):
        path = path[len(prefix):]
    return "./" + job + "/" + path


def gpu_worker(incomingqueue: JoinableQueue, outgoingqueue: list, counter: JoinableQueue, gpuflag: JoinableQueue, groupsize: int):
    """GPU loop: group downloaded jobs, run the CLIP filter, upload, report back.

    Runs forever. Every 10 seconds it checks ``incomingqueue``; once at least
    ``groupsize`` jobs are waiting it takes ``groupsize`` of them, merges their
    CSV files into one frame, de-duplicates on ``URL`` + ``TEXT``, runs
    ``filter`` on the result, uploads the output files with rsync and sends a
    ``(job, pairs)`` message to the answer queue of each originating worker.

    Args:
        incomingqueue: queue of ``(worker_index, job, upload_address)`` tuples.
        outgoingqueue: list of answer queues, indexed by worker index.
        counter: receives one item per job of a successfully uploaded group.
        gpuflag: holds one item while a group is being processed.
        groupsize: number of jobs merged into one GPU run.
    """
    print (f"[gpu] worker started")
    # watch for the incoming queue, when it is big enough we can trigger processing
    while True:
        print (f"[gpu] testing incoming queue size")
        if incomingqueue.qsize() < groupsize:
            time.sleep(10)
            continue

        gpuflag.put(1)
        shards = []
        addresses = []
        group_id = uuid.uuid4().hex
        print (f"[gpu] got new {groupsize} jobs to group in id {group_id}")
        for _ in range(groupsize):
            i, job, address = incomingqueue.get()

            all_csv_files = []
            for path, subdir, files in os.walk(job):
                all_csv_files.extend(glob(os.path.join(path, "*.csv")))
            # get name of csv file
            out_path = all_csv_files[0]
            shards.append((i, job, _shard_name_from_csv(out_path)))
            addresses.append(address)

            incomingqueue.task_done()
        print (f"[gpu] adjusted image paths")

        # Collect the per-job frames and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        frames = []
        for i, job, item in shards:
            dlparse_df = pd.read_csv(job + "/" + item + ".csv", sep="|")
            dlparse_df["PATH"] = dlparse_df["PATH"].map(lambda p, job=job: _rebase_image_path(job, p))
            frames.append(dlparse_df)
        group_parse = pd.concat(frames, ignore_index=True)

        with open("./save/" + group_id + ".txt", "wt") as f:
            for i, job, item in shards:
                f.write(item + "\n")

        print (f"[gpu] saving stats")

        group_parse.to_csv("./stats/" + group_id + "_groupduped.csv", index=False, sep="|") # I am using these to find out domains to filter from scraping
        group_parse.drop_duplicates(subset=["URL","TEXT"], keep='last', inplace=True)
        group_parse.reset_index(inplace=True, drop=True)

        group_parse.to_csv("./stats/" + group_id + "_groupdeduped.csv", index=False, sep="|") # I am using these to find out domains to filter from scraping

        print (f"[gpu] sending group to CLIP filter")
        start = time.time()
        final_images, results = filter(group_parse, group_id, "./save/")
        print(f"last filtered {final_images} images in {round(time.time()-start,2)} sec")

        print (f"[gpu] upload group results to rsync target")
        # find most required upload address among the grouped shards
        upload_address = mode(addresses)
        print (f"most requested upload address is {upload_address}")
        response = os.system(f"rsync -zh --remove-source-files save/*{group_id}* {upload_address}") # to do get target from client
        if response == 0:
            print (f"[gpu] sending all jobs to be marked as completed")
            for i, job, item in shards:
                # A shard with no surviving rows is absent from `results`; send 0
                # instead of None so the receiver can compare it against 0.
                outgoingqueue[i].put((job, int(results.get(job, 0))))
                counter.put(1)
        else:
            for i, job, item in shards:
                outgoingqueue[i].put((job, 0)) # if upload crashes, then do NOT mark completeJob()
        print (f"[gpu] cleaning up group folders")

        gpuflag.get()
        gpuflag.task_done()

# start all processes

In [None]:
# Process.start() returns None, so keep the Process object itself in `io`
# (needed for any later io.is_alive() / io.join() / io.terminate()).
io = Process(target=io_worker, args=[inbound, outbound, groupsize, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL], daemon=True)
io.start()

# Run the GPU loop in this process; it loops forever and does not return.
gpu_worker(inbound, outbound, counter, gpuflag, groupsize)