In [None]:
!pip install -U datasets huggingface-hub
!pip install -U accelerate

### Restart the notebook kernel before proceeding further, so that the upgraded packages are loaded

In [None]:
import os

from huggingface_hub import login, notebook_login

# Authenticate with the Hugging Face Hub without writing the token into the notebook:
# use the HF_TOKEN environment variable (e.g. a Kaggle secret exported to the
# environment) when it is set, otherwise fall back to the interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

In [None]:
import os
from PIL import Image
import pandas as pd
import collections
from tqdm import tqdm

import torch
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Run exactly one of the three sections below, then move on to the section **Loading CLIP:Vit-L/14**

# **1) Creating DataFrame Object for ProGAN Train Dataset**
* Add the kaggle dataset: **ai-vs-human-generated-images**

In [None]:
def fetch_image_names(path):
    """Return the names of the entries in directory `path`, sorted alphabetically.

    `os.listdir` returns entries in arbitrary order, so the listing is sorted to
    keep the row order of the resulting DataFrame reproducible between runs.

    Args:
        path: Directory to list.

    Returns:
        list[str]: Entry names (not full paths) in ascending order.

    Raises:
        OSError: Propagated from `os.listdir`, e.g. when `path` does not exist.
    """
    return sorted(os.listdir(path))

In [None]:
def append_df(df, path, images):
    """Append one row per image to `df` and return the extended frame.

    Args:
        df: DataFrame with "file_path" and "label" columns.
        path: Prefix (ending in '/') placed in front of every file path.
        images: Mapping from class folder name (e.g. '0_real', '1_fake') to the
            list of image file names found in that folder.

    Returns:
        A new DataFrame holding the rows of `df` followed by the new rows, with a
        fresh 0..n-1 index. `df` itself is returned when there is nothing to add.
    """
    # The label is the leading digit of the class folder name ('0_real' -> 0).
    rows = [
        [path + img_type + '/' + img_name, int(img_type[0])]
        for img_type, img_names in images.items()
        for img_name in img_names
    ]
    if not rows:
        # Nothing to append: avoid concatenating an empty frame onto `df`.
        return df

    # Build all new rows first and concatenate once instead of once per class.
    return pd.concat([df, pd.DataFrame(rows, columns=["file_path", "label"])], ignore_index=True)

In [None]:
# Root of the mounted Kaggle dataset. The "file_path" values stored in `df` are
# relative to this directory, and CustomDataset prepends `base_dir` when opening
# images, so it must be defined by whichever data section is run.
base_dir = "/kaggle/input/ai-vs-human-generated-images/"
class_dirs = ['0_real', '1_fake']

df = pd.DataFrame({
    "file_path": [],
    "label": []
}, dtype=int)

folders = sorted(os.listdir(base_dir))
for folder in folders:
    folder_path = base_dir + folder
    if not os.path.isdir(folder_path):
        # Ignore stray files sitting next to the category folders.
        continue

    if collections.Counter(os.listdir(folder_path)) == collections.Counter(class_dirs):
        # The folder directly contains the two class directories.
        prefixes = [folder + '/']
    else:
        # Otherwise the class directories sit one level deeper, in each sub-folder.
        prefixes = [
            folder + '/' + sub_folder + '/'
            for sub_folder in sorted(os.listdir(folder_path))
            if os.path.isdir(folder_path + '/' + sub_folder)
        ]

    for path in prefixes:
        images = {cls: fetch_image_names(base_dir + path + cls) for cls in class_dirs}
        df = append_df(df, path, images)

# **2) Creating DataFrame Object for GAN Test Dataset**
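* Add the Kaggle dataset: **GANs-dataset** (make sure it is mounted at `/kaggle/input/progan-fake-dataset`, which is the path the code below reads)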

In [None]:
def fetch_image_names(path):
    """Return the names of the entries in directory `path`, sorted alphabetically.

    `os.listdir` returns entries in arbitrary order, so the listing is sorted to
    keep the row order of the resulting DataFrame reproducible between runs.

    Args:
        path: Directory to list.

    Returns:
        list[str]: Entry names (not full paths) in ascending order.

    Raises:
        OSError: Propagated from `os.listdir`, e.g. when `path` does not exist.
    """
    return sorted(os.listdir(path))

In [None]:
def append_df(df, path, images):
    """Append one row per image to `df` and return the extended frame.

    Args:
        df: DataFrame with "file_path" and "label" columns.
        path: Prefix (ending in '/') placed in front of every file path.
        images: Mapping from class folder name (e.g. '0_real', '1_fake') to the
            list of image file names found in that folder.

    Returns:
        A new DataFrame holding the rows of `df` followed by the new rows, with a
        fresh 0..n-1 index. `df` itself is returned when there is nothing to add.
    """
    # The label is the leading digit of the class folder name ('0_real' -> 0).
    rows = [
        [path + img_type + '/' + img_name, int(img_type[0])]
        for img_type, img_names in images.items()
        for img_name in img_names
    ]
    if not rows:
        # Nothing to append: avoid concatenating an empty frame onto `df`.
        return df

    # Build all new rows first and concatenate once instead of once per class.
    return pd.concat([df, pd.DataFrame(rows, columns=["file_path", "label"])], ignore_index=True)

In [None]:
# Root of the mounted Kaggle dataset. The "file_path" values stored in `df` are
# relative to this directory, and CustomDataset prepends `base_dir` when opening
# images, so it must be defined by whichever data section is run.
base_dir = "/kaggle/input/progan-fake-dataset/"
class_dirs = ['0_real', '1_fake']

df = pd.DataFrame({
    "file_path": [],
    "label": []
}, dtype=int)

folders = sorted(os.listdir(base_dir))
for folder in folders:
    folder_path = base_dir + folder
    if not os.path.isdir(folder_path):
        # Ignore stray files sitting next to the category folders.
        continue

    if collections.Counter(os.listdir(folder_path)) == collections.Counter(class_dirs):
        # The folder directly contains the two class directories.
        prefixes = [folder + '/']
    else:
        # Otherwise the class directories sit one level deeper, in each sub-folder.
        prefixes = [
            folder + '/' + sub_folder + '/'
            for sub_folder in sorted(os.listdir(folder_path))
            if os.path.isdir(folder_path + '/' + sub_folder)
        ]

    for path in prefixes:
        images = {cls: fetch_image_names(base_dir + path + cls) for cls in class_dirs}
        df = append_df(df, path, images)

# **3) Creating DataFrame Object for Diffusion Test Dataset**
* Add the kaggle dataset: **Diffusion-datasets**

In [None]:
def fetch_image_names(path):
    """Return the names of the entries in directory `path`, sorted alphabetically.

    `os.listdir` returns entries in arbitrary order, so the listing is sorted to
    keep the row order of the resulting DataFrame reproducible between runs.

    Args:
        path: Directory to list.

    Returns:
        list[str]: Entry names (not full paths) in ascending order.

    Raises:
        OSError: Propagated from `os.listdir`, e.g. when `path` does not exist.
    """
    return sorted(os.listdir(path))

In [None]:
def append_df(df, path, images):
    """Append one row per image to `df` and return the extended frame.

    Args:
        df: DataFrame with "file_path" and "label" columns.
        path: Prefix (ending in '/') placed in front of every file path.
        images: Mapping from class folder name (e.g. '0_real', '1_fake') to the
            list of image file names found in that folder.

    Returns:
        A new DataFrame holding the rows of `df` followed by the new rows, with a
        fresh 0..n-1 index. `df` itself is returned when there is nothing to add.
    """
    # The label is the leading digit of the class folder name ('0_real' -> 0).
    rows = [
        [path + img_type + '/' + img_name, int(img_type[0])]
        for img_type, img_names in images.items()
        for img_name in img_names
    ]
    if not rows:
        # Nothing to append: avoid concatenating an empty frame onto `df`.
        return df

    # Build all new rows first and concatenate once instead of once per class.
    return pd.concat([df, pd.DataFrame(rows, columns=["file_path", "label"])], ignore_index=True)

In [None]:
# Root of the mounted Kaggle dataset. The "file_path" values stored in `df` are
# relative to this directory, and CustomDataset prepends `base_dir` when opening images.
base_dir = "/kaggle/input/diffusion-datasets/diffusion_datasets/"
class_dirs = ['0_real', '1_fake']

df = pd.DataFrame({
    "file_path": [],
    "label": []
}, dtype=int)

folders = sorted(os.listdir(base_dir))
for folder in folders:
    folder_path = base_dir + folder
    if not os.path.isdir(folder_path):
        # Ignore stray files sitting next to the source folders.
        continue

    # Each source folder is expected to hold '1_fake' or '0_real'; pick up whichever
    # class directories are actually present instead of assuming '0_real' exists
    # whenever the listing is not exactly ['1_fake'].
    present = set(os.listdir(folder_path))
    path = folder + '/'
    images = {
        cls: fetch_image_names(base_dir + path + cls)
        for cls in class_dirs
        if cls in present
    }
    if not images:
        raise FileNotFoundError(f"No '0_real' or '1_fake' directory found in {folder_path}")
    df = append_df(df, path, images)

# **Loading CLIP:Vit-L/14**

In [None]:
# Hub checkpoint id shared by the CLIP weights and their matching preprocessor.
clip_checkpoint = "openai/clip-vit-large-patch14"

processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# Load the weights, then move the model to the selected device.
model = CLIPModel.from_pretrained(clip_checkpoint)
model = model.to(device)

# **Creating Dataset and DataLoader**

In [None]:
convert_tensor = transforms.ToTensor()

class CustomDataset(torch.utils.data.Dataset):
    """Dataset that yields `(row position, image tensor)` pairs for a file-path table.

    Args:
        x: DataFrame with a 'file_path' column holding paths relative to `root`.
        root: Directory prefix prepended to every 'file_path'. Defaults to the
            notebook-level `base_dir` set by the data-loading section that was run.
    """

    def __init__(self, x, root=None):
        self.x = x
        self.root = base_dir if root is None else root

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Load the image at row position `idx` as a 3-channel 256x256 tensor.

        Returns:
            tuple: `(idx, image)`, where `image` is the output of `ToTensor` on the
            RGB-converted, resized image. `idx` is returned so the caller can map
            each result back to its DataFrame row.
        """
        # The context manager closes the underlying file once the pixels are read.
        with Image.open(self.root + self.x.iloc[idx]['file_path']) as img:
            # convert("RGB") normalises every mode (grayscale, palette, RGBA, CMYK, ...)
            # to exactly three channels; repeating channels only works for
            # single-channel images and yields 12 channels for RGBA input.
            image = img.convert("RGB").resize((256, 256))

        return idx, convert_tensor(image)

In [None]:
# Add an 'img_embed' column initialised to empty strings; the feature-generation
# loop later in the notebook fills it with one CLIP image embedding per row.
df['img_embed'] = ''

In [None]:
# Number of images embedded per forward pass.
batch_size = 16

# Shuffling stays off: batches are visited in DataFrame row order.
dataloader = torch.utils.data.DataLoader(
    CustomDataset(df),
    batch_size=batch_size,
    shuffle=False,
)

# **Generating Feature Space**

In [None]:
# One slot per DataFrame row; the dataset returns each row's position alongside its
# image, so embeddings are stored by position regardless of batch composition.
embeddings = [None] * len(df)

# Inference only: disabling autograd avoids building graphs and saves memory.
with torch.no_grad():
    for idx, data in tqdm(dataloader):
        # The dataset yields ToTensor output; scale it by 255 before handing it to
        # the processor, as the original pipeline does.
        images = data * 255
        inputs = processor(text='nothing', images=images, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        for row_pos, embed in zip(idx.tolist(), outputs['image_embeds'].tolist()):
            embeddings[row_pos] = embed

# Assign the whole column once. The chained form `df['img_embed'].iloc[idx] = ...`
# writes through an intermediate Series and is not guaranteed to update `df`.
df['img_embed'] = embeddings

# **Storing Generated Feature Space on the Hub**

In [None]:
from datasets import Dataset

# Convert the DataFrame (file paths, labels and image embeddings) into a
# Hugging Face `Dataset`.
dataset = Dataset.from_pandas(df)
# Bare expression: shows the dataset summary as the cell output.
dataset

In [None]:
# Upload the dataset to the Hugging Face Hub. Replace "<dataset_name>" with the
# target repository id and "<train/test>" with the split name before running.
dataset.push_to_hub("<dataset_name>", split="<train/test>")