In [None]:
!pip install -U datasets huggingface-hub
!pip install -U accelerate

### Restart the notebook kernel before proceeding, so the upgraded packages are picked up

In [1]:
# Authenticate with the Hugging Face Hub.
# notebook_login() prompts for the access token interactively and stores it locally,
# so the token never has to be pasted into the notebook source.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import os
from PIL import Image
import pandas as pd
import collections
from tqdm import tqdm

import torch
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms

# **Creating DataFrame Object for Storing All Images**

In [3]:
def fetch_image_names(path):
    """Return the names of the entries directly inside ``path``, sorted by name.

    ``os.listdir`` yields entries in arbitrary, filesystem-dependent order;
    sorting makes the row order of any frame built from these names
    reproducible across runs.

    Args:
        path: Directory to list.

    Returns:
        list[str]: Entry names (not full paths) in ascending order.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``path`` does not exist).
    """
    return sorted(os.listdir(path))

In [4]:
def append_df(df, path, images):
    """Append one row per image to ``df`` and return the extended frame.

    Args:
        df: Frame with ``file_path`` and ``label`` columns. It is not modified.
        path: Prefix for every file path. It is concatenated as-is, so it must
            already end with ``/``.
        images: Mapping of sub-directory name (e.g. ``'1_fake'``) to the list
            of image file names inside it. The first character of the
            sub-directory name is parsed as the integer label.

    Returns:
        pandas.DataFrame: ``df`` followed by the new rows, re-indexed from 0.
        When ``images`` contributes no rows, ``df`` itself is returned unchanged.

    Raises:
        ValueError: If a sub-directory name with images does not start with a digit.
    """
    # Collect the rows for every image type first so the frame is concatenated once.
    rows = [
        [path + img_type + '/' + img_name, int(img_type[0])]
        for img_type, img_names in images.items()
        for img_name in img_names
    ]
    if not rows:
        # Nothing to add; this also avoids concatenating an empty, untyped frame.
        return df

    new_rows = pd.DataFrame(rows, columns=["file_path", "label"])
    return pd.concat([df, new_rows], ignore_index=True)

In [5]:
# Index every image under base_dir as one row: (file path relative to base_dir, label).
# Each dataset folder is expected to contain a '0_real' and/or a '1_fake'
# sub-directory; the leading digit of the sub-directory name becomes the label.
df = pd.DataFrame({
    "file_path": pd.Series(dtype="object"),
    "label": pd.Series(dtype="int64"),
})

base_dir = "/kaggle/input/diffusion-datasets/diffusion_datasets/"
label_dirs = ('0_real', '1_fake')

# Sorted so the row order does not depend on the filesystem's listing order.
folders = sorted(os.listdir(base_dir))
for folder in folders:
    present = set(os.listdir(base_dir + folder))
    # Pick up every label sub-directory that exists, so a folder holding both
    # real and fake images contributes both.
    images = {
        label_dir: fetch_image_names(base_dir + folder + '/' + label_dir)
        for label_dir in label_dirs
        if label_dir in present
    }
    if not images:
        print(f"Skipping {folder}: no {' or '.join(label_dirs)} sub-directory found")
        continue

    path = folder + '/'
    df = append_df(df, path, images)

# **Generating Feature Space**

In [8]:
# Load the CLIP ViT-L/14 checkpoint: the processor prepares model inputs,
# and the model itself is moved to the GPU.
clip_checkpoint = "openai/clip-vit-large-patch14"
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint).to("cuda")

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [9]:
convert_tensor = transforms.ToTensor()

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads the images listed in a frame as 3-channel tensors.

    Args:
        x: pandas DataFrame with a ``file_path`` column holding paths relative
            to the module-level ``base_dir``.

    Each item is ``(idx, image)``: ``idx`` is the positional row index that was
    requested (so results can be written back to the right row), and ``image``
    is the output of ``convert_tensor`` for the 256x256 RGB image.
    """

    def __init__(self, x):
        self.x = x

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        relative_path = self.x.iloc[idx]['file_path']

        # The context manager closes the underlying file once the pixels are read.
        with Image.open(base_dir + relative_path) as raw_image:
            # convert("RGB") always yields exactly three channels. Repeating the
            # channel axis is only correct for 1-channel images: RGBA/CMYK files
            # would become 12 channels, and palette images would be encoded as
            # palette indices instead of colours.
            image = raw_image.convert("RGB").resize((256, 256))

        return idx, convert_tensor(image)

In [10]:
# Add the column that will hold each image's embedding; '' is only a placeholder
# until the extraction loop writes the real values.
df['img_embed'] = ''

In [11]:
# Batch the images in frame order (no shuffling), so the indices yielded by the
# dataset line up with row positions in df.
batch_size = 16

image_dataset = CustomDataset(df)
dataloader = torch.utils.data.DataLoader(
    image_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Compute a CLIP image embedding for every row of df, batch by batch.
# Results are collected by row position and written to the frame once at the end.
embeddings = [None] * len(df)

# Inference only: no_grad() stops autograd from recording the forward pass, which
# would otherwise keep activations alive for a backward pass that never happens.
with torch.no_grad():
    for idx, data in tqdm(dataloader):
        # Scale the dataset's tensors by 255 before handing them to the processor
        # (presumably because the processor rescales by 1/255 itself — confirm
        # against the checkpoint's preprocessor_config.json).
        images = data*255
        # A dummy caption is supplied alongside the images; only the image
        # embeddings of the output are used.
        inputs = processor(text='nothing', images=images, return_tensors="pt", padding=True).to("cuda")
        outputs = model(**inputs)
        batch_embeds = outputs['image_embeds'].tolist()

        for row_pos, embed in zip(idx.tolist(), batch_embeds):
            embeddings[row_pos] = embed

# Assign the whole column in one step instead of the chained
# df['img_embed'].iloc[...] = ... form, which writes through an intermediate
# Series (pandas warns about it, and it does not update df under copy-on-write).
df['img_embed'] = pd.Series(embeddings, index=df.index, dtype=object)

In [14]:
from datasets import Dataset

# Convert the frame (file_path, label, img_embed) into a Hugging Face Dataset
# and display its summary.
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['file_path', 'label', 'img_embed'],
    num_rows: 10000
})

In [15]:
# Upload the dataset to the Hub repository below as its "test" split.
dataset.push_to_hub("rajendrabaskota/diffusion-test-dataset", split="test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rajendrabaskota/diffusion-test-dataset/commit/6353fa65e037fd3aa33039821d7fa4ae602e3bad', commit_message='Upload dataset', commit_description='', oid='6353fa65e037fd3aa33039821d7fa4ae602e3bad', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from datasets import Dataset, load_dataset, concatenate_datasets

In [None]:
# Download every split of the GAN test dataset from the Hub and display the result.
test_dataset_all = load_dataset("rajendrabaskota/gans-test-dataset")
test_dataset_all

In [None]:
# Merge every split of the downloaded dataset into one Dataset, in key order.
splits = list(test_dataset_all)
per_split = [test_dataset_all[split_name] for split_name in splits]
test_dataset = concatenate_datasets(per_split)

In [None]:
test_dataset

In [None]:
# Upload the merged dataset as the "test" split of a different repository
# ("gan-test-dataset") from the one it was loaded from ("gans-test-dataset").
test_dataset.push_to_hub("rajendrabaskota/gan-test-dataset", split="test")

# **Storing and Pushing to Hub**

In [None]:
!mkdir mydata

In [None]:
# Take rows 60000-69999 (by position) for export.
# NOTE(review): the Dataset built from df earlier in this notebook reports 10000 rows;
# with that frame this slice is empty — confirm which df this cell was run against.
df1 = df.iloc[60000:70000]

In [None]:
# Change the working directory; the relative paths written by the following cells land in mydata/.
%cd mydata

In [None]:
# Write the slice to data.csv in the current working directory, without the index column.
df1.to_csv("data.csv", index=False)

In [None]:
# Also store the slice as Parquet in the current working directory.
# The file name encodes the row range taken above; rename it if a different slice is exported.
df1.to_parquet('dataset-gans-60000-70000.parquet')  # If your file name isn't dataset.parquet replace it accordingly

In [None]:
# Stage, commit and push everything in the current working directory.
# NOTE(review): no `git clone` / `git init` is visible in this notebook — this
# presumably expects the working directory to already be a cloned repo; confirm.
!git add .
!git commit -m "added test data with image embeddings"
!git push

In [None]:
# Configure the local repository at this path for large-file uploads.
# NOTE(review): this path is not created by any visible cell — confirm it exists.
!huggingface-cli lfs-enable-largefiles /kaggle/working/gans-test-dataset-2

In [None]:
# Initialise Kaggle dataset metadata in mydata/ (presumably this generates the
# dataset-metadata.json that a later cell reads and edits — confirm).
!kaggle datasets init -p /kaggle/working/mydata

In [None]:
# Switch to the Kaggle config directory. Relative paths in later cells resolve
# against this directory until the next %cd.
%cd /root/.kaggle

In [None]:
import getpass
import json
import os

# Never store Kaggle credentials in the notebook source. Read them from the
# environment (KAGGLE_USERNAME / KAGGLE_KEY) and fall back to an interactive
# prompt; getpass keeps the key out of the cell output.
# NOTE: an API key that was hard-coded in a shared notebook must be treated as
# leaked and revoked from the Kaggle account settings.
kaggle_username = os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: ")
kaggle_key = os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: ")

dictionary = {"username": kaggle_username, "key": kaggle_key}

# Write the credentials file read by the kaggle CLI.
kaggle_config_path = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(kaggle_config_path), exist_ok=True)
with open(kaggle_config_path, "w") as outfile:
    json.dump(dictionary, outfile, indent=4)

# Restrict the file to its owner so other users on the machine cannot read the key.
os.chmod(kaggle_config_path, 0o600)

In [None]:
import json

# Load the dataset metadata file into memory, then show its content and type.
metadata_path = '/kaggle/working/mydata/dataset-metadata.json'
with open(metadata_path, 'r') as metadata_file:
    json_object = json.loads(metadata_file.read())

print(json_object)
print(type(json_object))

In [None]:
# Set the title and the "<owner>/<slug>" id of the dataset in the loaded metadata.
json_object.update({
    'title': "gans-test-dataset-last",
    'id': "pistukispo/gans-test-dataset-last",
})

In [None]:
json_object

In [None]:
# Write the edited metadata back to disk.
# json.dump serialises straight into the file, so json_object stays a dict and the
# cell that sets its 'title' / 'id' can still be re-run afterwards.
with open("/kaggle/working/mydata/dataset-metadata.json", "w") as outfile:
    json.dump(json_object, outfile, indent=4)

In [None]:
# Create a new Kaggle dataset from the contents of mydata/.
!kaggle datasets create -p /kaggle/working/mydata

In [None]:
# Small 3x3 scratch frame (columns a, b, c) written to temp.csv without the index.
# NOTE(review): 'temp.csv' is relative to the working directory, and the last %cd
# visible before this cell is /root/.kaggle — confirm that is where it should go.
temp = pd.DataFrame(
    [[1, 4, 7], [2, 5, 8], [3, 6, 9]],
    columns=['a', 'b', 'c'],
)
temp.to_csv('temp.csv', index=False)

In [None]:
# Upload the current contents of mydata/ as a new version of the dataset.
# The -m text is still the placeholder message; replace it with a real description.
!kaggle datasets version -p /kaggle/working/mydata -m "Your message here"

In [None]:
# Return to the Kaggle working directory.
%cd /kaggle/working

In [None]:
# First 50000 rows of df (all rows if df is shorter); rebinds df1.
df1 = df[:50000]

In [None]:
!mkdir data

In [None]:
# Change into data/ (relative to the current working directory).
%cd data

In [None]:
# Write the slice to data-1.csv in the current working directory, without the index column.
df1.to_csv("data-1.csv", index=False)

In [None]:
from torchvision.models import resnet50, ResNet50_Weights
from torchvision import transforms

# Scratch check: run one image through an ImageNet-pretrained ResNet-50 on the GPU.
resnet_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2).to("cuda")
resnet_model.eval()
assert not resnet_model.training  # verify the model really is in eval mode

# optional: freeze the weights so no gradients are tracked for them
for param in resnet_model.parameters():
    param.requires_grad = False

convert_tensor = transforms.ToTensor()
# convert("RGB"): a PNG may carry an alpha channel or a palette, while the
# network's first convolution expects exactly three input channels.
with Image.open("/kaggle/input/few-ai-real-images/dalle.png") as raw_image:
    image = convert_tensor(raw_image.convert("RGB").resize((224, 224)))

# NOTE(review): no mean/std normalisation is applied to the input.
# ResNet50_Weights.IMAGENET1K_V2.transforms() provides the preprocessing that
# belongs to these weights — confirm whether un-normalised input is intended.
with torch.no_grad():
    # The input must be on the same device as the model. unsqueeze adds the
    # batch dimension (not necessary when images are sent in batches).
    resnet_logits = resnet_model(image.unsqueeze(dim=0).to("cuda"))

resnet_logits.shape