## Mix the datasets

In [2]:
import pandas as pd
import os.path as osp
from huggingface_hub import HfApi
import huggingface_hub as hf_hub
from datasets import load_dataset
import os
from tqdm.notebook import tqdm
from urllib.parse import urlparse
from glob import glob

# All locations derive from root_dir, which is resolved against the current working directory.
root_dir = osp.abspath(r"../../data/")
annotations_location = osp.join(root_dir, "labels")  # folder with the annotation CSV files
images_folder = osp.join(root_dir, "images")  # folder with the source images
dataset_folder = osp.join(root_dir, "dataset")  # parent of the train/validation/test folders
# NOTE: the variable name is misspelled ("metada"); left unchanged because later cells use it.
metada_file = "metadata.jsonl"

In [3]:
# Collect every CSV in the labels folder.
# NOTE: glob() does not guarantee any particular ordering of its results.
ann_files = glob(osp.join(annotations_location, "*.csv"))
ann_files

['/notebooks/data/labels/labels_sandl.csv',
 '/notebooks/data/labels/labels_dp.csv']

### Load Big Dataset Images

In [4]:
# Address the big-dataset annotations by file name instead of ann_files[1]:
# glob() returns matches in arbitrary order, so a positional index may point
# at a different CSV on another machine or filesystem.
df_ann = pd.read_csv(osp.join(annotations_location, "labels_dp.csv"), encoding="utf-8")

# Image_Local is stored without an extension; append ".png" to get the file name.
df_ann["Image_Local"] += ".png"
# Normalise to the two columns shared with the other annotation source.
df_ann = df_ann.rename(columns={"Image_Local": "image", "Description": "description"})
df_ann = df_ann[["image", "description"]]


### Load S&L Images

In [5]:
# Address the S&L annotations by file name instead of ann_files[0]:
# glob() returns matches in arbitrary order, so a positional index may point
# at a different CSV on another machine or filesystem.
df_ann_sl = pd.read_csv(osp.join(annotations_location, "labels_sandl.csv"), encoding="utf-8")
# Each "image" value is parsed as a URL; keep only the last path component as the file name.
df_ann_sl["image"] = df_ann_sl["image"].apply(lambda x: osp.basename(urlparse(x).path))
# Rename by assignment rather than inplace=True so the step reads as a plain transformation.
df_ann_sl = df_ann_sl.rename(columns={"site_description": "description"})
df_ann_sl = df_ann_sl[["image", "description"]]
df_ann_sl.head()

Unnamed: 0,image,description
0,lasthouseonbedfordlane361359607318.jpg,Black spindle back chair dining set with stain...
1,lasthouseonbedfordlane829711129417.jpg,Corner beside drapes includes floating wood sh...
2,lasthouseonbedfordlane716070625097.jpg,White and black metal outdoor patio chairs are...
3,lasthouseonbedfordlane161631564633.jpg,This gray living room features a round stained...
4,lasthouseonbedfordlane830876955035.jpg,Round light wood accent table is flanked by bl...


### Join the dataframes and remove entries with empty descriptions

In [6]:
# Stack both annotation tables into one frame with a fresh index.
# NOTE: df_ann is rebound to the combined frame, so re-running this cell
# appends df_ann_sl a second time.
df_ann = pd.concat([df_ann, df_ann_sl], ignore_index=True)

print(f"Before removing empty: {len(df_ann)}")

# File names of all rows whose description is missing.
missing_mask = df_ann["description"].isna()
empty = df_ann.loc[missing_mask, "image"].values

# Delete the matching image files from disk when they are present.
for im_fn in empty:
    im_path = osp.join(images_folder, im_fn)
    if osp.exists(im_path):
        print(f"Removing {im_fn}")
        os.remove(im_path)

# Drop every row that refers to one of those image names.
keep_mask = ~df_ann["image"].isin(empty)
df_ann = df_ann[keep_mask]
print(f"After removing empty: {len(df_ann)}")

Before removing empty: 111763
After removing empty: 111677


### Split into training, validation, and test sets

In [7]:
from sklearn.model_selection import train_test_split
from glob import glob
import shutil

# glob() yields files in arbitrary, filesystem-dependent order. Sort them so
# that the seeded splits below select the same files on every run and machine.
all_files = sorted(
    glob(osp.join(images_folder, "*.jpg")) + glob(osp.join(images_folder, "*.png"))
)
# Hold out 3% of all files for test, then 3% of the remainder for validation.
train_val_files, test_files = train_test_split(all_files, test_size=0.03, random_state=42)
train_files, val_files = train_test_split(train_val_files, test_size=0.03, random_state=42)

In [9]:
# One sub-folder of the dataset root per split.
train_dir, val_dir, test_dir = (
    osp.join(dataset_folder, split_name)
    for split_name in ("train", "validation", "test")
)

# Path of the metadata file inside each split folder.
train_metadata, val_metadata, test_metadata = (
    osp.join(split_dir, metada_file)
    for split_dir in (train_dir, val_dir, test_dir)
)

def copy_files(files, dest_dir, del_existing=False):
  """Copy every path in ``files`` into the directory ``dest_dir``.

  ``dest_dir`` is created when it does not exist. When it already exists and
  ``del_existing`` is true, it is removed together with its contents and
  recreated empty before copying; otherwise its current contents are kept.
  """
  already_there = osp.exists(dest_dir)

  # Wipe a pre-existing destination only when explicitly requested.
  if already_there and del_existing:
    shutil.rmtree(dest_dir)

  # (Re)create the destination unless it is being reused as-is.
  if del_existing or not already_there:
    os.makedirs(dest_dir)

  for src in files:
    shutil.copy(src, dest_dir)

In [12]:
# Populate each split folder from scratch; del_existing=True wipes any previous contents.
copy_files(train_files, train_dir, del_existing=True)
copy_files(val_files, val_dir, del_existing=True)
copy_files(test_files, test_dir, del_existing=True)

In [49]:
def get_files(dir):
    """Return the entry names of ``dir`` after deleting its ``metadata.jsonl``, if present."""
    stale_metadata = osp.join(dir, "metadata.jsonl")
    # Remove the old metadata first so it does not show up in the listing.
    if osp.exists(stale_metadata):
        os.remove(stale_metadata)
    return os.listdir(dir)

# Re-read each split from disk (train, then test, then validation). From here on
# these names hold the entry names returned by os.listdir for each split folder.
train_files, test_files, val_files = (
    get_files(split_dir) for split_dir in (train_dir, test_dir, val_dir)
)


In [51]:
# Sanity check: number of entries listed for the test split.
len(test_files)

3420

### Create Metadata for matching files

In [53]:
import orjson

def create_metadata(filter_files, metadata_file, *, df = df_ann):
  """Write a JSON-lines metadata file for the images named in ``filter_files``.

  Every output line is a JSON object with the keys ``file_name`` and ``text``.

  Args:
    filter_files: File names or paths; only the base name of each is used to
      match against the ``image`` column of ``df``.
    metadata_file: Path of the file to write; it is opened in ``"wb"`` mode,
      so an existing file is overwritten.
    df: Frame with ``image`` and ``description`` columns. The default value is
      captured once, when this ``def`` statement runs; re-run this cell after
      rebuilding ``df_ann`` or pass ``df`` explicitly.
  """
  filter_files = [osp.basename(f) for f in filter_files]
  df_filtered = df[df["image"].isin(filter_files)]
  # Going through a dict collapses duplicate image names to a single entry.
  label_dict = df_filtered.set_index('image')['description'].to_dict()

  # orjson.dumps already returns UTF-8 bytes, so the newline is appended as
  # bytes directly instead of decoding to str and encoding back.
  metadata = [
    orjson.dumps({"file_name": fn, "text": str(label).strip()}) + b"\n"
    for fn, label in label_dict.items()
  ]

  # All lines are serialised before the file is opened, so a serialisation
  # error does not leave a truncated file behind.
  with open(metadata_file, "wb") as f:
    f.writelines(metadata)


In [54]:
# Write one metadata file per split, in the order train, test, validation.
for split_files, split_metadata in (
    (train_files, train_metadata),
    (test_files, test_metadata),
    (val_files, val_metadata),
):
  create_metadata(split_files, split_metadata)

In [23]:
# Sanity check: number of entries in the training split.
len(train_files)

107235

### Upload to HuggingFace

In [18]:
# Interactive Hugging Face login; renders a widget in the notebook (see the cell output).
hf_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [55]:
# Target dataset repository on the Hugging Face Hub.
repo_name = "soul11zz/image-description-large"
api = HfApi()
# exist_ok=True lets this cell be re-run when the repository already exists.
repo_url = api.create_repo(
    repo_name,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)


In [61]:
def upload_to_hub(repo_name, dataset_folder, split=None):
  """Load an ``imagefolder`` dataset from disk and push it to the Hub.

  Args:
    repo_name: Hub repository id to push to.
    dataset_folder: Local directory handed to the ``imagefolder`` loader as
      ``data_dir``.
    split: Optional split name. When omitted, everything the loader returns
      is pushed in a single call (the previous two-argument behaviour). When
      given, only that split is loaded and it is pushed under the same split
      name, which is how other cells of this notebook call the function.
  """
  if split is None:
    dataset = load_dataset("imagefolder", data_dir=dataset_folder)
    dataset.push_to_hub(repo_name)
  else:
    # With split=... load_dataset returns a single Dataset rather than a
    # DatasetDict, so the split name has to be passed on when pushing.
    dataset = load_dataset("imagefolder", data_dir=dataset_folder, split=split)
    dataset.push_to_hub(repo_name, split=split)

In [62]:

# Build the dataset from dataset_folder and push it to repo_name.
upload_to_hub(repo_name, dataset_folder)


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Using custom data configuration default-89d301f0cfff4141


Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-89d301f0cfff4141/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597...
                

Downloading data files #10:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

                

Downloading data files #13:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

                

Downloading data files #11:   0%|          | 0/3 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/3 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/3 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/3 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/4 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/3 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-89d301f0cfff4141/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Pushing split train to the Hub.


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# NOTE(review): these calls pass a split name as a third positional argument, and this
# cell's execution count (In [22]) is lower than that of the cells above it. Confirm
# that upload_to_hub accepts a split argument before re-running.
upload_to_hub(repo_name, dataset_folder, "validation")
upload_to_hub(repo_name, dataset_folder, "test")

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Using custom data configuration default-89d301f0cfff4141
Reusing dataset imagefolder (/root/.cache/huggingface/datasets/imagefolder/default-89d301f0cfff4141/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

Using custom data configuration default-89d301f0cfff4141
Reusing dataset imagefolder (/root/.cache/huggingface/datasets/imagefolder/default-89d301f0cfff4141/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
