### Prepare data for training

In [50]:
import pandas as pd
import os.path as osp
from huggingface_hub import HfApi
import huggingface_hub as hf_hub
from datasets import load_dataset
import os

# Local inputs: the annotations CSV and the folder holding the source images.
annotations_location = r"z:/data/labels/labels.csv"
images_folder = r"z:/data/images"
# Root folder under which the per-split sub-folders are created.
dataset_folder = r"z:/data/dataset"
# File name of the metadata file written into each split folder.
# NOTE(review): the variable name is misspelled ("metada"); later cells reference it as-is.
metada_file = "metadata.jsonl"

In [2]:
# Interactive Hugging Face Hub login; renders the widget shown in the cell output.
hf_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [53]:
# Load the annotations, discard the "image" column and append the ".jpg"
# extension to every "File Name" value so it can be compared with the base
# names of the image files.
df_ann = (
    pd.read_csv(annotations_location, encoding="utf-8")
    .drop(columns=["image"])
    .assign(**{"File Name": lambda frame: frame["File Name"].apply(lambda stem: stem + ".jpg")})
)
df_ann.head()

Unnamed: 0,caption,text_comments,site_description,image_title,collections,File Name
0,Happy Tuesday!! \n\nLast week’s follower favor...,"['All the fall feels…so pretty 😍😍', '@2sweette...",Black spindle back chair dining set with stain...,Black Spindle Back Chair Dining Room,2051 spindle dining chair ideas,lasthouseonbedfordlane361359607318.jpg
1,Do you like floating shelves? \n\nThese were p...,"['Love! Beautifully styled shelves!', '@softan...",Corner beside drapes includes floating wood sh...,Wood Floating Shelves with White Accents,2275 floating shelves ideas,lasthouseonbedfordlane829711129417.jpg
2,Happy Monday! \n\nDid you do anything fun this...,"['Y’all looked beautiful ❤️', '@karinv1022 tha...",White and black metal outdoor patio chairs are...,White and Gray Braided Outdoor Patio Rug,2089 Farmhouse rug ideas for living room,lasthouseonbedfordlane716070625097.jpg
3,That was then - this is now! Our living room ...,"['Wow ❤️', '@samtweich ❤️❤️', 'I’ve never swap...",This gray living room features a round stained...,White and Gray Living Room Rug,2089 Farmhouse rug ideas for living room,lasthouseonbedfordlane161631564633.jpg
4,There’s no rule that says you need to spend a ...,"['So beautiful xx', '@wisteria.house_ thank yo...",Round light wood accent table is flanked by bl...,Living Room with Gray Vintage Area Rug,"2089 Farmhouse rug ideas for living room, 2149...",lasthouseonbedfordlane830876955035.jpg


### Split the images into train, validation and test sets

In [6]:
from sklearn.model_selection import train_test_split
from glob import glob
import shutil

# glob returns paths in arbitrary, filesystem-dependent order. Sorting first
# makes the seeded splits below reproducible across machines and re-runs.
all_files = sorted(glob(osp.join(images_folder, "*.jpg")))
# Hold out 15% of all images for test, then 15% of the remainder for validation.
train_val_files, test_files = train_test_split(all_files, test_size=0.15, random_state=42)
train_files, val_files = train_test_split(train_val_files, test_size=0.15, random_state=42)


In [67]:
# One sub-folder of dataset_folder per split.
train_dir, val_dir, test_dir = (
    osp.join(dataset_folder, split_name)
    for split_name in ("train", "validation", "test")
)

# Each split folder gets its own metadata file.
train_metadata, val_metadata, test_metadata = (
    osp.join(split_dir, metada_file)
    for split_dir in (train_dir, val_dir, test_dir)
)

def copy_files(files, dest_dir, del_existing=False):
  """Copy every file in `files` into the folder `dest_dir`.

  Args:
    files: Iterable of source file paths.
    dest_dir: Destination folder. It is created, together with any missing
      parent folders, when it does not exist yet.
    del_existing: When True and `dest_dir` already exists, the folder and
      everything in it is removed and recreated before copying.
  """
  # The original called the non-existent `os.rmtree` and never recreated the
  # folder afterwards; remove with shutil and fall through to makedirs.
  if del_existing and osp.isdir(dest_dir):
    shutil.rmtree(dest_dir)

  # makedirs (unlike mkdir) also works when the parent folder is missing.
  os.makedirs(dest_dir, exist_ok=True)

  for f in files:
    shutil.copy(f, dest_dir)

In [13]:
# Populate each split folder from scratch with its share of the images.
split_sources = [train_files, val_files, test_files]
split_targets = [train_dir, val_dir, test_dir]
for split_files, split_dir in zip(split_sources, split_targets):
  copy_files(split_files, split_dir, del_existing=True)

### Create metadata files that map each image file to its title text

In [19]:
import orjson

In [57]:
def create_metadata(filter_files, metadata_file, *, df = df_ann):
  """Write a JSON Lines metadata file for the given image files.

  Each line of the output is one JSON object of the form
  {"file_name": <base name>, "text": <image_title>}.

  Args:
    filter_files: Paths of image files; only their base names are matched
      against the "File Name" column of `df`.
    metadata_file: Path of the file to write. Opened in binary mode, so an
      existing file is overwritten.
    df: Frame providing the "File Name" and "image_title" columns. The
      default is bound to `df_ann` as it exists when this cell is executed.
  """
  base_names = [osp.basename(f) for f in filter_files]
  df_filtered = df[df["File Name"].isin(base_names)]
  label_dict = df_filtered.set_index('File Name')['image_title'].to_dict()

  # orjson.dumps already returns UTF-8 bytes, so no decode/encode round-trip
  # is needed. Each record must end with a newline: writelines() adds no
  # separators, and without them the file is one concatenated line rather
  # than valid JSON Lines.
  records = [
    orjson.dumps({"file_name": fn, "text": label}) + b"\n"
    for fn, label in label_dict.items()
  ]

  with open(metadata_file, "wb") as f:
    f.writelines(records)


In [58]:
# Write one metadata file per split: train, then test, then validation.
for split_files, split_metadata in (
    (train_files, train_metadata),
    (test_files, test_metadata),
    (val_files, val_metadata),
):
  create_metadata(split_files, split_metadata)

### Upload to HuggingFace Hub

In [59]:
# Target dataset repository on the Hub.
repo_name = "soul11zz/image-caption-desc-only"

api = HfApi()
# exist_ok=True is passed so an already existing repository is accepted.
repo_url = api.create_repo(
    repo_name,
    private=True,
    exist_ok=True,
    repo_type="dataset",
)


In [65]:
def upload_to_hub(repo_name, dataset_folder, split):
  """Load one split of the local image folder and push it to the Hub.

  Args:
    repo_name: Repository id passed to `push_to_hub`.
    dataset_folder: Folder passed to `load_dataset` as `data_dir`.
    split: Name of the split to load with the "imagefolder" builder.
  """
  load_dataset("imagefolder", data_dir=dataset_folder, split=split).push_to_hub(repo_name)

In [None]:
# Push the "train" split, loaded from the dataset_folder configured earlier in the notebook.
upload_to_hub(repo_name, dataset_folder, "train")


In [None]:

# Push the remaining splits, validation first and then test.
for split_name in ("validation", "test"):
  upload_to_hub(repo_name, dataset_folder, split_name)

In [43]:
# Sanity check: confirm the dataset folder is reachable from this kernel.
osp.exists(dataset_folder)

True