### Prepare data for training

In [None]:
import pandas as pd
import os.path as osp
from huggingface_hub import HfApi
import huggingface_hub as hf_hub
from datasets import load_dataset
import os

# Source locations: the annotation CSV and the folder holding the images.
annotations_location = r"z:/data/labels/labels.csv"
images_folder = r"z:/data/images"
# Root folder under which the per-split dataset folders are created.
dataset_folder = r"z:/data/dataset"
# Name of the metadata file written into each split folder.
# NOTE(review): the variable name is misspelled ("metada"); it is referenced
# by later cells, so renaming it has to be done everywhere at once.
metada_file = "metadata.jsonl"

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; presumably needed
# by the repository-creation and upload cells further down.
hf_hub.notebook_login()

In [None]:
# Load the annotations and discard the "image" column.
df_ann = pd.read_csv(annotations_location, encoding="utf-8")
df_ann = df_ann.drop(columns=["image"])

# Append the ".jpg" extension to every "File Name" entry (presumably the CSV
# stores the names without it -- confirm against the data).
df_ann["File Name"] = df_ann["File Name"].map(lambda stem: stem + ".jpg")
df_ann.head()

### Remove entries with empty captions

In [None]:
# File names of every annotation row whose caption is missing.
caption_missing = df_ann["image_title"].isna()
empty_image_titles = df_ann.loc[caption_missing, "File Name"].values

# Delete the corresponding image files from disk (when present) ...
for im_fn in empty_image_titles:
    image_path = osp.join(images_folder, im_fn)
    if not osp.exists(image_path):
        continue
    print(f"Removing {im_fn}")
    os.remove(image_path)

# ... and drop every annotation row that refers to one of those files.
has_caption = ~df_ann["File Name"].isin(empty_image_titles)
df_ann = df_ann[has_caption]

### Split the dataset into train, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split
from glob import glob
import shutil

# glob returns paths in arbitrary, filesystem-dependent order; sort them so
# the fixed random_state below produces the same split on every run.
all_files = sorted(glob(osp.join(images_folder, "*.jpg")))

# Hold out 15% of all images for testing, then 15% of the remainder for
# validation; what is left is the training set.
train_val_files, test_files = train_test_split(all_files, test_size=0.15, random_state=42)
train_files, val_files = train_test_split(train_val_files, test_size=0.15, random_state=42)


In [None]:
# One sub-folder of dataset_folder per split.
train_dir, val_dir, test_dir = (
    osp.join(dataset_folder, split_name)
    for split_name in ("train", "validation", "test")
)

# Path of the metadata file inside each split folder.
train_metadata, val_metadata, test_metadata = (
    osp.join(split_dir, metada_file)
    for split_dir in (train_dir, val_dir, test_dir)
)

def copy_files(files, dest_dir, del_existing=False):
    """Copy ``files`` into ``dest_dir``, creating the directory if needed.

    Args:
        files: Iterable of paths of the files to copy.
        dest_dir: Destination directory. Missing parent directories are
            created as well.
        del_existing: When true and ``dest_dir`` already exists, remove it
            together with its contents before copying.
    """
    if del_existing and osp.exists(dest_dir):
        shutil.rmtree(dest_dir)

    # makedirs (unlike mkdir) also creates missing parents, and exist_ok
    # tolerates a directory that is already there.
    os.makedirs(dest_dir, exist_ok=True)

    for f in files:
        shutil.copy(f, dest_dir)

In [None]:
# Populate each split folder from scratch with its share of the images.
for split_files, split_dir in (
    (train_files, train_dir),
    (val_files, val_dir),
    (test_files, test_dir),
):
    copy_files(split_files, split_dir, del_existing=True)

### Create metadata files matching image files to captions

In [None]:
import orjson

In [None]:
def create_metadata(filter_files, metadata_file, *, df=df_ann):
    """Write a JSON Lines metadata file with the captions of the given files.

    Each output line is one JSON object of the form
    ``{"file_name": <base name>, "text": <caption>}``.

    Args:
        filter_files: Paths of the image files to include; only their base
            names are compared against the "File Name" column.
        metadata_file: Path of the file to write (overwritten if present).
        df: DataFrame with "File Name" and "image_title" columns. The
            default is the ``df_ann`` object that exists when this function
            is defined, so re-run the definition after rebuilding ``df_ann``.
    """
    wanted_names = [osp.basename(f) for f in filter_files]
    df_filtered = df[df["File Name"].isin(wanted_names)]
    label_dict = df_filtered.set_index("File Name")["image_title"].to_dict()

    # orjson already emits UTF-8 bytes. OPT_APPEND_NEWLINE terminates every
    # record so the file is valid JSON Lines (one object per line) rather
    # than a single line of concatenated objects.
    records = [
        orjson.dumps({"file_name": fn, "text": label}, option=orjson.OPT_APPEND_NEWLINE)
        for fn, label in label_dict.items()
    ]

    with open(metadata_file, "wb") as f:
        f.writelines(records)


In [None]:
# Write one metadata file per split, next to the copied images.
for split_files, split_metadata in (
    (train_files, train_metadata),
    (test_files, test_metadata),
    (val_files, val_metadata),
):
    create_metadata(split_files, split_metadata)

### Upload to HuggingFace Hub

In [None]:
# Dataset repository on the Hub that receives the uploaded splits.
repo_name = "soul11zz/image-caption-desc-only"

api = HfApi()
# exist_ok=True keeps this cell re-runnable once the repository exists.
repo_url = api.create_repo(
    repo_name,
    repo_type="dataset",
    private=True,
    exist_ok=True,
)


In [None]:
def upload_to_hub(repo_name, dataset_folder, split):
    """Load one split from an image folder and push it to a Hub repository.

    Args:
        repo_name: Repository id handed to ``push_to_hub``.
        dataset_folder: Directory handed to ``load_dataset`` as ``data_dir``.
        split: Name of the split to load, e.g. ``"train"``.
    """
    load_dataset("imagefolder", data_dir=dataset_folder, split=split).push_to_hub(repo_name)

In [None]:
# Alternative dataset location; uncomment to override dataset_folder:
#dataset_folder = r"/Users/berno/Dropbox/zakhar/data/dataset"
# Push the training split.
upload_to_hub(repo_name, dataset_folder, "train")


In [None]:

# Push the two remaining splits, validation first.
for split_name in ("validation", "test"):
    upload_to_hub(repo_name, dataset_folder, split_name)