## Mix the datasets

In [8]:
import pandas as pd
import os.path as osp
from huggingface_hub import HfApi
import huggingface_hub as hf_hub
from datasets import load_dataset
import os
from tqdm.notebook import tqdm
from urllib.parse import urlparse
from glob import glob

# Data root, resolved to an absolute path relative to the current working directory.
root_dir = osp.abspath(osp.join("..", "..", "data"))

# Sub-folders of the data root: annotation CSVs and the image files they describe.
annotations_location, images_folder = (
    osp.join(root_dir, sub_dir) for sub_dir in ("labels", "images")
)

# Dataset folder and metadata file name (not used by the cells visible here).
# NOTE(review): `dataset_folder` is a machine-specific absolute path — confirm it
# is valid on the machine running this notebook.
dataset_folder = "z:/data/dataset"
metada_file = "metadata.jsonl"

In [25]:
# Collect every annotation CSV found in the labels folder.
# glob() returns matches in whatever order the filesystem yields them, so the
# position of a given file in this list is not guaranteed.
csv_pattern = osp.join(annotations_location, "*.csv")
ann_files = glob(csv_pattern)
ann_files

['/notebooks/data/labels/labels_sandl.csv',
 '/notebooks/data/labels/labels_dp.csv']

### Load Big Dataset Images

In [44]:
# Select the big-dataset annotation file by name instead of by position:
# glob() order depends on the filesystem, so a positional index into
# `ann_files` can silently load the wrong CSV on another machine or after a
# new CSV is added to the labels folder.
big_labels_path = osp.join(annotations_location, "labels_dp.csv")

df_ann = pd.read_csv(big_labels_path, encoding="utf-8")

# The "Image_Local" column stores names without an extension; append ".png"
# to turn them into file names.
df_ann["Image_Local"] += ".png"

# Normalise to the two columns shared with the other annotation source.
df_ann = df_ann.rename(columns={"Image_Local": "image", "Description": "description"})
df_ann = df_ann[["image", "description"]]


### Load S&L Images

In [45]:
# Select the S&L annotation file by name instead of by position: glob() order
# depends on the filesystem, so a positional index into `ann_files` can
# silently load the wrong CSV on another machine or after a new CSV is added.
sl_labels_path = osp.join(annotations_location, "labels_sandl.csv")

# Read the annotations and align the description column name with the other
# annotation source (chained rename instead of an in-place mutation).
df_ann_sl = (
    pd.read_csv(sl_labels_path, encoding="utf-8")
    .rename(columns={"site_description": "description"})
)

# Each "image" value is parsed as a URL; keep only the final path component,
# i.e. the image file name.
df_ann_sl["image"] = df_ann_sl["image"].apply(lambda x: osp.basename(urlparse(x).path))

# Keep only the two columns shared with the other annotation source.
df_ann_sl = df_ann_sl[["image", "description"]]
df_ann_sl.head()

Unnamed: 0,image,description
0,lasthouseonbedfordlane361359607318.jpg,Black spindle back chair dining set with stain...
1,lasthouseonbedfordlane829711129417.jpg,Corner beside drapes includes floating wood sh...
2,lasthouseonbedfordlane716070625097.jpg,White and black metal outdoor patio chairs are...
3,lasthouseonbedfordlane161631564633.jpg,This gray living room features a round stained...
4,lasthouseonbedfordlane830876955035.jpg,Round light wood accent table is flanked by bl...


### Join the dataframes and remove entries with empty descriptions (this also deletes their image files from disk)

In [46]:
# Stack both annotation frames into a single frame with a fresh 0..n-1 index.
# NOTE: this rebinds `df_ann`, so re-running this cell without re-running the
# loading cells appends `df_ann_sl` a second time.
df_ann = pd.concat([df_ann, df_ann_sl], ignore_index=True)

print(f"Before removing empty: {len(df_ann)}")

# Boolean mask of rows that have no description.
missing_description = df_ann["description"].isna()

# File names of the images without a description.
# - dropna(): a missing image name would make osp.join() raise TypeError
#   part-way through the deletion loop, leaving the folder half-cleaned.
# - unique(): each file only needs to be checked once.
empty = df_ann.loc[missing_description, "image"].dropna().unique()

# WARNING: destructive — deletes the undescribed images from `images_folder`.
for im_fn in empty:
    im_path = osp.join(images_folder, im_fn)
    if osp.exists(im_path):
        print(f"Removing {im_fn}")
        os.remove(im_path)

# Drop every row without a description, plus any other row that references an
# image removed above (its file no longer exists).
df_ann = df_ann[~(missing_description | df_ann["image"].isin(empty))]
print(f"After removing empty: {len(df_ann)}")

Before removing empty: 111763
Removing thegatewayhome311981081117.jpg
Removing rootedinlove.design895513567315.jpg
Removing stagerroz202980943618.jpg
Removing amywilsondesigns206438777234.jpg
Removing thewillowwindow870582023741.jpg
Removing houseonwren122513937102.jpg
Removing westcottonwoodlane715985889232.jpg
Removing ellisandhale587216792975.jpg
Removing thefontaineflat320622163311.jpg
Removing softandsouthern626491771670.jpg
Removing brasshouseinteriors677243133095.jpg
Removing truemanstreasures857187242591.jpg
Removing our1917farmhouse223357021536.jpg
Removing theangabode456786576592.jpg
Removing the.sycamore.farmhouse278007911678.jpg
Removing thejoyfilledfarmhouse557834028543.jpg
Removing shelley__bates248338008687.jpg
Removing thesamesstyle3186174895.jpg
Removing dawn.hoefler376222325591.jpg
Removing becshomestyle765753583511.jpg
Removing ourhomeoncolonial51709203637.jpg
Removing athomeontheboulevard627787745990.jpg
Removing athomeontheboulevard382757506293.jpg
Removing howell.