In [1]:
!pip install pandas numpy tqdm



In [2]:
import os
import glob
import shutil
import re

import numpy as np
from tqdm import tqdm
import pandas as pd

In [7]:
FILENAMES = glob.glob("/project_antwerp/data/preprocessing/images/*.jpg")

In [8]:
FILENAMES[:2]

['/project_antwerp/data/preprocessing/images/4133_1933.jpg',
 '/project_antwerp/data/preprocessing/images/1175_1941.jpg']

In [35]:
PATTERN = r"/project_antwerp/data/preprocessing/images/\d+_(?P<YEAR>\d\d\d\d)"
OUTPUT_DIR = "/project_antwerp/data/ShorpyText"
MOVE = False
SEED = 42
TRAIN_SET = 0.8

In [36]:
FILENAMES = list(FILENAMES)

In [37]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [38]:
def to_5_years(year: int) -> str:
    if year is None:
        return "None"
    century, decade, unit = str(year)[:2], str(year)[2], str(year)[3]
    if int(unit) < 5:
        return f"{century}{decade}0-4"
    return f"{century}{decade}5-9"

In [39]:
def resolve_year(filename):
    try:
        match = re.match(PATTERN, filename)
        return int(match.group("YEAR"))
    except:
        return None

In [40]:
def resolve_id(filename):
    try:
        id_pattern = r"/project_antwerp/notebooks/dataset/images/(?P<ID>\d+)_\d+.jpg"
        match = re.match(id_pattern, filename)
        return int(match.group("ID"))
    except:
        return None

In [41]:
years = [resolve_year(f) for f in FILENAMES]

In [42]:
years[:10]

[1933, 1941, 1924, 1941, 1932, 1913, 1900, 1953, 1928, 1936]

In [43]:
labels = [to_5_years(y) for y in years]

In [44]:
labels[:10]

['1930-4',
 '1940-4',
 '1920-4',
 '1940-4',
 '1930-4',
 '1910-4',
 '1900-4',
 '1950-4',
 '1925-9',
 '1935-9']

In [61]:
new_filenames = []

for filename, year, label in zip(FILENAMES, years, labels):
    basename = os.path.basename(filename)
    new_filename = os.path.join(label, basename)
    new_filenames.append(new_filename)

In [55]:
df = pd.read_csv(os.path.join("info.csv"))

In [56]:
df["set"] = None
df["set_filename"] = None
df["label"] = None

In [54]:
df = pd.DataFrame({
    "filename": None,
    "year": years,
    "label": labels,
    "set": None,
    "index": idxs
})

In [57]:
msk = np.random.rand(len(df)) < TRAIN_SET
df.loc[msk, "set"] = "train"
df.loc[~msk, "set"] = "test"

In [58]:
msk = df["label"] == 'None'
df.loc[msk, "set"] = "excluded"

In [59]:
msk = df["year"] < 1850
df.loc[msk, "set"] = "excluded"

In [62]:
df["filename"] = [os.path.join(set_, f) for set_, f in zip(df["set"], new_filenames)]

In [63]:
df = df[df["label"] != None]

In [57]:
df.set_index('index', inplace=True)

In [59]:
df.join(text_df, on="index", rsuffix="text")

Unnamed: 0_level_0,filename,year,label,set,yeartext,src,text,downloaded
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12358,train/1910-4/12358_1910.jpg,1910.0,1910-4,train,1914.0,https://www.shorpy.com/files/images/04783a.pre...,"Washington, D.C., circa 1914. ""Three-horse tea...",True
4133,train/1930-4/4133_1933.jpg,1933.0,1930-4,train,1901.0,https://www.shorpy.com/files/images/SHORPY-4a0...,"Circa 1901. ""Gratiot Light, Port Huron, Michig...",True
10353,test/1860-4/10353_1862.jpg,1862.0,1860-4,test,1928.0,https://www.shorpy.com/files/images/05923u.pre...,"Fredericksburg, Virginia, circa 1928. ""Willis ...",True
14384,train/1935-9/14384_1938.jpg,1938.0,1935-9,train,1943.0,https://www.shorpy.com/files/images/1a35433u.p...,"Extracted sulfur stacked in a ""vat"" 60 feet ta...",True
1175,train/1940-4/1175_1941.jpg,1941.0,1940-4,train,1939.0,https://www.shorpy.com/files/images/SHORPY-8b2...,"October 1939. ""Insignia of nationally affiliat...",True
...,...,...,...,...,...,...,...,...
12000,train/1910-4/12000_1910.jpg,1910.0,1910-4,train,1917.0,https://www.shorpy.com/files/images/08250a.pre...,"1917. ""Mrs. Van H. Manning. U.S. Bureau of Min...",True
6101,train/1955-9/6101_1959.jpg,1959.0,1955-9,train,1910.0,https://www.shorpy.com/files/images/SHORPY_4a2...,"Mobile, Alabama, circa 1910. ""A pretty bit of ...",True
5418,train/1915-9/5418_1919.jpg,1919.0,1915-9,train,1936.0,https://www.shorpy.com/files/images/SHORPY-200...,"Washington, D.C., circa 1936. ""Hard, William.""...",True
3795,train/1900-4/3795_1904.jpg,1904.0,1900-4,train,1903.0,https://www.shorpy.com/files/images/SHORPY-4a1...,"""Loading steamer Chalmette during high water, ...",True


In [65]:
del df["set_filename"]

In [66]:
df.to_csv(os.path.join(OUTPUT_DIR, "info.csv"), index=False)

In [67]:
for old_filename, new_filename, in tqdm(zip(FILENAMES, df["filename"]), total=len(new_filenames)):
    new_filename = os.path.join(OUTPUT_DIR, new_filename)
    if os.path.exists(new_filename):
        continue
    os.makedirs(os.path.dirname(new_filename), exist_ok=True)
    if MOVE:
        shutil.move(old_filename, new_filename)
    else:
        shutil.copy(old_filename, new_filename)

100%|██████████| 15268/15268 [00:39<00:00, 389.64it/s]


In [68]:
df.shape

(15268, 6)

In [79]:
df.drop_duplicates().shape

(10744, 4)