In [1]:
import os
import sys

# Resolve the parent of the current working directory and make sure it is on
# the import path, so packages living there can be imported by later cells.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project, so ".." is the project root — confirm.
PROJECT_ROOT = os.path.abspath(os.pardir)
if sys.path.count(PROJECT_ROOT) == 0:
    # Appended (not prepended): already-installed packages keep precedence.
    sys.path.append(PROJECT_ROOT)

print("Project root:", PROJECT_ROOT)

Project root: d:\SDSU\Data Analyst Study\travel-photo-analytics


In [2]:
import os
import pandas as pd
from tqdm import tqdm

from utils.exif_utils import extract_exif


In [3]:
PHOTO_DIR = "../data/raw_photos"
ALLOWED_EXT = (".jpg", ".jpeg", ".png")


def find_photos(photo_dir, allowed_ext=ALLOWED_EXT):
    """Recursively collect image file paths below ``photo_dir``.

    Parameters
    ----------
    photo_dir : str
        Directory to walk. A directory that does not exist yields an empty
        list, because ``os.walk`` ignores listing errors by default.
    allowed_ext : str or iterable of str
        File suffixes to keep. They are compared against the lower-cased
        file name, so they should be given in lower case.

    Returns
    -------
    list of str
        ``os.path.join(root, name)`` for every matching file, in a
        deterministic order: directories are visited top-down in sorted
        order and files are sorted within each directory.
    """
    # str.endswith only accepts a str or a tuple of str.
    suffixes = (allowed_ext,) if isinstance(allowed_ext, str) else tuple(allowed_ext)

    found = []
    for root, dirs, files in os.walk(photo_dir):
        # Sorting in place controls the order in which os.walk descends;
        # without it the traversal order is whatever the filesystem returns.
        dirs.sort()
        found.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith(suffixes)
        )
    return found


photos = find_photos(PHOTO_DIR)

len(photos), photos[:3]


(65,
 ['../data/raw_photos\\IMG-1699.jpg',
  '../data/raw_photos\\IMG-1759.jpg',
  '../data/raw_photos\\IMG-1796.jpg'])

In [4]:
# Run the EXIF extractor once per photo (tqdm only adds a progress bar) and
# assemble all returned records into a single DataFrame in one step.
records = [extract_exif(photo_path) for photo_path in tqdm(photos)]

df = pd.DataFrame(records)
df.head(), df.columns.tolist()


  0%|          | 0/65 [00:00<?, ?it/s]

100%|██████████| 65/65 [00:03<00:00, 16.32it/s]


(                        photo_path     file_name file_ext  \
 0  ../data/raw_photos\IMG-1699.jpg  IMG-1699.jpg     .jpg   
 1  ../data/raw_photos\IMG-1759.jpg  IMG-1759.jpg     .jpg   
 2  ../data/raw_photos\IMG-1796.jpg  IMG-1796.jpg     .jpg   
 3  ../data/raw_photos\IMG-1811.jpg  IMG-1811.jpg     .jpg   
 4  ../data/raw_photos\IMG-1861.jpg  IMG-1861.jpg     .jpg   
 
          timestamp_raw  timestamp_source     camera             gps_lat  \
 0  2018:03:28 13:34:01  DateTimeOriginal  iPhone 6s   [36, 36, 247/100]   
 1  2018:03:28 19:02:21  DateTimeOriginal  iPhone 6s  [34, 24, 1009/100]   
 2  2018:03:28 19:21:24  DateTimeOriginal  iPhone 6s      [34, 24, 25/2]   
 3  2018:03:28 22:04:44  DateTimeOriginal  iPhone 6s     [34, 0, 291/10]   
 4  2018:03:29 15:31:46  DateTimeOriginal  iPhone 6s     [34, 7, 301/25]   
 
   gps_lat_ref              gps_lon gps_lon_ref  exif_present status  \
 0           N  [121, 53, 1581/100]           W          True     ok   
 1           N     [119,

In [5]:
# Parse the raw timestamp strings using the "YYYY:MM:DD HH:MM:SS" layout.
# errors="coerce" turns anything that does not match into NaT instead of raising.
df["timestamp_parsed"] = pd.to_datetime(
    df["timestamp_raw"],
    format="%Y:%m:%d %H:%M:%S",
    errors="coerce",
)

# Per-row quality flags.
df["has_timestamp"] = df["timestamp_parsed"].notna()
# A row counts as geotagged only when both latitude and longitude are present.
df["has_gps"] = df["gps_lat"].notna() & df["gps_lon"].notna()
df["is_ok"] = df["status"].eq("ok")

# Chronological order, rows without a parsed timestamp last.
# Rebinding (rather than inplace=True) avoids mutating the frame behind other
# references, and a stable sort keeps photos that share a timestamp in their
# existing relative order.
df = df.sort_values("timestamp_parsed", na_position="last", kind="mergesort")

preview_cols = [
    "file_name", "timestamp_source", "timestamp_raw",
    "has_timestamp", "has_gps", "status",
]
df[preview_cols].head(10)


Unnamed: 0,file_name,timestamp_source,timestamp_raw,has_timestamp,has_gps,status
0,IMG-1699.jpg,DateTimeOriginal,2018:03:28 13:34:01,True,True,ok
1,IMG-1759.jpg,DateTimeOriginal,2018:03:28 19:02:21,True,True,ok
2,IMG-1796.jpg,DateTimeOriginal,2018:03:28 19:21:24,True,True,ok
3,IMG-1811.jpg,DateTimeOriginal,2018:03:28 22:04:44,True,True,ok
4,IMG-1861.jpg,DateTimeOriginal,2018:03:29 15:31:46,True,True,ok
5,IMG-1873.jpg,DateTimeOriginal,2018:03:29 15:51:07,True,True,ok
6,IMG-1890.jpg,DateTimeOriginal,2018:03:29 18:43:44,True,True,ok
7,IMG-1913.jpg,DateTimeOriginal,2018:03:29 18:46:59,True,True,ok
8,IMG-1915.jpg,DateTimeOriginal,2018:03:29 18:49:09,True,True,ok
9,IMG-1917.jpg,DateTimeOriginal,2018:03:29 18:49:46,True,True,ok


In [6]:
# Headline data-quality numbers for the extracted metadata.
# Note: the "*_pct" entries are column means, i.e. fractions in [0, 1]
# for boolean columns, not values scaled to 0-100.
ok_mask = df["is_ok"]
row_count = len(df)

summary = dict(
    total_photos=int(row_count),
    ok_rows=int(ok_mask.sum()),
    error_rows=int((~ok_mask).sum()),
    exif_present_pct=float(df["exif_present"].mean()),
    has_timestamp_pct=float(df["has_timestamp"].mean()),
    has_gps_pct=float(df["has_gps"].mean()),
)
summary


{'total_photos': 65,
 'ok_rows': 65,
 'error_rows': 0,
 'exif_present_pct': 1.0,
 'has_timestamp_pct': 1.0,
 'has_gps_pct': 0.18461538461538463}

In [10]:
# Preferred column layout, grouped as: file identity, timestamp fields,
# GPS fields, then camera/status fields.
ordered_cols = [
    # file identity
    "file_name", "file_ext", "photo_path",
    # timestamp
    "timestamp_raw", "timestamp_source", "timestamp_parsed", "has_timestamp",
    # GPS
    "gps_lat", "gps_lat_ref", "gps_lon", "gps_lon_ref", "has_gps",
    # camera / status
    "camera", "exif_present", "status", "error_message", "is_ok",
]

# Keep only the listed columns that actually exist, in the listed order.
# Columns not named in ordered_cols are dropped from df.
kept_cols = [name for name in ordered_cols if name in df.columns]
df = df[kept_cols]


In [7]:
OUTPUT_DIR = "../data/metadata"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists

csv_path = os.path.join(OUTPUT_DIR, "metadata.csv")
public_path = os.path.join(OUTPUT_DIR, "metadata_public.csv")

# Full export: every column of df, without the index.
df.to_csv(csv_path, index=False)

# Public export: same rows with the raw GPS columns removed.
# errors="ignore" skips any of these columns that df does not have.
gps_cols = ["gps_lat", "gps_lat_ref", "gps_lon", "gps_lon_ref"]
public_df = df.drop(columns=gps_cols, errors="ignore")
public_df.to_csv(public_path, index=False)

print("Saved:", os.path.abspath(csv_path))
print("Saved public:", os.path.abspath(public_path))
print("Rows:", len(df))


Saved: d:\SDSU\Data Analyst Study\travel-photo-analytics\data\metadata\metadata.csv
Saved public: d:\SDSU\Data Analyst Study\travel-photo-analytics\data\metadata\metadata_public.csv
Rows: 65


In [None]:
# Write the GPS-free version of the metadata to metadata_public.csv.
# NOTE(review): this appears to repeat the public export already done in the
# previous cell (same columns dropped, same target path) — consider keeping
# only one of the two.
gps_cols = ["gps_lat", "gps_lat_ref", "gps_lon", "gps_lon_ref"]
non_gps_mask = ~df.columns.isin(gps_cols)
public_df = df.loc[:, non_gps_mask]

public_path = os.path.join(OUTPUT_DIR, "metadata_public.csv")
public_df.to_csv(public_path, index=False)
print("Public CSV saved:", os.path.abspath(public_path))


Public CSV saved: d:\SDSU\Data Analyst Study\travel-photo-analytics\data\metadata\metadata_public.csv


In [9]:
# Show the current column names of df as a plain Python list.
list(df.columns)

['photo_path',
 'file_name',
 'file_ext',
 'timestamp_raw',
 'timestamp_source',
 'camera',
 'gps_lat',
 'gps_lat_ref',
 'gps_lon',
 'gps_lon_ref',
 'exif_present',
 'status',
 'error_message',
 'timestamp_parsed',
 'has_timestamp',
 'has_gps',
 'is_ok']