In [None]:
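# NOTE
# Image embeddings have already been generated and saved to
# ../models/image_embeddings.pkl.
# This notebook is kept for reproducibility and to explain how they were produced.
# Do not re-run the embedding-extraction cell: it takes roughly 17 minutes and
# overwrites the saved file.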

In [2]:
import os

# Show the kernel's working directory. The relative paths used in this notebook
# ("../images", "../models/...") are resolved against it, so it is expected to
# be the `notebooks/` folder.
working_dir = os.getcwd()
print(working_dir)

C:\Users\Sameer Kumar\Documents\GitHub\Multimodal-Model-for-House-Price-Prediction\notebooks


In [1]:
# Sanity-check the on-disk layout of the image directory before extracting
# embeddings: how many entries it has and what one of them contains.
IMAGE_DIR = "../images"

# List the directory once and sort it. os.listdir() returns entries in
# arbitrary order, so without sorting the "sample" would not be reproducible,
# and listing twice could give a count and a sample that disagree.
# Note: the count includes every entry in IMAGE_DIR, not only sub-directories.
image_dir_entries = sorted(os.listdir(IMAGE_DIR))
print("Folders:", len(image_dir_entries))

if image_dir_entries:
    first = image_dir_entries[0]
    print("Sample Folder:", first)

    first_path = os.path.join(IMAGE_DIR, first)
    if os.path.isdir(first_path):
        # Only peek at a handful of names to keep the cell output short.
        print("Files inside:", sorted(os.listdir(first_path))[:5])
    else:
        # A stray file at the top level: report it instead of raising.
        print("Files inside: n/a (sample entry is not a directory)")
else:
    # Empty directory: keep `first` defined and report it rather than
    # failing with an IndexError.
    first = None
    print("Sample Folder: n/a (image directory is empty)")

Folders: 14599
Sample Folder: 1000102
Files inside: ['a874fd2b93d9e64768c8471e8ff5f94b.jpg']


In [2]:
import os, pickle, torch
from PIL import Image
from tqdm import tqdm
from torchvision import models, transforms

IMAGE_ROOT = "../images"
EMBEDDINGS_PATH = "../models/image_embeddings.pkl"


def _find_first_image(prop_path, extensions=(".jpg",)):
    """Return the path of the first image belonging to one property, or None.

    Two on-disk layouts are supported:

    * flat:   ``<prop_path>/<name>.jpg``
    * nested: ``<prop_path>/<subfolder>/<name>.jpg``

    Directory listings are sorted so the chosen image is the same on every
    run (``os.listdir`` gives no ordering guarantee).

    Args:
        prop_path: Directory holding the images of a single property.
        extensions: Lower-case file suffixes accepted as images.

    Returns:
        The path of the first matching file, or ``None`` when the property
        has no matching image in either layout.
    """
    entries = sorted(os.listdir(prop_path))

    # Flat layout: image files sit directly inside the property folder.
    for name in entries:
        candidate = os.path.join(prop_path, name)
        if name.lower().endswith(extensions) and os.path.isfile(candidate):
            return candidate

    # Nested layout: images sit one level down, inside a sub-folder.
    for name in entries:
        subfolder = os.path.join(prop_path, name)
        if not os.path.isdir(subfolder):
            continue
        for file_name in sorted(os.listdir(subfolder)):
            if file_name.lower().endswith(extensions):
                return os.path.join(subfolder, file_name)

    return None


# Load model: ImageNet-pretrained ResNet-50 used as a fixed feature extractor.
# Replacing the classification head with Identity makes the forward pass return
# the pooled features instead of class logits.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.fc = torch.nn.Identity()
model.eval()  # inference behaviour for batch-norm layers

# Preprocessing: fixed 224x224 resize, then normalisation with the ImageNet
# channel statistics that match the pretrained weights.
# NOTE(review): Resize((224, 224)) does not preserve aspect ratio; it is kept
# as-is so newly computed embeddings stay comparable with the saved ones.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

embeddings = {}        # prop_id (folder name) -> 1-D numpy feature vector
fail = 0               # number of images that raised during load/inference
failed_images = []     # (prop_id, image path, error) for every failure
missing_images = []    # prop_ids whose folder contained no usable image

# Sorted so the processing order (and the saved dict order) is reproducible.
property_folders = sorted(os.listdir(IMAGE_ROOT))

for prop_id in tqdm(property_folders, desc="Extracting embeddings"):
    prop_path = os.path.join(IMAGE_ROOT, prop_id)
    if not os.path.isdir(prop_path):
        continue

    img_path = _find_first_image(prop_path)
    if img_path is None:
        # Record the gap instead of dropping the property silently.
        missing_images.append(prop_id)
        continue

    try:
        # Context manager closes the file handle; without it thousands of
        # handles stay open until garbage collection.
        with Image.open(img_path) as raw_img:
            img = transform(raw_img.convert("RGB")).unsqueeze(0)

        with torch.no_grad():
            # Drop only the batch dimension added by unsqueeze(0).
            vec = model(img).squeeze(0).numpy()

        embeddings[prop_id] = vec

    except Exception as e:
        # Best-effort: one unreadable image must not abort a long run, but
        # keep enough detail to investigate afterwards.
        fail += 1
        failed_images.append((prop_id, img_path, repr(e)))

print("DONE!")
print("Total embeddings:", len(embeddings))
print("Failed images:", fail)
print("Properties without an image:", len(missing_images))

# Show a few failures so problems are visible without flooding the output.
for failed_id, failed_path, error in failed_images[:5]:
    print(f"  failed: {failed_id} ({failed_path}): {error}")

# Make sure the target directory exists so a long extraction run is not lost
# to a missing folder at the very last step.
os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)
with open(EMBEDDINGS_PATH, "wb") as f:
    pickle.dump(embeddings, f)

Extracting embeddings: 100%|█████████████████████████████████████████████████████| 14599/14599 [16:41<00:00, 14.58it/s]


DONE!
Total embeddings: 14599
Failed images: 0
