In [None]:
import os
import random
import shutil

from matplotlib import pyplot as plt
from mpl_toolkits import axes_grid1
import numpy as np
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm

First, download the GSV-Cities dataset from https://www.kaggle.com/datasets/amaralibey/gsv-cities. The cells below expect its `Dataframes` and `Images` folders under `../GSVData`.

# Load in DataFrames

In [None]:
# Read every CSV in the dataset's "Dataframes" folder and stack them into one frame.
path_to_data = "../GSVData/Dataframes"

# Only pick up CSV files (stray entries such as checkpoints or OS metadata would
# make read_csv fail) and sort the names so the row order does not depend on the
# arbitrary order returned by os.listdir.
csv_names = sorted(
    name for name in os.listdir(path_to_data) if name.lower().endswith(".csv")
)

# Concatenate once at the end instead of re-copying the growing frame per file.
# Row labels are deliberately left as read, so they repeat across files.
csv_frames = [pd.read_csv(os.path.join(path_to_data, name)) for name in csv_names]
df = pd.concat(csv_frames) if csv_frames else pd.DataFrame()

Keep only the places (`city_id`, `place_id` pairs) that have at least 4 photos.

In [None]:
# Columns that together identify a place.
place_keys = ['city_id', 'place_id']

# For every row, the number of rows that share its (city_id, place_id) pair.
images_per_place = df.groupby(place_keys)['place_id'].transform('size')

# Rows belonging to places with at least four images.
gte_4 = df[images_per_place >= 4]

# One entry per remaining (city_id, place_id) pair, valued with its row count.
unique_ids = gte_4[place_keys].value_counts()

# Randomly split into train and test (75% of each place's images in train, 25% in test)

Don't run this cell if you want to replicate our dataset: it generates a different random split and overwrites `train_df.csv` and `test_df.csv`.

In [None]:
# Per-place random split: 75% of each place's images go to train, the rest to test.
SPLIT_SEED = 42        # fixed so that re-running the cell yields the same split
TRAIN_FRACTION = .75

# Give every row a unique label so the test set can be taken as the exact
# complement of the sampled train rows. (Comparing row *contents* instead would
# also discard rows that merely happen to be duplicated inside a place.)
gte_4_unique = gte_4.reset_index(drop=True)

# Sample within each (city_id, place_id) group in a single pass rather than
# re-filtering the whole frame and re-concatenating the result for every place.
train_df = (
    gte_4_unique
    .groupby(['city_id', 'place_id'], sort=False)
    .sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
)
test_df = gte_4_unique.drop(index=train_df.index)

# Every image must land in exactly one of the two sets.
assert len(train_df) + len(test_df) == len(gte_4_unique)

train_df.to_csv("train_df.csv")
test_df.to_csv("test_df.csv")

If you are using our provided dataframes instead, uncomment and run the following cell to load them.

In [None]:
# Uncomment to load a previously saved split (train_df.csv / test_df.csv) from
# disk instead of using the frames built in the split cell.
#train_df = pd.read_csv("train_df.csv")
#test_df = pd.read_csv("test_df.csv")

In [None]:
# Move every test image from ../GSVData/Images/<city> to ../GSVData/Test/<city>.
# Whatever remains under Images is the training set, so that folder is renamed
# to Train once the move is done.
data_root = "../GSVData"
images_dir = os.path.join(data_root, "Images")
test_dir = os.path.join(data_root, "Test")
train_dir = os.path.join(data_root, "Train")

not_moved = []  # source paths listed in test_df that were not found on disk

for idx, img in test_df.iterrows():
    # File name layout: city_placeid_year_month_northdeg_lat_lon_panoid.jpg
    path = img['city_id'] + '_' + str(img['place_id']).zfill(7) + '_' + str(img['year']) \
    + '_' + str(img['month']).zfill(2) + '_' + str(img['northdeg']).zfill(3) + '_' + str(img['lat']) + '_' \
    + str(img['lon']) + '_' + img['panoid'] + '.jpg'
    new_path = os.path.join(test_dir, img['city_id'], path)
    old_path = os.path.join(images_dir, img['city_id'], path)

    if os.path.exists(old_path):
        # shutil.move does not create missing parent folders.
        os.makedirs(os.path.dirname(new_path), exist_ok=True)
        shutil.move(old_path, new_path)
    else:
        not_moved.append(old_path)

# Rename Images -> Train inside the same data root. The guard keeps a second run
# of this cell from failing after the folder has already been renamed.
if os.path.isdir(images_dir) and not os.path.exists(train_dir):
    os.rename(images_dir, train_dir)

# Creating Custom Batches 

The purpose of the following code is to create custom batches in which all images from the same place id end up in the same batch. We want this in order to give our CNN the best possible chance of extracting relevant features for each place within a batch. As of now, the code does not completely work; we will revisit it later.

In [None]:
def create_pathnames(img):
    """Return the path of the training image described by the metadata row `img`.

    The file name is the underscore-joined sequence city_id, place_id
    (zero-padded to 7 characters), year, month (2), northdeg (3), lat, lon and
    panoid, followed by ".jpg". The file lives under ../GSVData/Train/<city_id>/.
    """
    city = img['city_id']
    name_parts = [
        city,
        str(img['place_id']).zfill(7),
        str(img['year']),
        str(img['month']).zfill(2),
        str(img['northdeg']).zfill(3),
        str(img['lat']),
        str(img['lon']),
        img['panoid'],
    ]
    filename = '_'.join(name_parts) + '.jpg'
    return os.path.join("../GSVData", "Train", city, filename)

In [None]:
# Attach to every training row the on-disk location of its image.
train_df['path'] = train_df.apply(create_pathnames, axis=1)

In [None]:
# Encode each city name as an integer class id for use as the training label.
# (`preprocessing` is imported from sklearn in the notebook's import cell.)
label_encoder = preprocessing.LabelEncoder()
train_df['encoded_city'] = label_encoder.fit_transform(train_df['city_id'])

In [None]:
# Shuffle the rows so that places are visited in random order when batching.
# The fixed seed makes the order reproducible on a fresh Restart & Run All.
train_df = train_df.sample(frac=1, random_state=42)

In [None]:
# Group the rows by place. sort=False keeps the groups in order of first
# appearance in train_df instead of sorting them by key.
train_grouped = train_df.groupby(['city_id', 'place_id'], sort=False)

In [None]:
# Greedily pack whole place groups into batches of at most `batch_size` images,
# so that all images of a place end up in the same batch.
batch_size = 256

total = 0        # number of images seen across all groups
batches = []     # per batch: list of image paths
labels = []      # per batch: list of encoded city labels (parallel to `batches`)
batchesn = []    # per batch: list of (city_id, place_id) group keys
batch_lens = []  # per batch: number of images

# Accumulators for the batch currently being filled.
batch, label_b, batchn = [], [], []
curr_batch = 0

for group_keys, group_data in train_grouped:
    group_len = group_data.shape[0]

    # Close the current batch when this group would overflow it. The `batch`
    # check avoids emitting an empty batch when a single group is larger than
    # `batch_size`; such a group becomes one oversized batch of its own.
    if batch and curr_batch + group_len > batch_size:
        batches.append(batch)
        labels.append(label_b)
        batchesn.append(batchn)
        batch_lens.append(curr_batch)
        batch, label_b, batchn = [], [], []
        curr_batch = 0

    batchn.append(group_keys)
    batch += list(group_data['path'])
    label_b += list(group_data['encoded_city'])
    curr_batch += group_len
    total += group_len

# Flush the last, partially filled batch into *all* parallel lists. Appending it
# to `batches` alone would leave `labels` one entry short, and zipping the two
# would then silently drop the final batch.
if batch:
    batches.append(batch)
    labels.append(label_b)
    batchesn.append(batchn)
    batch_lens.append(curr_batch)

In [None]:
def read_and_decode_image(image_path):
    """Load the JPEG at `image_path` and return it as a preprocessed image tensor.

    The file is read, decoded into 3 channels, and then passed through
    `preprocess_image`.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return preprocess_image(decoded)
    
def preprocess_image(image, target_size=(128, 96)):
    """Resize `image` and divide its pixel values by 255.

    Args:
        image: decoded image tensor (e.g. the output of `tf.image.decode_jpeg`).
        target_size: two-element size handed to `tf.image.resize`. It defaults
            to the previously hard-coded (128, 96), so existing calls are
            unaffected.

    Returns:
        The resized image divided by 255.0.
    """
    image = tf.image.resize(image, list(target_size))
    return image / 255.0

# Function to load a batch of images
def load_batch(batch_paths, labels):
    """Build a tf.data.Dataset of (decoded image, label) pairs for one batch.

    `batch_paths` and `labels` are sliced in parallel, and each path is replaced
    by the image returned from `read_and_decode_image`.
    """
    def _decode_pair(path, label):
        # Swap the file path for its decoded image; the label passes through.
        return read_and_decode_image(path), label

    path_label_pairs = tf.data.Dataset.from_tensor_slices((batch_paths, labels))
    # Parallel decoding is left disabled here:
    # num_parallel_calls=tf.data.experimental.AUTOTUNE
    return path_label_pairs.map(_decode_pair)

# List to hold one (images, labels) pair of stacked tensors per batch
batch_datasets = []

# Load each batch of images and stack it into a single pair of tensors
for batch_paths, batch_labels in tqdm(zip(batches, labels), total=min(len(batches), len(labels))):
    if not batch_paths:
        continue  # nothing to load; tf.stack cannot stack an empty list

    # Walk the dataset once. Iterating it separately for the images and for the
    # labels would read and decode every file twice.
    images, targets = [], []
    for image, label in load_batch(batch_paths, batch_labels):
        images.append(image)
        targets.append(label)

    batch_datasets.append((tf.stack(images, axis=0), tf.stack(targets, axis=0)))

In [None]:
def _as_dataset(batch):
    """Wrap an (images, labels) tensor pair in a single-element tf.data.Dataset."""
    if isinstance(batch, tf.data.Dataset):
        return batch
    return tf.data.Dataset.from_tensors(batch)


# The entries of `batch_datasets` are plain (images, labels) tuples, which have
# no `.concatenate`. Each one is wrapped in a dataset holding that batch as its
# only element, and the datasets are then chained in order.
combined_dataset = _as_dataset(batch_datasets[0])
for dataset in batch_datasets[1:]:
    combined_dataset = combined_dataset.concatenate(_as_dataset(dataset))

# Convert the combined dataset to a MapDataset (the mapping itself is the identity)
map_dataset = combined_dataset.map(lambda x, y: (x, y))

Because the custom batching is not yet functional, we will use TensorFlow's built-in data loader, `tf.keras.preprocessing.image_dataset_from_directory`.