In [1]:
import contextlib
import tarfile
from http.client import HTTPSConnection
import shutil    
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# GeoPose3k
**Dataset of georeferenced mountain landscapes**  
[presentation](https://cphoto.fit.vutbr.cz/geoPose3K/)  
[paper](https://cphoto.fit.vutbr.cz/geoPose3K/data/geoPose3K_submission.pdf)  
Number of viewpoints: 3000  
Available data: camera location and orientation, depth map, images from DEM  
Coverage: Alps (use EPSG:2056)

In [2]:
datadir = '../../data/geopose3k/'

Stream data and filter useful files for next steps

In [25]:
# from https://stackoverflow.com/questions/51242749/download-and-extract-a-tar-file-in-python-in-chunks/68306109#68306109
def https_download_tar(host, path, item_visitor, port=443, headers=dict({}), compression='gz'):
    """Download and unpack tar file on-the-fly and call item_visitor for each entry.

        item_visitor will receive the arguments TarFile (the currently extracted stream) and the current TarInfo object
    """
    with contextlib.closing(HTTPSConnection(host=host, port=port)) as client:
        client.request('GET', path, headers=headers)
        with client.getresponse() as response:
            code = response.getcode()
            if code < 200 or code >= 300:
                raise Exception(f'HTTP error downloading tar: code: {code}')
            try:
                with tarfile.open(fileobj=response, mode=f'r|{compression}') as tar:
                    for tarinfo in tar:
                        item_visitor(tar, tarinfo)
            except Exception as e:
                raise Exception(f'Failed to extract tar stream: {e}')

# display files names and infos
def list_entry(tar, tarinfo):
    print(f'{tarinfo.name}\t{"DIR" if tarinfo.isdir() else "FILE"}\t{tarinfo.size}\t{tarinfo.mtime}')

# download only requested files
def filter_entry(tar, tarinfo):
    files_to_keep = ['info.txt', 'license', 'photo']
    if any([f in tarinfo.name for f in files_to_keep]):
        buffer = tar.extractfile(tarinfo)
        path = tarinfo.name.split('/')
        filedir = path[1]+'/'
        filename = path[-1]
        if not os.path.exists(datadir+filedir):
            os.mkdir(datadir+filedir)
        with open(datadir+filedir+filename, 'wb') as file:
            shutil.copyfileobj(buffer, file)

In [None]:
https_download_tar('merlin.fit.vutbr.cz', '/elevation/geoPose3K_final_publish.tar.gz', filter_entry)

Gather the viewpoints data

In [3]:
info_df = pd.DataFrame(columns=['latitude', 'longitude', 'elevation', 'yaw', 'pitch', 'roll', 'fov'])

# scan directories
for entry in os.scandir(datadir):
    filedir = entry.name + '/'
    # read info.txt lines
    with open(datadir+filedir+'info.txt', 'r') as info:
        lines = info.readlines()
    orientation = lines[1].split(' ')
    # parse data as dict
    row = {
        'latitude': lines[2],
        'longitude': lines[3],
        'elevation': lines[4],
        'yaw': orientation[0],
        'pitch': orientation[1],
        'roll': orientation[2],
        'fov': lines[5],
    }
    # convert str to float
    row = {k: float(v.strip()) for k,v in row.items()}
    # convert dict to df
    row_df =  pd.DataFrame.from_records(row, index=[entry.name])
    # append new row to info df
    info_df = pd.concat([info_df, row_df])

  info_df = pd.concat([info_df, row_df])


In [4]:
info_df

Unnamed: 0,latitude,longitude,elevation,yaw,pitch,roll,fov
28488116812_f5a57ca0f6_k,46.2173,10.16630,439.5,-0.272811,0.243122,1.481680e-02,0.549165
28561570606,46.3463,6.84551,1407.0,-0.026613,0.087538,-1.331180e-02,1.153100
eth_ch1_04032011388_01024,46.8600,9.83700,2246.5,-1.329290,-0.074533,2.597480e-02,0.984675
eth_ch1_04032011389_01024,46.8600,9.83700,2246.5,-2.009740,-0.032493,3.238110e-02,0.983756
eth_ch1_04032011390_01024,46.8600,9.83700,2246.5,-2.463600,-0.015115,3.083040e-02,0.983893
...,...,...,...,...,...,...,...
eth_ch1__dsc0074_01024,46.9120,7.45199,768.5,2.306340,0.031447,5.142460e-07,0.184094
finsteraarhorn01,46.5373,8.12605,4248.0,-2.441830,-0.065799,3.216440e-02,0.210615
finsteraarhorn02,46.5373,8.12605,4248.0,1.139340,-0.057906,-2.635630e-02,0.270635
finsteraarhorn03,46.5373,8.12605,4248.0,1.102720,-0.094248,-1.800390e-02,0.637194


In [6]:
# save csv
info_df.to_csv(datadir+'infos.csv', index_label='id')