# Abstract
We are going to reorganize our dataset's files to allow for easier importing and organizing when working with tensorflow. 

# Import and helper functions

In [13]:
import os
import pandas as pd
import zipfile
import shutil

In [14]:
def createDir(path):
    try:
        os.mkdir(path)
    except:
        print(path, 'directory already exists')

# Extract our compressed data

In [4]:
# This cell extracts the compressed files into a dedicated directory
# NOTE: Expected runtime for this cell 1-2 min

extractedDirectory = 'extractedData'
with zipfile.ZipFile('hirise-map-proj-v3_2.zip', 'r') as zippedData:
    zippedData.extractall(path=extractedDirectory)

# Read the data map files
As part of the extracted raw data, mapping data as `.txt` files are also provided which define what class each image belongs to as well as which subset (training/validation/testing) they belong to.

This section reads these files to create a data frame which provides context for images in the dataset.

In [5]:
# Define our raw data path and the main images folder
rawDataDir = '/'.join([extractedDirectory,'hirise-map-proj-v3_2'])
imagesDir = 'images'

print(rawDataDir)
print(imagesDir)

extractedData/hirise-map-proj-v3_2
images


In [16]:
classDecoder = pd.read_csv(
    '/'.join([rawDataDir,'landmarks_map-proj-v3_2_classmap.csv']),
    header= None,
    names = ['class','className']
)
classDecoder

Unnamed: 0,class,className
0,0,other
1,1,crater
2,2,dark dune
3,3,slope streak
4,4,bright dune
5,5,impact ejecta
6,6,swiss cheese
7,7,spider


This dataframe will act as a decoder by giving us the relation between the numerical label assigned to every image and the associated human readable text for it.

In [15]:
imageMap = pd.read_csv(
    '/'.join([rawDataDir,"labels-map-proj_v3_2_train_val_test.txt"]), 
    header = None,
    names = ['imageName','class','imageSet'],
    delimiter = ' '    
)
imageMap

Unnamed: 0,imageName,class,imageSet
0,ESP_013049_0950_RED-0067.jpg,7,train
1,ESP_013049_0950_RED-0067-fv.jpg,7,train
2,ESP_013049_0950_RED-0067-brt.jpg,7,train
3,ESP_013049_0950_RED-0067-r90.jpg,7,train
4,ESP_013049_0950_RED-0067-r180.jpg,7,train
...,...,...,...
67805,ESP_018707_2205_RED-0041.jpg,0,test
67806,ESP_018707_2205_RED-0062.jpg,0,test
67807,ESP_018707_2205_RED-0105.jpg,0,test
67808,ESP_018707_2205_RED-0058.jpg,0,test


This dataframe shows us the label assigned to each image (as a number). This dataframe has a few duplicate records. Accounting for this and referencing our `classDecoder`, we get the following dataframe

In [19]:
imageMap = imageMap.merge(classMap,on='class').drop_duplicates()
imageMap

Unnamed: 0,imageName,class,imageSet,className_x,className_y
0,ESP_013049_0950_RED-0067.jpg,7,train,spider,spider
1,ESP_013049_0950_RED-0067-fv.jpg,7,train,spider,spider
2,ESP_013049_0950_RED-0067-brt.jpg,7,train,spider,spider
3,ESP_013049_0950_RED-0067-r90.jpg,7,train,spider,spider
4,ESP_013049_0950_RED-0067-r180.jpg,7,train,spider,spider
...,...,...,...,...,...
64942,ESP_016613_2570_RED-0038.jpg,4,test,bright dune,bright dune
64943,ESP_016613_2570_RED-0006.jpg,4,test,bright dune,bright dune
64944,ESP_016613_2570_RED-0043.jpg,4,test,bright dune,bright dune
64945,ESP_016613_2570_RED-0044.jpg,4,test,bright dune,bright dune


This dataframe now shows, what label as well as which part of the data split each image is assigned to. Using this we can reorganize our image data accordingly.

We will now create the following folder structure:


In [8]:
# Create Folder structure for image data
for imageFolder in set(imageMap.imageSet):
    createDir('/'.join([imagesDir,imageFolder]))
    for classFolder in set(classMap.className):       
        createDir('/'.join([imagesDir,imageFolder,classFolder]))

In [11]:
# Move all extracted images into the appropriate folder location
# Expected runtime for this operation 6-10 min
for file in os.listdir('/'.join([rawDataDir,'map-proj-v3_2'])):
    
    setFolder = imageMap[imageMap.imageName == file].iloc[0]['imageSet']
    classFolder = imageMap[imageMap.imageName == file].iloc[0]['className'] 
    
    srcPath  = '/'.join([rawDataDir,'map-proj-v3_2',file])
    destPath = str('images/' +setFolder + "/" + classFolder + "/" + file)
    
    shutil.move(srcPath, 
                    destPath)
#     os.rename(srcPath,
#              destPath)