# Scanned Map Checker V2

This is for checking scans at the AGSL.

## Things to Check:

* Check for missing scans by checking the scan numbers
* Check for improper suffix
     * 002 but no 001 or other gaps in sets
     * s2 but no s1 and/or full image
     * b with no a
     * wrong number of digits in scan number
     * _d with no _c

Process:
1. Define a directory to search
1. Is search recursive?
1. When error is found
    * Print type of error
    * What is expected, what was found
    * Full path to error
1. Log errors to file so they don't get lost

TODO:

- [ ] Define types of errors
- [ ] Use regular expressions for pattern matching


In [None]:
import re
from pathlib import Path

In [87]:
# Search configuration.
# RECURSIVE: when True, listFiles() uses rglob (descends into subdirectories);
# when False it uses glob and only lists the top level of SEARCH_PATH.
RECURSIVE = False
# SEARCH_PATH: directory to search. Raw string so the Windows backslashes stay literal.
SEARCH_PATH = Path(r'S:\_R_GML_Archival_AGSL\Image_Archive\am - Maps')

# Echo the configured path so the cell output records which directory was searched.
print(SEARCH_PATH)

S:\_R_GML_Archival_AGSL\Image_Archive\am - Maps


In [88]:
# List files and dirs in a directory 
def listFiles(path, recursive=None):
    """Split the entries found under *path* into files and directories.

    Args:
        path: a pathlib.Path (or compatible object providing glob/rglob).
        recursive: True to descend into subdirectories (rglob), False to list
            only the top level (glob). When None (the default) the
            module-level RECURSIVE setting is used, which is what the
            original single-argument call did.

    Returns:
        A ``(files, dirs)`` tuple of lists of paths, in the order the
        glob yielded them. Entries that are neither a file nor a directory
        are left out of both lists.
    """
    if recursive is None:
        # Backward-compatible fallback to the notebook-wide setting.
        recursive = RECURSIVE

    entries = path.rglob('*') if recursive else path.glob('*')

    files = []
    dirs = []
    for entry in entries:
        if entry.is_file():
            files.append(entry)
        elif entry.is_dir():
            dirs.append(entry)

    return files, dirs

# Collect the files and directories found under SEARCH_PATH.
file_list, dir_list = listFiles(SEARCH_PATH)

In [93]:
# Report how many directories and files the listing produced.
for label, entries in (("Dirs found:", dir_list), ("Files Found:", file_list)):
    print(label, len(entries))
# for file in file_list: print(file)
# for dir in dir_list: print(dir)

Dirs found: 33
Files Found: 1


In [None]:
class re_string(dict):
    """Regular-expression patterns for scan file names and their suffixes.

    The patterns are plain class attributes and are read straight from the
    class (e.g. ``re_string.sheet``); no instance is required.
    """

    # If a name doesn't match this, it's not properly formed. Check for special case.
    #   Group 1 is prefix am or agsmap
    #   Group 2 is 6 digit scan number
    #   Group 3 is all suffixes
    #   Group 4 is the file extension if .tif or .jpg
    # Written as a raw string: the pattern text is unchanged, but it no longer
    # relies on Python passing unrecognised escapes through (which warns).
    agsmap_scan_number: str = r'(agsmap|am)(\d{6})(_?.*)(\.tif|\.jpg)'

    # Standardized suffixes
    sheet: str = r'_\d{3}' # Always 3 digits
    copy: str = r'_c\d+' # usually one digit, but could be more like _c2, _c3, ..., _c15
    side: str = r'_\w(?=_|\b)' # Positive lookahead assertion that the single char is followed by _ or \b (word boundary, includes '.' from .tif) (see courage, reduced, and badscan special case)
    stitch: str = r'_s\d+' # Always 's' followed by int

    # Warning Cases
    # The negative lookahead is spelled (?!...). The earlier form (?!=\d+) only
    # rejected a literal '=' followed by digits, so e.g. the 'c' in '_c2' was
    # reported as a "courage" hit even though it is a copy number.
    courage: str = r'(c(?!\d))' # 'c' only (This is *occasionally* used as a "third" side, e.g. _a, _b, _c, _d)
    reduced: str = r'(_d(?!\d))' # '_d' only (This indicates a "reduced size" but see above)
    badscan: str = r'(b(?!\d))' # 'b' only (This indicates a "bad scan" in 'am000000' system but see 2 above)
    legacy: str = r'(?<!_)[ad-z](?!\d)' # This will catch any "all suffixes" string that is one lowercase char other than 'b' or 'c' common in 'am000000' system.
    duplicate: str = r'(\(\d\))' # Likely a windows-generated filename e.g. `agsmap030406 (2).tif`, indicates improper image overwrite.
    sg: str = r'_sg' # Scanned with the SG scanner

In [None]:
class suffixSet(dict):
    """Breakdown of a file name's suffix text into the known suffix kinds.

    Attributes:
        suffixes: the suffix string exactly as passed in.
        sheet, copy, side, stitch, courage, reduced, badscan, legacy,
        duplicate, sg: for each pattern of the same name on ``re_string``,
            the text of its first match in ``suffixes``, or None when the
            pattern does not match.
    """

    def __init__(self, suffix_string):
        """Search *suffix_string* with every ``re_string`` suffix pattern.

        Args:
            suffix_string: non-empty str holding the suffix portion of a name.

        Raises:
            AssertionError: if *suffix_string* is not a str or is empty.
        """
        super().__init__()
        assert isinstance(suffix_string, str), "suffix_string must be a str"
        assert suffix_string != "", "suffix_string must not be empty"
        self.suffixes = suffix_string

        def suf(regex):
            # Search once and reuse the match instead of running the regex twice.
            match = re.search(regex, suffix_string)
            return match[0] if match is not None else None

        self.sheet = suf(re_string.sheet)
        self.copy = suf(re_string.copy)
        self.side = suf(re_string.side)
        self.stitch = suf(re_string.stitch)
        self.courage = suf(re_string.courage)
        self.reduced = suf(re_string.reduced)
        self.badscan = suf(re_string.badscan)
        self.legacy = suf(re_string.legacy)
        self.duplicate = suf(re_string.duplicate)
        self.sg = suf(re_string.sg)
    


In [None]:
class scanFile(dict):
    """Parsed view of a scan file name.

    Attributes:
        suffix_set: a ``suffixSet`` built from the suffix portion of the name
            (group 3 of ``re_string.agsmap_scan_number``), or None when that
            portion is empty.
    """

    def __init__(self, filename):
        """Parse *filename* against the scan-number pattern.

        Args:
            filename: the file name as a str.

        Raises:
            AssertionError: if *filename* is not a str.
            ValueError: if *filename* does not match the scan-number pattern.
        """
        super().__init__()
        assert isinstance(filename, str), "filename must be a str"

        result = re.search(re_string.agsmap_scan_number, filename)
        if result is None:
            # Previously this fell through to result[3] and failed with an
            # opaque "'NoneType' object is not subscriptable" TypeError.
            raise ValueError(
                f"{filename!r} does not match the scan-number pattern"
            )

        # Group 3 is None or '' when there is no suffix text; both mean "no suffixes".
        suffix_text = result[3]
        self.suffix_set = suffixSet(suffix_text) if suffix_text else None

In [94]:
def scan_number_search(file):
    '''Return file.name when it matches the scan number regex, otherwise None.

    Only the .name attribute of file is read.
    '''
    if re.search(re_string.agsmap_scan_number, file.name) is None:
        return None
    return file.name

In [95]:
# Print the suffix text of every file whose name matches the scan-number
# pattern; files that do not match are collected in rejects.
rejects = []
for file in file_list:
    if scan_number_search(file) is None:
        rejects.append(file)
        continue

    scan_object = scanFile(file.name)
    if scan_object.suffix_set is None:
        print(file.name, "-> None")
    else:
        print(file.name, "->", scan_object.suffix_set.suffixes)



In [96]:
# Show the files whose names did not match the scan-number pattern.
print(rejects)

[WindowsPath('S:/_R_GML_Archival_AGSL/Image_Archive/am - Maps/image archive am maps readme.txt')]
