# Relevant documentation:
1. https://wiki.duraspace.org/display/FEDORA471/Fixity+Checking (v4.7.1 - GWSS-ETD)
2. https://wiki.duraspace.org/display/FEDORA46/Fixity+Checking (v4.6.0 - GWSS - unclear if this would work with GWSS)
3. https://wiki.duraspace.org/display/FEDORA4x/RESTful+HTTP+API 

In [None]:
import pysolr
import requests
import re
from requests.auth import HTTPBasicAuth

# Get list of FileSet IDs from Solr
Solr only indexes Fedora FileSets; it does not index the Files associated with these FileSets.

The FileSet records in Solr do include some metadata from the Files associated with the FileSets (like checksum); unfortunately, they do not include the File ID.

Solr can return the IDs of all FileSets, which we can take to Fedora to ask for the IDs of all the Files.

In [None]:
# Solr core to query, taken from the URL below (host gwfedora-test2, core "scholarspace").
solr_core_url = 'http://gwfedora-test2.wrlc.org:8983/solr/scholarspace'
solr = pysolr.Solr(solr_core_url)

In [None]:
# Restrict the search to records whose model is FileSet.
fileset_query = 'has_model_ssim:"FileSet"'
# The row cap is kept at 3 while testing. For a production run, raise it to a
# very large number (e.g. 99999999) so that every FileSet is returned.
max_rows = 3
results = solr.search(fileset_query, rows=max_rows)
# Show how many FileSet records came back.
len(results)

# Pass the FileSet IDs to Fedora and make a list of corresponding File IDs
Excluding files that have type:ExtractedText

In [None]:
# Ask Fedora for JSON-LD so every response can be parsed with .json().
headers = {'Accept': 'application/ld+json'}
# Credentials are built once instead of being repeated on every request.
# NOTE(review): the password here is a placeholder; supply the real one from
# the environment or a config file rather than committing it.
fedora_auth = HTTPBasicAuth('fedoraAdmin', 'dummypassword')
# REST path under which the FileSet resources are addressed.
fedora_base = 'http://gwfedora-test2.wrlc.org:8080/fcrepo/rest/prod'
# Predicate on a FileSet record that lists its linked Files.
has_file_predicate = 'http://pcdm.org/models#hasFile'

# Create an empty list to hold the File URLs that should be fixity-checked.
files = []

# Within the Files associated with a FileSet, there are original files, OCR
# text files, and possibly other types of files. More research should be done
# to determine what other types of files might exist. In the meantime, this
# script excludes type:ExtractedText files and performs fixity checks on
# everything else.

# A single Session reuses the HTTP connection and carries the auth/headers
# for every request made in the loops below.
with requests.Session() as session:
    session.auth = fedora_auth
    session.headers.update(headers)

    for doc in results:
        fileset_id = doc['id']
        # The request URL nests the FileSet under its first four 2-character
        # units: <base>/ab/cd/ef/gh/abcdefghi
        pairs = re.findall('..', fileset_id)[:4]
        fileset_url = '/'.join([fedora_base] + pairs + [fileset_id])

        response = session.get(fileset_url)
        # Fail with a clear HTTP error (bad credentials, missing resource)
        # instead of a confusing JSON or index error further down.
        response.raise_for_status()
        # The FileSet record appears to be returned as a list of one item,
        # so grab the first/only item.
        fileset = response.json()[0]

        # A FileSet with no linked Files has no hasFile predicate; treat that
        # as an empty list rather than aborting the whole run.
        for link in fileset.get(has_file_predicate, []):
            # Save the File ID (which is also its URL).
            file_url = link['@id']
            # Get the File's metadata record.
            response = session.get(file_url + '/fcr:metadata')
            response.raise_for_status()
            # Save the File's list of types.
            rdf_types = response.json()[0].get('@type', [])
            # Keep the File unless one of its types mentions ExtractedText.
            if not any('ExtractedText' in rdf_type for rdf_type in rdf_types):
                files.append(file_url)

# Report the number of files for the filesets.
len(files)


# Run fixity checks on Files

In [None]:
# For all the files in the list, run a fixity check and keep every response.
# fixity_results maps each File URL to the response of its fcr:fixity request,
# so the outcome of every check (not just the last one) can be parsed and
# reported afterwards.
fixity_results = {}
for file_url in files:
    # `fixity` is deliberately left bound to the most recent response, as
    # before, for quick inspection after the loop.
    fixity = requests.get(file_url + '/fcr:fixity', auth=HTTPBasicAuth('fedoraAdmin', 'dummypassword'), headers=headers)
    fixity_results[file_url] = fixity


In [None]:
# Test outcome: view the parsed JSON body of the last fixity check.
# `fixity` is only assigned inside the fixity loop, so it holds the response
# for the final File in `files` (and is undefined if `files` was empty).
fixity.json()

In [1]:
#Parse results to check for outcome


In [2]:
#Report results
