# DickensAssignmentValidator

MUCEP Task 1 (Dr. Pierre-Paul Bitton)

Author: Shawon Ibn Kamal\
Email: sikamal@mun.ca

### Updates to exisiting program

I made a few changes in the existing curator program to work with it efficiently and sorted out a few bugs. Here's the list:

- The files getting read from is renamed to "DataFiles" from "Files".
- The output csv files are being stored in a folder named "OutputFiles".
- Renamed "NotMatched.csv" to "MissingMeta.csv" in order to avoid confusion with "MissingFiles.csv".
- Stored the program in git, currently a private repo to me. I think it is a good way to track updates,\
  we can work on it if you are interested
- Fixed few minor bugs in DickensAssignment.py program


In [205]:
import pandas as pd

### Run DickensAssignment.py



In [206]:
exec(open('DickensAssignment.py').read())

4905 no. of files
4093 match found
812 match not found
Complete


### Compare OutputFiles with OutputFiles_2020_07_14

In [207]:
# Load old outputs
df_old_result = pd.read_csv('OutputFiles_2020_07_14/Result.csv', engine='python')
df_old_missing_files = pd.read_csv('OutputFiles_2020_07_14/MissingFiles.csv', engine='python')
df_old_not_matched_files = pd.read_csv('OutputFiles_2020_07_14/MissingMeta.csv', engine='python')

# Load new outputs
df_new_result = pd.read_csv('OutputFiles_2020_07_14/Result.csv', engine='python')
df_new_missing_files = pd.read_csv('OutputFiles_2020_07_14/MissingFiles.csv', engine='python')
df_new_missing_meta = pd.read_csv('OutputFiles_2020_07_14/MissingMeta.csv', engine='python')

# Load filenames
filenames = [name for path, subdirs, files in os.walk("DataFiles")
             for name in files]

df_data_files = pd.DataFrame({'filename':filenames}).sort_values(by='filename')

# Load template
df_template = pd.read_csv('template.csv', engine='python')

# Sort Result
df_new_result = df_new_result.sort_values(by='FileName')


In [208]:
df_diff_result = pd.concat([df_old_result,df_new_result]).drop_duplicates(keep=False)
df_diff_missing_files = pd.concat([df_old_missing_files,df_new_missing_files]).drop_duplicates(keep=False)
df_diff_not_matched_files = pd.concat([df_old_not_matched_files,df_new_not_matched_files]).drop_duplicates(keep=False)

if (df_diff_result.size == 0):
    print("Results are the same")
else:
    print("Results have ", df_diff_result.size, " differences")
    
if (df_diff_missing_files.size == 0):
    print("MissingFiles are the same")
else:
    print("MissingFiles hav ", df_diff_missing_files.size, " differences")

if (df_new_missing_meta.size == 0):
    print("NotMatchedFiles are the same")
else:
    print("NotMatchedFiles have ", df_new_missing_meta.size, " differences")


Results are the same
MissingFiles are the same
NotMatchedFiles have  812  differences


### Check to see if MissingMetaData entries are due to typo

In [209]:
def includes(fullstring, substrings=[]):
    count = 0
    for each_substring in substrings:
        if fullstring.find(each_substring) != -1:
            count += 1
    return count

# Testing
print(includes("I like data", ["like", "data"]))

2


In [210]:
df_template['key'] = 0
df_new_missing_meta['key'] = 0

# Cartessian product of two dataframes
df_merged_template_and_missing_meta = df_template.merge(df_new_missing_meta, how='outer')

In [211]:
df_merged_template_and_missing_meta['similarity'] = df_merged_template_and_missing_meta.apply(lambda row : includes(row['notmatched'], [str(row['catalogueNumber']), row['institutionCode']]), axis=1)

In [265]:
df_merged_template_and_missing_meta = df_merged_template_and_missing_meta[['institutionCode', 'catalogueNumber', 'notmatched', 'similarity']][df_merged_template_and_missing_meta['similarity'] > 1].sort_values(by='notmatched', ascending=False)
print(df_merged_template_and_missing_meta['notmatched'].count())

# Export data
df_merged_template_and_missing_meta.to_csv('ValidatorExports/MissingMetaSimilar.csv', index=False)

# Print first 50 data
df_merged_template_and_missing_meta.head(10)

27


Unnamed: 0,institutionCode,catalogueNumber,notmatched,similarity
43377,MNRJ,4359,CH.R.MNRJ44359.00000005.csv,2
43376,MNRJ,4359,CH.R.MNRJ44359.00000002.csv,2
23167,CM,72696,AM.U.CM972696.00000005.Master.Transmission,2
23166,CM,72696,AM.U.CM972696.00000004.Master.Transmission,2
23165,CM,72696,AM.U.CM972696.00000003.Master.Transmission,2
23164,CM,72696,AM.U.CM972696.00000002.Master.Transmission,2
23163,CM,72696,AM.U.CM972696.00000001.Master.Transmission,2
23162,CM,72696,AM.T.CM972696.00000005.Master.Transmission,2
23161,CM,72696,AM.T.CM972696.00000004.Master.Transmission,2
23160,CM,72696,AM.T.CM972696.00000003.Master.Transmission,2


In [267]:
# Find no similarities at all
df_merged_template_and_missing_meta = df_merged_template_and_missing_meta.drop_duplicates('notmatched')
df_missing_meta_nonsimilar = pd.concat([df_new_missing_meta['notmatched'],df_merged_template_and_missing_meta['notmatched']]).drop_duplicates(keep=False)

# Export non-similar data
df_missing_meta_nonsimilar.to_csv('ValidatorExports/MissingMetaNonSimilar.csv', index=False, header=True)

# Print first 50 data
df_missing_meta_nonsimilar.head(10)

0    TE.F.B.LSU180686.00000001.Master.Transmission
1    TE.F.B.LSU180686.00000002.Master.Transmission
2    TE.F.B.LSU180686.00000003.Master.Transmission
3    TE.F.B.LSU180686.00000004.Master.Transmission
4    TE.F.B.LSU180686.00000005.Master.Transmission
5    TE.F.B.LSU180687.00000001.Master.Transmission
6    TE.F.B.LSU180687.00000002.Master.Transmission
7    TE.F.B.LSU180687.00000003.Master.Transmission
8    TE.F.B.LSU180687.00000004.Master.Transmission
9    TE.F.B.LSU180687.00000005.Master.Transmission
Name: notmatched, dtype: object

### Find similar files for MissingFiles

In [282]:
df_data_files['key'] = 0
df_new_missing_files['key'] = 0

# Cartessian product of two dataframes
df_merged_data_files_and_missing_files = df_data_files.merge(df_new_missing_files, how='outer')
df_merged_data_files_and_missing_files.head()

Unnamed: 0,filename,key,FileName,institutionCode,collectionCode,catalogueNumber,class,order,family,genus,...,verbatimElevation,eventDate,measurementDeterminedDate,Patch,LightAngle1,LightAngle2,ProbeAngle1,ProbeAngle2,Replicate,Comments
0,AM.H.AMNH278606.00000001.Master.Transmission,0,,MZUSP,,97287,Aves,Trogoniformes,Trogonidae,Trogon,...,,2013-6-26,,,0,0,0,0,,
1,AM.H.AMNH278606.00000001.Master.Transmission,0,,MZUSP,,76792,Aves,Trogoniformes,Trogonidae,Trogon,...,,2007-1-20,,,0,0,0,0,,
2,AM.H.AMNH278606.00000001.Master.Transmission,0,,MZUSP,,86474,Aves,Trogoniformes,Trogonidae,Trogon,...,,2009-7-16,,,0,0,0,0,,
3,AM.H.AMNH278606.00000001.Master.Transmission,0,,MCZ,,173836,Aves,Trogoniformes,Trogonidae,Trogon,...,,1932-3-4,,,0,0,0,0,,
4,AM.H.AMNH278606.00000001.Master.Transmission,0,,MZUSP,,15953,Aves,Trogoniformes,Trogonidae,Trogon,...,,1939-3-19,,,0,0,0,0,,


In [283]:
# Calculate similarity
df_merged_data_files_and_missing_files['similarity'] = df_merged_data_files_and_missing_files.apply(lambda row : includes(row['filename'], [str(row['catalogueNumber'])]), axis=1)

In [284]:
# Sort
df_merged_data_files_and_missing_files = df_merged_data_files_and_missing_files[['institutionCode', 'catalogueNumber', 'filename', 'similarity']][df_merged_data_files_and_missing_files['similarity'] > 0].sort_values(by='institutionCode', ascending=False).reset_index()
print(df_new_missing_files.shape[0])
print("Length of similarities", df_merged_data_files_and_missing_files.shape[0])

# Export data
df_merged_data_files_and_missing_files.to_csv('ValidatorExports/MissingFilesSimilarity.csv', index=False)

# Print first 50 data
df_merged_data_files_and_missing_files.head(10)

64
Length of similarities 595


Unnamed: 0,index,institutionCode,catalogueNumber,filename,similarity
0,222894,LSUMNS,114719,SU.S.LSU114719.00000003.Master.Transmission,1
1,243312,LSUMNS,161602,SU.U.LSU161602.00000002.Master.Transmission,1
2,243440,LSUMNS,161602,SU.U.LSU161602.00000004.Master.Transmission,1
3,243504,LSUMNS,161602,SU.U.LSU161602.00000005.Master.Transmission,1
4,243572,LSUMNS,71304,SU.U.LSU71304.00000001.Master.Transmission,1
5,243636,LSUMNS,71304,SU.U.LSU71304.00000002.Master.Transmission,1
6,243700,LSUMNS,71304,SU.U.LSU71304.00000003.Master.Transmission,1
7,243764,LSUMNS,71304,SU.U.LSU71304.00000004.Master.Transmission,1
8,243828,LSUMNS,71304,SU.U.LSU71304.00000005.Master.Transmission,1
9,243883,LSUMNS,87590,SU.U.LSU87590.00000001.Master.Transmission,1


In [280]:
# Find no similarities at all
df_merged_data_files_and_missing_files_unique = df_merged_data_files_and_missing_files.drop_duplicates(subset=['institutionCode', 'catalogueNumber'])
df_missing_files_nonsimilar = pd.concat([df_new_missing_files[['institutionCode', 'catalogueNumber']],df_merged_data_files_and_missing_files_unique[['institutionCode', 'catalogueNumber']]]).drop_duplicates(keep=False)

# Export non-similar data
df_missing_files_nonsimilar.to_csv('ValidatorExports/MissingFilesNonSimilar.csv', index=False, header=True)

print("There are ", df_missing_files_nonsimilar.shape[0], " meta data with no similarities.")

# Print first 50 data
df_missing_files_nonsimilar.head(10)

There are  43  meta data with no similarities.


Unnamed: 0,institutionCode,catalogueNumber
0,MZUSP,97287
1,MZUSP,76792
2,MZUSP,86474
3,MCZ,173836
4,MZUSP,15953
5,MZUSP,44168
6,MZUSP,44172
7,MZUSP,44175
8,MCZ,173842
9,MCZ,173839
