In [1]:
import sys
sys.path.append('..')

In [2]:
from pathlib import Path 
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
# Directory layout used by this notebook: raw downloads are read from 00_raw
# and the column-pruned copies are written to 01_pruned.
DATA = Path('..') / 'data'
RAW = DATA / '00_raw'
PRUNED = DATA / '01_pruned'

# Create the output directory up front so the to_csv() calls further down do
# not fail on a checkout where 01_pruned does not exist yet.
PRUNED.mkdir(parents=True, exist_ok=True)

# Rows per chunk when a file is streamed with pd.read_csv(chunksize=CHUNK).
CHUNK = 1_000_000

## Plants_gbif_idigbio

In [4]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('Plants_gbif_idigbio.csv.gz')
out_file = PRUNED.joinpath('Plants_gbif_idigbio.csv.gz')

In [5]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on the delimiter, pandas strips CSV quoting and
# renames repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, sep='\t', nrows=0).columns.tolist()
headers

['gbifID',
 'datasetKey',
 'occurrenceID',
 'kingdom',
 'phylum',
 'class',
 'order',
 'family',
 'genus',
 'species',
 'infraspecificEpithet',
 'taxonRank',
 'scientificName',
 'verbatimScientificName',
 'verbatimScientificNameAuthorship',
 'countryCode',
 'locality',
 'stateProvince',
 'occurrenceStatus',
 'individualCount',
 'publishingOrgKey',
 'decimalLatitude',
 'decimalLongitude',
 'coordinateUncertaintyInMeters',
 'coordinatePrecision',
 'elevation',
 'elevationAccuracy',
 'depth',
 'depthAccuracy',
 'eventDate',
 'day',
 'month',
 'year',
 'taxonKey',
 'speciesKey',
 'basisOfRecord',
 'institutionCode',
 'collectionCode',
 'catalogNumber',
 'recordNumber',
 'identifiedBy',
 'dateIdentified',
 'license',
 'rightsHolder',
 'recordedBy',
 'typeStatus',
 'establishmentMeans',
 'lastInterpreted',
 'mediaType',
 'issue']

In [6]:
# Keep only the record id plus the coordinate and locality columns. Every value
# is read as a plain string and blank cells stay '' (no NaN conversion); the
# columns are then given short names.
df = (
    pd.read_csv(
        in_file,
        sep='\t',
        dtype=str,
        keep_default_na=False,
        usecols=[
            'gbifID',
            'decimalLatitude',
            'decimalLongitude',
            'coordinateUncertaintyInMeters',
            'locality',
            'stateProvince',
            'countryCode',
        ],
    )
    .rename(columns={
        'gbifID': 'gbifid',
        'decimalLatitude': 'lat',
        'decimalLongitude': 'lng',
        'coordinateUncertaintyInMeters': 'uncert',
        'locality': 'locality',
        'countryCode': 'country',
        'stateProvince': 'state',
    })
)

df.head()

Unnamed: 0,gbifid,occurrenceID,country,locality,state,lat,lng,uncert
0,1851020318,03F687C9FA26B47AFF276CF61275C90A.mc.3B373C82FA...,CF,Mabea Bai,Sangha-Mbaere,3.0335,16.4095,
1,1851020333,03F687C9FA26B47AFF276CF61275C90A.mc.3B373C82FA...,KE,Reserve de la Moukalaba-Dougoua,,-2.227167,10.3945,
2,1851020156,03F687C9FA26B47AFF276CF61275C90A.mc.3B373C82FA...,CD,North Kivu Province,North Kivu Province,,,
3,1851020334,03F687C9FA2AB478FF27695C1427C99C.mc.3B373C82FA...,ZA,Pretoria,Mpumalanga,-23.42,30.06,
4,1851020169,03F687C9FA2FB473FF276B2612A0CC8F.mc.3B373C82FA...,ZA,Ellistras Dist.,South,-23.58,27.45,


In [7]:
# Save the pruned frame without the positional index. out_file ends in '.gz',
# so pandas' default compression='infer' gzip-compresses the CSV on write.
df.to_csv(path_or_buf=out_file, index=False)

## insects_idigbio/occurrence_raw

In [8]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('insects_idigbio', 'occurrence_raw.csv.gz')
out_file = PRUNED.joinpath('insects_idigbio_occurrence_raw.csv.gz')

In [9]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on ',', pandas strips CSV quoting and renames
# repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, nrows=0).columns.tolist()
headers

['coreid',
 'aec:associatedTaxa',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

In [10]:
# Keep only the record id plus the coordinate, datum and locality columns.
# Every value is read as a plain string and blank cells stay '' (no NaN
# conversion); the 'dwc:' columns are then given short names.
df = (
    pd.read_csv(
        in_file,
        dtype=str,
        keep_default_na=False,
        usecols=[
            'coreid',
            'dwc:decimalLatitude',
            'dwc:decimalLongitude',
            'dwc:geodeticDatum',
            'dwc:coordinateUncertaintyInMeters',
            'dwc:locality',
            'dwc:verbatimLocality',
            'dwc:countryCode',
            'dwc:stateProvince',
            'dwc:county',
        ],
    )
    .rename(columns={
        'dwc:decimalLatitude': 'lat',
        'dwc:decimalLongitude': 'lng',
        'dwc:geodeticDatum': 'datum',
        'dwc:coordinateUncertaintyInMeters': 'uncert',
        'dwc:locality': 'locality',
        'dwc:verbatimLocality': 'locality2',
        'dwc:countryCode': 'country',
        'dwc:stateProvince': 'state',
        'dwc:county': 'county',
    })
)

df.head()

Unnamed: 0,coreid,uncert,country,county,lat,lng,datum,locality,state,locality2
0,14d9348d-1cbe-4df5-86f6-85db1d53c1dc,,,,55.01806,12.6875,,Ven N 1 km,Skåne,
1,09a5fcbf-520d-4785-b56b-9619aaaaebcc,,,,56.619,16.49732,,"Station Linné, Skogsby, Mörbylånga, Trap ID 2006",,
2,1c4a9867-1798-46b4-a4ee-e1dc2cc72431,,,,56.619,16.49732,,"Station Linné, Skogsby, Mörbylånga, Trap ID 2006",,
3,3db40206-d039-4283-bd5f-d4d619f02d4e,,,,56.619,16.49732,,"Station Linné, Skogsby, Mörbylånga, Trap ID 2006",,
4,d9787b9f-228f-409c-a9a1-1169d0ba33d3,,,,56.619,16.49732,,"Station Linné, Skogsby, Mörbylånga, Trap ID 2006",,


In [11]:
# Save the pruned frame without the positional index. out_file ends in '.gz',
# so pandas' default compression='infer' gzip-compresses the CSV on write.
df.to_csv(path_or_buf=out_file, index=False)

## insects_gbif

In [12]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('insects_gbif.csv.gz')
out_file = PRUNED.joinpath('insects_gbif.csv.gz')

In [13]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on the delimiter, pandas strips CSV quoting and
# renames repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, sep='\t', nrows=0).columns.tolist()
headers

['gbifID',
 'datasetKey',
 'occurrenceID',
 'kingdom',
 'phylum',
 'class',
 'order',
 'family',
 'genus',
 'species',
 'infraspecificEpithet',
 'taxonRank',
 'scientificName',
 'verbatimScientificName',
 'verbatimScientificNameAuthorship',
 'countryCode',
 'locality',
 'stateProvince',
 'occurrenceStatus',
 'individualCount',
 'publishingOrgKey',
 'decimalLatitude',
 'decimalLongitude',
 'coordinateUncertaintyInMeters',
 'coordinatePrecision',
 'elevation',
 'elevationAccuracy',
 'depth',
 'depthAccuracy',
 'eventDate',
 'day',
 'month',
 'year',
 'taxonKey',
 'speciesKey',
 'basisOfRecord',
 'institutionCode',
 'collectionCode',
 'catalogNumber',
 'recordNumber',
 'identifiedBy',
 'dateIdentified',
 'license',
 'rightsHolder',
 'recordedBy',
 'typeStatus',
 'establishmentMeans',
 'lastInterpreted',
 'mediaType',
 'issue']

In [14]:
# Stream the file CHUNK rows at a time, keeping only the record id plus the
# coordinate and locality columns, with every value read as a plain string.
reader = pd.read_csv(
    in_file,
    sep='\t',
    dtype=str,
    keep_default_na=False,
    usecols=[
        'gbifID',
        'decimalLatitude',
        'decimalLongitude',
        'coordinateUncertaintyInMeters',
        'locality',
        'countryCode',
        'stateProvince',
    ],
    chunksize=CHUNK,
)

# Short output names for the retained columns.
renames = {
    'gbifID': 'gbifid',
    'decimalLatitude': 'lat',
    'decimalLongitude': 'lng',
    'coordinateUncertaintyInMeters': 'uncert',
    'locality': 'locality',
    'countryCode': 'country',
    'stateProvince': 'state',
}

for chunk_no, df in enumerate(tqdm(reader)):
    df = df.rename(columns=renames)

    # The first chunk (re)creates out_file and writes the header row; every
    # later chunk is appended to it without a header.
    is_first = chunk_no == 0
    df.to_csv(
        out_file,
        index=False,
        mode='w' if is_first else 'a',
        header=is_first,
    )

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## plants_gbif/verbatim

In [15]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('plants_gbif', 'verbatim.txt.gz')
out_file = PRUNED.joinpath('plants_gbif_verbatim.csv.gz')

In [16]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on the delimiter, pandas strips CSV quoting and
# renames repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, sep='\t', nrows=0).columns.tolist()
headers

['gbifID',
 'abstract',
 'accessRights',
 'accrualMethod',
 'accrualPeriodicity',
 'accrualPolicy',
 'alternative',
 'audience',
 'available',
 'bibliographicCitation',
 'conformsTo',
 'contributor',
 'coverage',
 'created',
 'creator',
 'date',
 'dateAccepted',
 'dateCopyrighted',
 'dateSubmitted',
 'description',
 'educationLevel',
 'extent',
 'format',
 'hasFormat',
 'hasPart',
 'hasVersion',
 'identifier',
 'instructionalMethod',
 'isFormatOf',
 'isPartOf',
 'isReferencedBy',
 'isReplacedBy',
 'isRequiredBy',
 'isVersionOf',
 'issued',
 'language',
 'license',
 'mediator',
 'medium',
 'modified',
 'provenance',
 'publisher',
 'references',
 'relation',
 'replaces',
 'requires',
 'rights',
 'rightsHolder',
 'source',
 'spatial',
 'subject',
 'tableOfContents',
 'temporal',
 'title',
 'type',
 'valid',
 'institutionID',
 'collectionID',
 'datasetID',
 'institutionCode',
 'collectionCode',
 'datasetName',
 'ownerInstitutionCode',
 'basisOfRecord',
 'informationWithheld',
 'dataGeneral

In [17]:
# Stream the file CHUNK rows at a time, keeping only the record id plus the
# coordinate, datum and locality columns, with every value read as a plain
# string.
reader = pd.read_csv(
    in_file,
    sep='\t',
    dtype=str,
    keep_default_na=False,
    usecols=[
        'gbifID',
        'decimalLatitude',
        'decimalLongitude',
        'geodeticDatum',
        'coordinateUncertaintyInMeters',
        'locality',
        'verbatimLocality',
        'countryCode',
        'stateProvince',
        'county',
    ],
    chunksize=CHUNK,
)

# Short output names for the retained columns ('county' keeps its name).
renames = {
    'gbifID': 'gbifid',
    'decimalLatitude': 'lat',
    'decimalLongitude': 'lng',
    'geodeticDatum': 'datum',
    'coordinateUncertaintyInMeters': 'uncert',
    'locality': 'locality',
    'verbatimLocality': 'locality2',
    'countryCode': 'country',
    'stateProvince': 'state',
}

for chunk_no, df in enumerate(tqdm(reader)):
    df = df.rename(columns=renames)

    # The first chunk (re)creates out_file and writes the header row; every
    # later chunk is appended to it without a header.
    is_first = chunk_no == 0
    df.to_csv(
        out_file,
        index=False,
        mode='w' if is_first else 'a',
        header=is_first,
    )

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## plants_idigbio/occurrence_raw

In [18]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('plants_idigbio', 'occurrence_raw.csv.gz')
out_file = PRUNED.joinpath('plants_idigbio_occurrence_raw.csv.gz')

In [19]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on ',', pandas strips CSV quoting and renames
# repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, nrows=0).columns.tolist()
headers

['coreid',
 'aec:associatedTaxa',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

In [24]:
# Stream the file CHUNK rows at a time, keeping only the record id plus the
# coordinate, datum and locality columns, with every value read as a plain
# string.
reader = pd.read_csv(
    in_file,
    dtype=str,
    keep_default_na=False,
    usecols=[
        'coreid',
        'dwc:decimalLatitude',
        'dwc:decimalLongitude',
        'dwc:geodeticDatum',
        'dwc:coordinateUncertaintyInMeters',
        'dwc:locality',
        'dwc:verbatimLocality',
        'dwc:countryCode',
        'dwc:stateProvince',
        'dwc:county',
    ],
    chunksize=CHUNK,
)

# Short output names for the retained 'dwc:' columns ('coreid' keeps its name).
renames = {
    'dwc:decimalLatitude': 'lat',
    'dwc:decimalLongitude': 'lng',
    'dwc:geodeticDatum': 'datum',
    'dwc:coordinateUncertaintyInMeters': 'uncert',
    'dwc:locality': 'locality',
    'dwc:verbatimLocality': 'locality2',
    'dwc:countryCode': 'country',
    'dwc:stateProvince': 'state',
    'dwc:county': 'county',
}

for chunk_no, df in enumerate(tqdm(reader)):
    df = df.rename(columns=renames)

    # The first chunk (re)creates out_file and writes the header row; every
    # later chunk is appended to it without a header.
    is_first = chunk_no == 0
    df.to_csv(
        out_file,
        index=False,
        mode='w' if is_first else 'a',
        header=is_first,
    )

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




## mammals_gbif/verbatim

In [27]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('mammals_gbif', 'verbatim.txt.gz')
out_file = PRUNED.joinpath('mammals_gbif_verbatim.csv.gz')

In [28]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on the delimiter, pandas strips CSV quoting and
# renames repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, sep='\t', nrows=0).columns.tolist()
headers

['gbifID',
 'abstract',
 'accessRights',
 'accrualMethod',
 'accrualPeriodicity',
 'accrualPolicy',
 'alternative',
 'audience',
 'available',
 'bibliographicCitation',
 'conformsTo',
 'contributor',
 'coverage',
 'created',
 'creator',
 'date',
 'dateAccepted',
 'dateCopyrighted',
 'dateSubmitted',
 'description',
 'educationLevel',
 'extent',
 'format',
 'hasFormat',
 'hasPart',
 'hasVersion',
 'identifier',
 'instructionalMethod',
 'isFormatOf',
 'isPartOf',
 'isReferencedBy',
 'isReplacedBy',
 'isRequiredBy',
 'isVersionOf',
 'issued',
 'language',
 'license',
 'mediator',
 'medium',
 'modified',
 'provenance',
 'publisher',
 'references',
 'relation',
 'replaces',
 'requires',
 'rights',
 'rightsHolder',
 'source',
 'spatial',
 'subject',
 'tableOfContents',
 'temporal',
 'title',
 'type',
 'valid',
 'institutionID',
 'collectionID',
 'datasetID',
 'institutionCode',
 'collectionCode',
 'datasetName',
 'ownerInstitutionCode',
 'basisOfRecord',
 'informationWithheld',
 'dataGeneral

In [30]:
# Keep only the record id plus the coordinate, datum and locality columns.
# Every value is read as a plain string and blank cells stay '' (no NaN
# conversion); the columns are then given short names ('county' keeps its name).
df = (
    pd.read_csv(
        in_file,
        sep='\t',
        dtype=str,
        keep_default_na=False,
        usecols=[
            'gbifID',
            'decimalLatitude',
            'decimalLongitude',
            'geodeticDatum',
            'coordinateUncertaintyInMeters',
            'locality',
            'verbatimLocality',
            'countryCode',
            'stateProvince',
            'county',
        ],
    )
    .rename(columns={
        'gbifID': 'gbifid',
        'decimalLatitude': 'lat',
        'decimalLongitude': 'lng',
        'geodeticDatum': 'datum',
        'coordinateUncertaintyInMeters': 'uncert',
        'locality': 'locality',
        'verbatimLocality': 'locality2',
        'countryCode': 'country',
        'stateProvince': 'state',
    })
)

df.head()

Unnamed: 0,gbifid,country,state,county,locality,locality2,lat,lng,datum,uncert,prec
0,1317216260,,Texas,,Lomita Ranch,,,,,,
1,1317220289,,,,Duekoue,,6.73,-7.33,,,
2,1317226785,,New York,Essex County,Hammondville,,,,,,
3,1317248971,,,,"Nata, 13 Mi W",,-20.18,26.0,,,
4,1317280627,,,Tanganyika Territory,Kondoa Irangi,,,,,,


In [31]:
# Save the pruned frame without the positional index. out_file ends in '.gz',
# so pandas' default compression='infer' gzip-compresses the CSV on write.
df.to_csv(path_or_buf=out_file, index=False)

## mammals_idigbio/occurrence_raw

In [32]:
# Source archive under RAW and the pruned copy to be written under PRUNED.
in_file = RAW.joinpath('mammals_idigbio', 'occurrence_raw.csv.gz')
out_file = PRUNED.joinpath('mammals_idigbio_occurrence_raw.csv.gz')

In [33]:
# Read only the header row (nrows=0) with pandas instead of shelling out to
# `zcat | head`: no external tools are needed, it works on any platform, and
# the names are reported the way the read_csv() call in the next cell sees them.
# NOTE: unlike a raw split on ',', pandas strips CSV quoting and renames
# repeated column names (x, x.1, ...).
headers = pd.read_csv(in_file, nrows=0).columns.tolist()
headers

['coreid',
 'aec:associatedTaxa',
 'dc:rights',
 'dcterms:accessRights',
 'dcterms:bibliographicCitation',
 'dcterms:language',
 'dcterms:license',
 'dcterms:modified',
 'dcterms:references',
 'dcterms:rights',
 'dcterms:rightsHolder',
 'dcterms:source',
 'dcterms:type',
 'dwc:Identification',
 'dwc:MeasurementOrFact',
 'dwc:ResourceRelationship',
 'dwc:VerbatimEventDate',
 'dwc:acceptedNameUsage',
 'dwc:acceptedNameUsageID',
 'dwc:accessRights',
 'dwc:associatedMedia',
 'dwc:associatedOccurrences',
 'dwc:associatedOrganisms',
 'dwc:associatedReferences',
 'dwc:associatedSequences',
 'dwc:associatedTaxa',
 'dwc:basisOfRecord',
 'dwc:bed',
 'dwc:behavior',
 'dwc:catalogNumber',
 'dwc:class',
 'dwc:classs',
 'dwc:collectionCode',
 'dwc:collectionID',
 'dwc:continent',
 'dwc:coordinatePrecision',
 'dwc:coordinateUncertaintyInMeters',
 'dwc:country',
 'dwc:countryCode',
 'dwc:county',
 'dwc:dataGeneralizations',
 'dwc:datasetID',
 'dwc:datasetName',
 'dwc:dateIdentified',
 'dwc:day',
 'dwc

In [34]:
# Keep only the record id plus the coordinate, datum and locality columns.
# Every value is read as a plain string and blank cells stay '' (no NaN
# conversion); the 'dwc:' columns are then given short names.
df = (
    pd.read_csv(
        in_file,
        dtype=str,
        keep_default_na=False,
        usecols=[
            'coreid',
            'dwc:decimalLatitude',
            'dwc:decimalLongitude',
            'dwc:geodeticDatum',
            'dwc:coordinateUncertaintyInMeters',
            'dwc:locality',
            'dwc:verbatimLocality',
            'dwc:countryCode',
            'dwc:stateProvince',
            'dwc:county',
        ],
    )
    .rename(columns={
        'dwc:decimalLatitude': 'lat',
        'dwc:decimalLongitude': 'lng',
        'dwc:geodeticDatum': 'datum',
        'dwc:coordinateUncertaintyInMeters': 'uncert',
        'dwc:locality': 'locality',
        'dwc:verbatimLocality': 'locality2',
        'dwc:countryCode': 'country',
        'dwc:stateProvince': 'state',
        'dwc:county': 'county',
    })
)

df.head()

Unnamed: 0,coreid,prec,uncert,country,county,lat,lng,datum,locality,state,locality2
0,d3364458-f60e-452b-8616-e0797596dfc1,,,,,,,,"Lomma, on city dump",Skåne,
1,9035546a-38ba-45f0-bfb4-5db9adfe8587,,,,,,,,"Kullaberg, Krapperup, Ekeön",Skåne,
2,a541fe9f-63a2-4639-828b-b9ac7c4d653b,,,,,,,,"Kullaberg, Krapperup, Ekeön",Skåne,
3,02b70ce3-ba59-4965-8def-081cfe57c27e,,,,,,,,"Linsell, Fågelåsen","Härjedalen,",
4,c6001f74-6698-433e-be17-791f4481c5db,,,,,,,,,,


In [35]:
# Save the pruned frame without the positional index. out_file ends in '.gz',
# so pandas' default compression='infer' gzip-compresses the CSV on write.
df.to_csv(path_or_buf=out_file, index=False)