In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo

## Extract Victorian Biodiversity Atlas (VBA) fauna data

In [2]:
# Read just the column names in fauna data csv
col_names = pd.read_csv("../data/VBA_2015_2020.csv", nrows = 0).columns
col_names

Index(['RECORD_ID', 'SITE_ID', 'SURVEY_ID', 'PROJECT_ID', 'TAXON_ID',
       'SCI_NAME', 'COMM_NAME', 'RECORDTYPE', 'RELIABILTY', 'TOTALCOUNT',
       'STARTDATE', 'START_YEAR', 'START_MTH', 'ENDDATE', 'END_YEAR',
       'END_MTH', 'LOCN_DESC', 'TAXON_TYPE', 'LONG_DD94', 'LAT_DD94'],
      dtype='object')

In [3]:
# Set data types for columns with data types other than strings
dtypes_dict = {
    "TOTALCOUNT": int,
    "START_YEAR": int,
    "START_MTH": int,
    "END_YEAR": int,
    "END_MTH": int,
    "LONG_DD94": float,
    "LAT_DD94": float
}

In [4]:
# Read in vic fauna csv from 2015 to 2020
fauna_data = pd.read_csv(
    "../data/VBA_2015_2020.csv",
    parse_dates = ["STARTDATE", "ENDDATE"],
    dtype = {col: str for col in col_names if col not in dtypes_dict})
fauna_data.head()

Unnamed: 0,RECORD_ID,SITE_ID,SURVEY_ID,PROJECT_ID,TAXON_ID,SCI_NAME,COMM_NAME,RECORDTYPE,RELIABILTY,TOTALCOUNT,STARTDATE,START_YEAR,START_MTH,ENDDATE,END_YEAR,END_MTH,LOCN_DESC,TAXON_TYPE,LONG_DD94,LAT_DD94
0,8597419,947931,1405903,4377,1557,Paratya australiensis,Common Freshwater Shrimp,Observation,Confirmed,0,2016-12-16,2016,12,NaT,0,0,McCallum Creek-4_7-TR-16-333,"Mussels, decopod crustacea",143.649002,-37.283901
1,9067844,1084677,1776514,5326,10408,Colluricincla harmonica,Grey Shrike-thrush,Observation with supporting evidence,Acceptable,0,2018-08-28,2018,8,2018-09-25,2018,9,345-513-0003 FSQ1,Passerine birds,145.768997,-37.782501
2,8218590,771970,1221401,4366,10991,Turdus merula,Common Blackbird,Seen,Acceptable,0,2015-04-12,2015,4,NaT,0,0,Ocean Acres Bush Park Nature Reserve,Passerine birds,144.287399,-38.315601
3,9047388,1070861,1760792,5326,11242,Wallabia bicolor,Black-tailed Wallaby,Observation with supporting evidence,Acceptable,0,2018-11-14,2018,11,2018-12-14,2018,12,833-518-0004 BUQ1,Mammals,148.848099,-37.601601
4,9359539,1116727,1809370,5543,10525,Cisticola exilis,Golden-headed Cisticola,Seen,Acceptable,1,2015-05-07,2015,5,2015-05-07,2015,5,MANNIBADAR (581481),Passerine birds,143.481903,-37.781799


## Transform VBA fauna data

In [5]:
# Column Renaming
fauna_df = fauna_data.rename(columns={
    "RECORD_ID": "record_id",
    "SITE_ID": "site_id",
    "SURVEY_ID": "survey_id",
    "PROJECT_ID": "project_id",
    "TAXON_ID": "taxon_id",
    "SCI_NAME": "sci_name",
    "COMM_NAME": "comm_name",
    "RECORDTYPE": "recordtype",
    "RELIABILTY": "reliability",
    "TOTALCOUNT": "totalcount",
    "STARTDATE": "start_date",
    "START_YEAR": "start_year",
    "START_MTH": "start_mth",
    "ENDDATE": "end_date",
    "END_YEAR": "end_year",
    "END_MTH": "end_mth",
    "LOCN_DESC": "location_desc",
    "TAXON_TYPE": "taxon_type",
    "LONG_DD94": "long",
    "LAT_DD94": "lat"})

In [6]:
# Test record_id uniqueness
fauna_df.record_id.is_unique

True

In [7]:
print(f"Number of unique record ids: {fauna_df.record_id.nunique()}")

Number of unique record ids: 346829


In [8]:
# Test survey_id uniqueness
fauna_df.survey_id.is_unique

False

In [9]:
print(f"Number of unique survey ids: {fauna_df.survey_id.nunique()}")

Number of unique survey ids: 97240


In [10]:
# Test site_id uniqueness
fauna_df.site_id.is_unique

False

In [11]:
print(f"Number of unique site ids: {fauna_df.site_id.nunique()}")

Number of unique site ids: 55164


In [12]:
# Test project_id uniqueness
fauna_df.project_id.is_unique

False

In [13]:
print(f"Number of unique project ids: {fauna_df.project_id.nunique()}")

Number of unique project ids: 522


In [14]:
# Test taxon_id uniqueness
fauna_df.taxon_id.is_unique

False

In [15]:
print(f"Number of unique taxon ids: {fauna_df.taxon_id.nunique()}")

Number of unique taxon ids: 941


In [16]:
print(f"Number of unique taxon types: {fauna_df.taxon_type.nunique()}")

Number of unique taxon types: 14


In [17]:
# Reorder the columns
fauna_df = fauna_df[["record_id", "survey_id", "site_id", "project_id", "taxon_id", "taxon_type"
                     ,"comm_name", "sci_name", "totalcount", "location_desc", "long", "lat"
                     ,"end_year", "end_mth", "end_date", "start_year", "start_mth", "start_date"
                     ,"recordtype", "reliability"]]
fauna_df.head()

Unnamed: 0,record_id,survey_id,site_id,project_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,location_desc,long,lat,end_year,end_mth,end_date,start_year,start_mth,start_date,recordtype,reliability
0,8597419,1405903,947931,4377,1557,"Mussels, decopod crustacea",Common Freshwater Shrimp,Paratya australiensis,0,McCallum Creek-4_7-TR-16-333,143.649002,-37.283901,0,0,NaT,2016,12,2016-12-16,Observation,Confirmed
1,9067844,1776514,1084677,5326,10408,Passerine birds,Grey Shrike-thrush,Colluricincla harmonica,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
2,8218590,1221401,771970,4366,10991,Passerine birds,Common Blackbird,Turdus merula,0,Ocean Acres Bush Park Nature Reserve,144.287399,-38.315601,0,0,NaT,2015,4,2015-04-12,Seen,Acceptable
3,9047388,1760792,1070861,5326,11242,Mammals,Black-tailed Wallaby,Wallabia bicolor,0,833-518-0004 BUQ1,148.848099,-37.601601,2018,12,2018-12-14,2018,11,2018-11-14,Observation with supporting evidence,Acceptable
4,9359539,1809370,1116727,5543,10525,Passerine birds,Golden-headed Cisticola,Cisticola exilis,1,MANNIBADAR (581481),143.481903,-37.781799,2015,5,2015-05-07,2015,5,2015-05-07,Seen,Acceptable


In [18]:
# Overview of the fauna data
fauna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346829 entries, 0 to 346828
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   record_id      346829 non-null  object        
 1   survey_id      346829 non-null  object        
 2   site_id        346829 non-null  object        
 3   project_id     346829 non-null  object        
 4   taxon_id       346829 non-null  object        
 5   taxon_type     346829 non-null  object        
 6   comm_name      346829 non-null  object        
 7   sci_name       346829 non-null  object        
 8   totalcount     346829 non-null  int64         
 9   location_desc  346829 non-null  object        
 10  long           346829 non-null  float64       
 11  lat            346829 non-null  float64       
 12  end_year       346829 non-null  int64         
 13  end_mth        346829 non-null  int64         
 14  end_date       52465 non-null   datetime64[ns]
 15  

In [19]:
# We can see that the TOTALCOUNT of some records is 0. Let's have an overview of them
zero_totalcount = fauna_df[fauna_df["totalcount"] == 0]
zero_totalcount.head(10)

Unnamed: 0,record_id,survey_id,site_id,project_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,location_desc,long,lat,end_year,end_mth,end_date,start_year,start_mth,start_date,recordtype,reliability
0,8597419,1405903,947931,4377,1557,"Mussels, decopod crustacea",Common Freshwater Shrimp,Paratya australiensis,0,McCallum Creek-4_7-TR-16-333,143.649002,-37.283901,0,0,NaT,2016,12,2016-12-16,Observation,Confirmed
1,9067844,1776514,1084677,5326,10408,Passerine birds,Grey Shrike-thrush,Colluricincla harmonica,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
2,8218590,1221401,771970,4366,10991,Passerine birds,Common Blackbird,Turdus merula,0,Ocean Acres Bush Park Nature Reserve,144.287399,-38.315601,0,0,NaT,2015,4,2015-04-12,Seen,Acceptable
3,9047388,1760792,1070861,5326,11242,Mammals,Black-tailed Wallaby,Wallabia bicolor,0,833-518-0004 BUQ1,148.848099,-37.601601,2018,12,2018-12-14,2018,11,2018-11-14,Observation with supporting evidence,Acceptable
6,9067841,1776514,1084677,5326,10488,Passerine birds,White-browed Scrubwren,Sericornis frontalis,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
9,8897604,1686135,1000965,4335,528552,Mammals,Red Fox,Vulpes vulpes,0,Basalt 10,144.098206,-37.2705,2018,2,2018-02-07,2018,1,2018-01-17,Observation with supporting evidence,Acceptable
10,9345216,1799930,1107289,2936,5140,Fish,Dry waterbody,Misc Dry,0,Pig and Whistle Creek-2_23-TR-18-421,147.853394,-37.532299,0,0,NaT,2018,11,2018-11-16,Observation,Confirmed
12,8432377,1292811,840246,4551,10364,Passerine birds,Willie Wagtail,Rhipidura leucophrys,0,Bulla Hill and School Hill,144.8022,-37.634102,0,0,NaT,2015,5,2015-05-18,Observation,Acceptable
13,8956790,1715391,1027944,4836,11003,Mammals,Short-beaked Echidna,Tachyglossus aculeatus,0,New Holland Mouse camera survey_Site_NHM423,147.533798,-38.084,2018,3,2018-03-20,2018,3,2018-03-07,Observation with supporting evidence,Acceptable
15,8994856,1741637,1052493,5326,11115,Mammals,Mountain Brush-tailed Possum,Trichosurus cunninghami,0,298-516-0003,145.521393,-37.442902,2018,8,2018-08-23,2018,7,2018-07-27,Observation with supporting evidence,Acceptable


In [20]:
# Percentage of number of rows with totalcount equal to 0 against total number of rows of the dataframe
(zero_totalcount.shape[0]/fauna_df.shape[0])*100

19.936914156544002

## Filter VBA fauna data against scraped data

In [21]:
# Import the webscraped animal data
scraped_df = pd.read_csv("../data/animal_image_to_merge.csv", dtype="str")
scraped_df.head()

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
0,Alpine She-oak Skink,https://www.zoo.org.au/media/2050/1023_alpine_...,Alpine She-oak Skink sunning it self on a rock...,Found in only a few locations in Victoria and ...,Major threats\nFire is a huge danger to the Al...
1,Baw Baw Frog,https://www.zoo.org.au/media/2052/21295_baw_ba...,Baw Baw Frog resting in bright green moss. Loo...,All estimates point to extinction in the wild ...,Major threats\nThe loss of the Baw Baw Frog is...
2,Brush-tailed Rock-wallaby,https://www.zoo.org.au/media/2045/21882_brush-...,Brush Tailed Rock Wallabies resting in the grass.,"In Victoria, the Brush-tailed Rock-wallaby now...",Major threats \nChanges to habitat and the imp...
3,Eastern Barred Bandicoot,https://www.zoo.org.au/media/2053/4376_eastern...,Small Eastern Barred Bandicoot side view forag...,The Eastern Barred Bandicoot is listed as exti...,The plan for recovery\nZoos Victoria has partn...
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"['Over the next five years, Zoos Victoria will..."


In [22]:
# Extract list of unique animals of interest
species = scraped_df["animal_name"].unique().tolist()

In [23]:
# Filter the fauna data with the species of interest
short_fauna_df = fauna_df[fauna_df["comm_name"].isin(species)]
short_fauna_df.head()

Unnamed: 0,record_id,survey_id,site_id,project_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,location_desc,long,lat,end_year,end_mth,end_date,start_year,start_mth,start_date,recordtype,reliability
102,8261909,1225877,833395,4236,11141,Mammals,Leadbeater's Possum,Gymnobelideus leadbeateri,0,DSS3B,145.841904,-37.8456,0,0,NaT,2016,4,2016-04-24,Observation,Confirmed
195,8590335,1401349,943395,4095,11141,Mammals,Leadbeater's Possum,Gymnobelideus leadbeateri,1,DELWP Case Reference number 2017-0060,146.288498,-37.9226,0,0,NaT,2017,8,2017-08-20,Seen,Confirmed
391,6942095,1102090,766516,4078,11141,Mammals,Leadbeater's Possum,Gymnobelideus leadbeateri,1,Mon1_camera1,145.934403,-37.8186,2015,9,2015-09-29,2015,9,2015-09-08,Observation,Confirmed
426,8596613,1405576,947609,1,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,5,"Mclaughlans Lane Pipetrack, Plenty",145.108398,-37.676399,2017,9,2017-09-25,2017,9,2017-09-25,Observation,Acceptable
538,6870927,1085373,716545,4078,11141,Mammals,Leadbeater's Possum,Gymnobelideus leadbeateri,1,40.2_camera2,146.115204,-37.784802,2015,5,2015-05-11,2015,4,2015-04-17,Observation,Confirmed


In [24]:
# Overview of the fauna data after filtering
short_fauna_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2712 entries, 102 to 346720
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   record_id      2712 non-null   object        
 1   survey_id      2712 non-null   object        
 2   site_id        2712 non-null   object        
 3   project_id     2712 non-null   object        
 4   taxon_id       2712 non-null   object        
 5   taxon_type     2712 non-null   object        
 6   comm_name      2712 non-null   object        
 7   sci_name       2712 non-null   object        
 8   totalcount     2712 non-null   int64         
 9   location_desc  2712 non-null   object        
 10  long           2712 non-null   float64       
 11  lat            2712 non-null   float64       
 12  end_year       2712 non-null   int64         
 13  end_mth        2712 non-null   int64         
 14  end_date       899 non-null    datetime64[ns]
 15  start_year     27

In [25]:
# Check for any extreme values
print(f"Maximum total count is: {short_fauna_df.totalcount.max()}")
print(f"Minimum total count is: {short_fauna_df.totalcount.min()}")
print(f"Maximum longitude is: {short_fauna_df.long.max()}")
print(f"Minimum longitude is: {short_fauna_df.long.min()}")
print(f"Maximum latitude is: {short_fauna_df.lat.max()}")
print(f"Minimum latitude is: {short_fauna_df.lat.min()}")
print(f"Maximum end year is: {short_fauna_df.end_year.max()}")
print(f"Minimum end year is: {short_fauna_df.end_year.min()}")
print(f"Maximum end month is: {short_fauna_df.end_mth.max()}")
print(f"Minimum end month is: {short_fauna_df.end_mth.min()}")
print(f"Maximum start year is: {short_fauna_df.start_year.max()}")
print(f"Minimum start year is: {short_fauna_df.start_year.min()}")
print(f"Maximum start month is: {short_fauna_df.start_mth.max()}")
print(f"Minimum start month is: {short_fauna_df.start_mth.min()}")
print(f"Maximum start date is: {short_fauna_df.start_date.max()}")
print(f"Minimum start date is: {short_fauna_df.start_date.min()}")

Maximum total count is: 128
Minimum total count is: 0
Maximum longitude is: 149.9367981
Minimum longitude is: 140.9933014
Maximum latitude is: -34.5940018
Minimum latitude is: -39.0321007
Maximum end year is: 2020
Minimum end year is: 0
Maximum end month is: 12
Minimum end month is: 0
Maximum start year is: 2020
Minimum start year is: 2015
Maximum start month is: 12
Minimum start month is: 1
Maximum start date is: 2020-03-30 00:00:00
Minimum start date is: 2015-01-01 00:00:00


As can be seen, there are a number of records with total count of 0. They are records of surveys with no sightings of a targeted specie. Hence, we'll remove them.

As there are a lot of null end_date values, their extracted end years and end months equal to 0. Hence we might use start date in our time series visualisation. The null end dates might indicate that a survey hasn't ended up to our group's data extraction.

In [26]:
print(f"The number of records with totalcount of zero: {short_fauna_df[short_fauna_df.totalcount == 0].shape[0]}")

The number of records with totalcount of zero: 623


In [27]:
# Values in location description column
short_fauna_df["location_desc"].unique()

array(['DSS3B', 'DELWP Case Reference number 2017-0060', 'Mon1_camera1',
       ..., '316A', '316B', '316C'], dtype=object)

In [28]:
# Values in record types column
short_fauna_df["recordtype"].unique()

array(['Observation', 'Seen', 'Observation with supporting evidence',
       'Captured and released', 'Indirect evidence', 'Heard', 'Captured'],
      dtype=object)

In [29]:
# Values in reliability column
short_fauna_df["reliability"].unique()

array(['Confirmed', 'Acceptable'], dtype=object)

Values in the location description, record types and reliability columns do not seem to be informative enough. Hence we'll remove these columns. We'll also remove columns project_id and site_id as they are not required for our project's purpose.

In [30]:
# Filter out the records with total count of 0 and remove end_year, end_date, and end_mth columns
final_fauna_df = short_fauna_df[short_fauna_df.totalcount > 0].drop([
    'site_id', 'project_id', "location_desc", 'end_year', 'end_mth', 'end_date', 'recordtype', 'reliability'], axis = 1)
final_fauna_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2089 entries, 195 to 346720
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   record_id   2089 non-null   object        
 1   survey_id   2089 non-null   object        
 2   taxon_id    2089 non-null   object        
 3   taxon_type  2089 non-null   object        
 4   comm_name   2089 non-null   object        
 5   sci_name    2089 non-null   object        
 6   totalcount  2089 non-null   int64         
 7   long        2089 non-null   float64       
 8   lat         2089 non-null   float64       
 9   start_year  2089 non-null   int64         
 10  start_mth   2089 non-null   int64         
 11  start_date  2089 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 212.2+ KB


In [31]:
# Sort the fauna dataframe by comm_name and start_date
final_fauna_df.sort_values(by=["comm_name", "start_date"], ascending=False, inplace=True)
final_fauna_df.head(10)

Unnamed: 0,record_id,survey_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,long,lat,start_year,start_mth,start_date
215900,10101482,1974369,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,3,142.891296,-37.227798,2019,7,2019-07-31
251630,10101212,1974245,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,24,142.959793,-37.034199,2019,7,2019-07-28
183286,10101124,1974203,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,2,145.136703,-37.910301,2019,7,2019-07-27
115272,10101013,1974141,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,1,144.324997,-36.8265,2019,7,2019-07-25
254338,10002266,1934731,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,1,144.506195,-38.239498,2019,7,2019-07-14
329159,9595974,1834431,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,1,142.725494,-37.0028,2019,7,2019-07-12
135869,10100209,1973712,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,2,144.711807,-36.962601,2019,7,2019-07-09
177207,10099902,1973554,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,2,144.613297,-37.314499,2019,7,2019-07-04
213876,10099632,1973406,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,15,144.672394,-37.940201,2019,6,2019-06-30
87682,10099502,1973343,10309,Non-passerine birds,Swift Parrot,Lathamus discolor,1,144.780395,-37.860199,2019,6,2019-06-26


## Filter webscraped animal image data against VBA fauna data 

In [32]:
final_animal_list = final_fauna_df["comm_name"].unique().tolist()
final_animal_list

['Swift Parrot',
 'Regent Honeyeater',
 'New Holland Mouse',
 'Mountain Pygmy-possum',
 'Mallee Emu-wren',
 "Leadbeater's Possum",
 'Hooded Plover',
 'Greater Glider',
 'Giant Burrowing Frog']

In [33]:
# Number of final animals
len(final_animal_list)

9

In [34]:
# Number of taxon ids
taxon_ids = final_fauna_df["taxon_id"].unique().tolist()
len(taxon_ids)

9

The number of animals is equal to the number of taxon ids. For each animal of interest, there is only one corresponding taxon id.

In [35]:
# Filter the webscraped data to have only the above animals
final_scraped_df = scraped_df[scraped_df["animal_name"].isin(final_animal_list)].copy()
final_scraped_df

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"['Over the next five years, Zoos Victoria will..."
11,Leadbeater's Possum,https://www.zoo.org.au/media/2057/22861_leadbe...,Close up view of the face of a Leadbeater Poss...,"Once thought to be extinct, the Leadbeater's P...",The major threats\nThe loss of hollow-bearing ...
13,Mallee Emu-wren,https://www.zoo.org.au/media/1961/23483_mallee...,Mallee Emu-wren in long dry grass looking at t...,The Mallee Emu-wren is particularly vulnerable...,"['In fact, it was a series of fires that cause..."
14,Mountain Pygmy-possum,https://www.zoo.org.au/media/2058/16910_mounta...,Mountain Pygmy Possum standing on its hind leg...,Mountain Pygmy-possums were thought to be exti...,"The major threats \nClimate change, the loss o..."
15,New Holland Mouse,https://www.zoo.org.au/media/1732/new-holland-...,New Holland Mouse getting a health check wrapp...,The New Holland Mouse is classified as extinct...,The major threats\nThe New Holland Mouse is in...
19,Regent Honeyeater,https://www.zoo.org.au/media/2055/22249_regent...,Regent Honeyeater bird on a branch looking dow...,The Regent Honeyeater has been in decline sinc...,The major threats\nThe loss of the Box-Ironbar...
25,Swift Parrot,https://www.zoo.org.au/media/1960/23484_swift_...,Green Swift Parrot perched on a branch looking...,The Swift Parrot is critically endangered.,['Unless we can solve the issues surrounding i...
27,Greater Glider,https://www.environment.vic.gov.au/__data/asse...,Greater Glider,,Species InformationThe Greater Glider is Austr...
28,Hooded Plover,https://www.environment.vic.gov.au/__data/asse...,Hooded Plover,Species Information\n\nHooded Plovers are a re...,Threats\n\nHooded Plovers often share their oc...


In [36]:
# Overview of the scraped data
final_scraped_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 4 to 28
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   animal_name        9 non-null      object
 1   image_url          9 non-null      object
 2   image_alternative  9 non-null      object
 3   introduction       8 non-null      object
 4   threat_paragraph   9 non-null      object
dtypes: object(5)
memory usage: 432.0+ bytes


In [37]:
# Fill the NaN values with None values for the json-converted file to work
final_scraped_df = final_scraped_df.where(final_scraped_df.notnull(), None)
final_scraped_df

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"['Over the next five years, Zoos Victoria will..."
11,Leadbeater's Possum,https://www.zoo.org.au/media/2057/22861_leadbe...,Close up view of the face of a Leadbeater Poss...,"Once thought to be extinct, the Leadbeater's P...",The major threats\nThe loss of hollow-bearing ...
13,Mallee Emu-wren,https://www.zoo.org.au/media/1961/23483_mallee...,Mallee Emu-wren in long dry grass looking at t...,The Mallee Emu-wren is particularly vulnerable...,"['In fact, it was a series of fires that cause..."
14,Mountain Pygmy-possum,https://www.zoo.org.au/media/2058/16910_mounta...,Mountain Pygmy Possum standing on its hind leg...,Mountain Pygmy-possums were thought to be exti...,"The major threats \nClimate change, the loss o..."
15,New Holland Mouse,https://www.zoo.org.au/media/1732/new-holland-...,New Holland Mouse getting a health check wrapp...,The New Holland Mouse is classified as extinct...,The major threats\nThe New Holland Mouse is in...
19,Regent Honeyeater,https://www.zoo.org.au/media/2055/22249_regent...,Regent Honeyeater bird on a branch looking dow...,The Regent Honeyeater has been in decline sinc...,The major threats\nThe loss of the Box-Ironbar...
25,Swift Parrot,https://www.zoo.org.au/media/1960/23484_swift_...,Green Swift Parrot perched on a branch looking...,The Swift Parrot is critically endangered.,['Unless we can solve the issues surrounding i...
27,Greater Glider,https://www.environment.vic.gov.au/__data/asse...,Greater Glider,,Species InformationThe Greater Glider is Austr...
28,Hooded Plover,https://www.environment.vic.gov.au/__data/asse...,Hooded Plover,Species Information\n\nHooded Plovers are a re...,Threats\n\nHooded Plovers often share their oc...


## Load

In [38]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define database and collections
db = client.animal_visual_db
vba_fauna = db.vba_fauna
scraped_fauna = db.scraped_fauna

In [39]:
# Drops collections if available to remove duplicates
vba_fauna.drop()
scraped_fauna.drop()

In [40]:
# Load vba fauna data into the vba_fauna collection
vba_fauna.insert_many(final_fauna_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x14fdf6974c8>

In [41]:
# Load scraped fauna image and info into the scraped_fauna collection
scraped_fauna.insert_many(final_scraped_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x14fd4d09188>

In [42]:
from pprint import pprint

for record in vba_fauna.find():
    pprint(record)

{'_id': ObjectId('5f695a1d145902aefb85b74c'),
 'comm_name': 'Swift Parrot',
 'lat': -37.2277985,
 'long': 142.89129640000002,
 'record_id': '10101482',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2019, 7, 31, 0, 0),
 'start_mth': 7,
 'start_year': 2019,
 'survey_id': '1974369',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 3}
{'_id': ObjectId('5f695a1d145902aefb85b74d'),
 'comm_name': 'Swift Parrot',
 'lat': -37.0341988,
 'long': 142.95979309999998,
 'record_id': '10101212',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2019, 7, 28, 0, 0),
 'start_mth': 7,
 'start_year': 2019,
 'survey_id': '1974245',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 24}
{'_id': ObjectId('5f695a1d145902aefb85b74e'),
 'comm_name': 'Swift Parrot',
 'lat': -37.910301200000006,
 'long': 145.13670349999998,
 'record_id': '10101124',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2019, 7, 27,

 'start_date': datetime.datetime(2018, 9, 30, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1960049',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b7c2'),
 'comm_name': 'Swift Parrot',
 'lat': -36.159099600000005,
 'long': 146.64160159999997,
 'record_id': '10065215',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2018, 9, 29, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1960000',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b7c3'),
 'comm_name': 'Swift Parrot',
 'lat': -37.0838013,
 'long': 143.69160459999998,
 'record_id': '10062744',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2018, 9, 8, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1958882',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 25}
{'_id': ObjectId('5f695a1d145902aefb85

 'lat': -37.7206001,
 'long': 145.04840090000002,
 'record_id': '10049025',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2018, 3, 28, 0, 0),
 'start_mth': 3,
 'start_year': 2018,
 'survey_id': '1952945',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b81f'),
 'comm_name': 'Swift Parrot',
 'lat': -37.6901016,
 'long': 145.0995941,
 'record_id': '8908206',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2018, 3, 27, 0, 0),
 'start_mth': 3,
 'start_year': 2018,
 'survey_id': '1691366',
 'taxon_id': '10309',
 'taxon_type': 'Non-passerine birds',
 'totalcount': 3}
{'_id': ObjectId('5f695a1d145902aefb85b820'),
 'comm_name': 'Swift Parrot',
 'lat': -37.6901016,
 'long': 145.0993958,
 'record_id': '10048723',
 'sci_name': 'Lathamus discolor',
 'start_date': datetime.datetime(2018, 3, 25, 0, 0),
 'start_mth': 3,
 'start_year': 2018,
 'survey_id': '1952804',
 'taxon_id': '10309',
 'taxon

 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b892'),
 'comm_name': 'New Holland Mouse',
 'lat': -37.91490170000001,
 'long': 147.2779999,
 'record_id': '8206803',
 'sci_name': 'Pseudomys novaehollandiae',
 'start_date': datetime.datetime(2016, 3, 14, 0, 0),
 'start_mth': 3,
 'start_year': 2016,
 'survey_id': '1218192',
 'taxon_id': '11455',
 'taxon_type': 'Mammals',
 'totalcount': 3}
{'_id': ObjectId('5f695a1d145902aefb85b893'),
 'comm_name': 'New Holland Mouse',
 'lat': -37.9343987,
 'long': 147.2870026,
 'record_id': '8206819',
 'sci_name': 'Pseudomys novaehollandiae',
 'start_date': datetime.datetime(2016, 3, 14, 0, 0),
 'start_mth': 3,
 'start_year': 2016,
 'survey_id': '1218203',
 'taxon_id': '11455',
 'taxon_type': 'Mammals',
 'totalcount': 6}
{'_id': ObjectId('5f695a1d145902aefb85b894'),
 'comm_name': 'New Holland Mouse',
 'lat': -37.908500700000005,
 'long': 147.2931061,
 'record_id': '8206812',
 'sci_name': 'Pseudomys novaehollandiae',
 'start_date': datetime.dateti

 'lat': -34.7555008,
 'long': 142.33839419999998,
 'record_id': '10065257',
 'sci_name': 'Stipiturus mallee',
 'start_date': datetime.datetime(2018, 9, 29, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1960014',
 'taxon_id': '10527',
 'taxon_type': 'Passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b909'),
 'comm_name': 'Mallee Emu-wren',
 'lat': -34.7555008,
 'long': 142.33839419999998,
 'record_id': '10065259',
 'sci_name': 'Stipiturus mallee',
 'start_date': datetime.datetime(2018, 9, 29, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1960016',
 'taxon_id': '10527',
 'taxon_type': 'Passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b90a'),
 'comm_name': 'Mallee Emu-wren',
 'lat': -34.7839012,
 'long': 141.0677948,
 'record_id': '10065105',
 'sci_name': 'Stipiturus mallee',
 'start_date': datetime.datetime(2018, 9, 28, 0, 0),
 'start_mth': 9,
 'start_year': 2018,
 'survey_id': '1959951',
 'taxon_id': '10527',
 

 'taxon_id': '10527',
 'taxon_type': 'Passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b98d'),
 'comm_name': 'Mallee Emu-wren',
 'lat': -35.799400299999995,
 'long': 141.552597,
 'record_id': '8986211',
 'sci_name': 'Stipiturus mallee',
 'start_date': datetime.datetime(2015, 11, 11, 0, 0),
 'start_mth': 11,
 'start_year': 2015,
 'survey_id': '1734106',
 'taxon_id': '10527',
 'taxon_type': 'Passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b98e'),
 'comm_name': 'Mallee Emu-wren',
 'lat': -35.9943008,
 'long': 141.6280975,
 'record_id': '8986214',
 'sci_name': 'Stipiturus mallee',
 'start_date': datetime.datetime(2015, 11, 11, 0, 0),
 'start_mth': 11,
 'start_year': 2015,
 'survey_id': '1734109',
 'taxon_id': '10527',
 'taxon_type': 'Passerine birds',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85b98f'),
 'comm_name': 'Mallee Emu-wren',
 'lat': -35.8002014,
 'long': 141.5578003,
 'record_id': '8986209',
 'sci_name': 'Stipiturus m

 'start_mth': 10,
 'start_year': 2019,
 'survey_id': '1982470',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b9e7'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.805999799999995,
 'long': 146.3235016,
 'record_id': '10131964',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2019, 10, 1, 0, 0),
 'start_mth': 10,
 'start_year': 2019,
 'survey_id': '1982452',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b9e8'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.804798100000006,
 'long': 146.32420349999998,
 'record_id': '10131967',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2019, 10, 1, 0, 0),
 'start_mth': 10,
 'start_year': 2019,
 'survey_id': '1982454',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85b9e9'),
 'comm_name': "Leadbeater's Possum",
 'lat': -3

 'survey_id': '1733991',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85ba5b'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.819599200000006,
 'long': 146.1479034,
 'record_id': '8986762',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2018, 8, 25, 0, 0),
 'start_mth': 8,
 'start_year': 2018,
 'survey_id': '1734181',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85ba5c'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.536201500000004,
 'long': 145.60960390000002,
 'record_id': '8986761',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2018, 8, 16, 0, 0),
 'start_mth': 8,
 'start_year': 2018,
 'survey_id': '1734180',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85ba5d'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.7751007,
 'long': 145.9927063,
 'record_i

 'record_id': '8584039',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2016, 10, 25, 0, 0),
 'start_mth': 10,
 'start_year': 2016,
 'survey_id': '1397803',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bacb'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.83959960000001,
 'long': 146.3580017,
 'record_id': '8584035',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2016, 10, 25, 0, 0),
 'start_mth': 10,
 'start_year': 2016,
 'survey_id': '1397800',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bacc'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.838501,
 'long': 146.3578033,
 'record_id': '8584032',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2016, 10, 25, 0, 0),
 'start_mth': 10,
 'start_year': 2016,
 'survey_id': '1397799',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'total

 'start_year': 2016,
 'survey_id': '1218263',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bb24'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.8652992,
 'long': 145.8789063,
 'record_id': '8206890',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2016, 1, 21, 0, 0),
 'start_mth': 1,
 'start_year': 2016,
 'survey_id': '1218250',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bb25'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.8572998,
 'long': 145.8533936,
 'record_id': '8206903',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2016, 1, 21, 0, 0),
 'start_mth': 1,
 'start_year': 2016,
 'survey_id': '1218259',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bb26'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.799099,
 'long': 145.8800964,
 'record_id':

{'_id': ObjectId('5f695a1d145902aefb85bb6b'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.8527985,
 'long': 146.24659730000002,
 'record_id': '6942338',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2015, 10, 20, 0, 0),
 'start_mth': 10,
 'start_year': 2015,
 'survey_id': '1102258',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bb6c'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.930000299999996,
 'long': 146.3294067,
 'record_id': '6942355',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2015, 10, 19, 0, 0),
 'start_mth': 10,
 'start_year': 2015,
 'survey_id': '1102271',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bb6d'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.778400399999995,
 'long': 146.14480590000002,
 'record_id': '6942331',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime

 'lat': -37.7454987,
 'long': 146.2458038,
 'record_id': '6870902',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2015, 4, 23, 0, 0),
 'start_mth': 4,
 'start_year': 2015,
 'survey_id': '1085359',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bbb5'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.760299700000004,
 'long': 146.24139399999999,
 'record_id': '6870893',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2015, 4, 23, 0, 0),
 'start_mth': 4,
 'start_year': 2015,
 'survey_id': '1085353',
 'taxon_id': '11141',
 'taxon_type': 'Mammals',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bbb6'),
 'comm_name': "Leadbeater's Possum",
 'lat': -37.7478981,
 'long': 146.24969480000001,
 'record_id': '6870906',
 'sci_name': 'Gymnobelideus leadbeateri',
 'start_date': datetime.datetime(2015, 4, 23, 0, 0),
 'start_mth': 4,
 'start_year': 2015,
 'survey_id': '1085360',
 '

 'long': 146.3088074,
 'record_id': '10099630',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 6, 29, 0, 0),
 'start_mth': 6,
 'start_year': 2019,
 'survey_id': '1973405',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bc04'),
 'comm_name': 'Hooded Plover',
 'lat': -38.281299600000004,
 'long': 144.4328003,
 'record_id': '10002036',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 6, 28, 0, 0),
 'start_mth': 6,
 'start_year': 2019,
 'survey_id': '1934647',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bc05'),
 'comm_name': 'Hooded Plover',
 'lat': -38.5438995,
 'long': 145.33949280000002,
 'record_id': '10099152',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 6, 22, 0, 0),
 'start_mth': 6,
 'start_year': 2019,
 'survey_id': '1973160',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalco

 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 2, 17, 0, 0),
 'start_mth': 2,
 'start_year': 2019,
 'survey_id': '1967999',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 12}
{'_id': ObjectId('5f695a1d145902aefb85bc5a'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3564987,
 'long': 142.31089780000002,
 'record_id': '10086017',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 2, 16, 0, 0),
 'start_mth': 2,
 'start_year': 2019,
 'survey_id': '1967934',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bc5b'),
 'comm_name': 'Hooded Plover',
 'lat': -38.267601,
 'long': 144.50619509999999,
 'record_id': '9999478',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2019, 2, 16, 0, 0),
 'start_mth': 2,
 'start_year': 2019,
 'survey_id': '1933583',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bc5

 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bc9e'),
 'comm_name': 'Hooded Plover',
 'lat': -38.7653008,
 'long': 143.66999819999998,
 'record_id': '9996853',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 12, 29, 0, 0),
 'start_mth': 12,
 'start_year': 2018,
 'survey_id': '1932777',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bc9f'),
 'comm_name': 'Hooded Plover',
 'lat': -37.857399,
 'long': 148.0915985,
 'record_id': '9021134',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 12, 28, 0, 0),
 'start_mth': 12,
 'start_year': 2018,
 'survey_id': '1754774',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bca0'),
 'comm_name': 'Hooded Plover',
 'lat': -38.457698799999996,
 'long': 145.2964935,
 'record_id': '10076614',
 'sci_name': 'Thinornis cucullatus',
 'start_

 'record_id': '10068984',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 10, 28, 0, 0),
 'start_mth': 10,
 'start_year': 2018,
 'survey_id': '1961657',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bcf0'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3819008,
 'long': 142.23109440000002,
 'record_id': '10068335',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 10, 24, 0, 0),
 'start_mth': 10,
 'start_year': 2018,
 'survey_id': '1961401',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 8}
{'_id': ObjectId('5f695a1d145902aefb85bcf1'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3550987,
 'long': 142.3352966,
 'record_id': '10068255',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 10, 23, 0, 0),
 'start_mth': 10,
 'start_year': 2018,
 'survey_id': '1961355',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 5}
{'_id': ObjectI

 'lat': -38.7773018,
 'long': 143.6645966,
 'record_id': '9992019',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 7, 19, 0, 0),
 'start_mth': 7,
 'start_year': 2018,
 'survey_id': '1930890',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 5}
{'_id': ObjectId('5f695a1d145902aefb85bd42'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3232994,
 'long': 144.68299869999998,
 'record_id': '10058879',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 7, 18, 0, 0),
 'start_mth': 7,
 'start_year': 2018,
 'survey_id': '1957094',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 3}
{'_id': ObjectId('5f695a1d145902aefb85bd43'),
 'comm_name': 'Hooded Plover',
 'lat': -38.7564011,
 'long': 143.6705933,
 'record_id': '9992010',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 7, 16, 0, 0),
 'start_mth': 7,
 'start_year': 2018,
 'survey_id': '1930884',
 'taxon_id': '10138',
 'taxon_type': 'Waders'

 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bd98'),
 'comm_name': 'Hooded Plover',
 'lat': -37.5845985,
 'long': 149.7368927,
 'record_id': '10043057',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 2, 3, 0, 0),
 'start_mth': 2,
 'start_year': 2018,
 'survey_id': '1950567',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bd99'),
 'comm_name': 'Hooded Plover',
 'lat': -38.2750015,
 'long': 144.5036926,
 'record_id': '9989181',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 2, 3, 0, 0),
 'start_mth': 2,
 'start_year': 2018,
 'survey_id': '1929580',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bd9a'),
 'comm_name': 'Hooded Plover',
 'lat': -38.2750015,
 'long': 144.5036926,
 'record_id': '9989220',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2018, 2, 3, 0, 0),
 'start_mth': 2,
 'star

 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bdf0'),
 'comm_name': 'Hooded Plover',
 'lat': -38.5438995,
 'long': 145.33949280000002,
 'record_id': '10029096',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 10, 16, 0, 0),
 'start_mth': 10,
 'start_year': 2017,
 'survey_id': '1945273',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bdf1'),
 'comm_name': 'Hooded Plover',
 'lat': -38.428901700000004,
 'long': 144.18099980000002,
 'record_id': '9986246',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 10, 16, 0, 0),
 'start_mth': 10,
 'start_year': 2017,
 'survey_id': '1928356',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bdf2'),
 'comm_name': 'Hooded Plover',
 'lat': -38.5758018,
 'long': 145.51089480000002,
 'record_id': '10029112',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 

 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 6, 8, 0, 0),
 'start_mth': 6,
 'start_year': 2017,
 'survey_id': '1927552',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85be45'),
 'comm_name': 'Hooded Plover',
 'lat': -38.2858009,
 'long': 144.46760559999998,
 'record_id': '9984556',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 6, 7, 0, 0),
 'start_mth': 6,
 'start_year': 2017,
 'survey_id': '1927549',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 3}
{'_id': ObjectId('5f695a1d145902aefb85be46'),
 'comm_name': 'Hooded Plover',
 'lat': -38.2858009,
 'long': 144.46760559999998,
 'record_id': '9984557',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 6, 7, 0, 0),
 'start_mth': 6,
 'start_year': 2017,
 'survey_id': '1927550',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85be47'),

 'start_mth': 2,
 'start_year': 2017,
 'survey_id': '1937907',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85be91'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3564987,
 'long': 142.31089780000002,
 'record_id': '10009949',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 2, 20, 0, 0),
 'start_mth': 2,
 'start_year': 2017,
 'survey_id': '1937912',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85be92'),
 'comm_name': 'Hooded Plover',
 'lat': -38.428901700000004,
 'long': 144.18099980000002,
 'record_id': '9982623',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2017, 2, 17, 0, 0),
 'start_mth': 2,
 'start_year': 2017,
 'survey_id': '1926572',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 5}
{'_id': ObjectId('5f695a1d145902aefb85be93'),
 'comm_name': 'Hooded Plover',
 'lat': -38.428901700000004,
 'long': 144.1809

 'start_year': 2016,
 'survey_id': '1924281',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bee2'),
 'comm_name': 'Hooded Plover',
 'lat': -38.0545006,
 'long': 140.99560549999998,
 'record_id': '10121709',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2016, 11, 9, 0, 0),
 'start_mth': 11,
 'start_year': 2016,
 'survey_id': '1981567',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 1}
{'_id': ObjectId('5f695a1d145902aefb85bee3'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3181,
 'long': 144.3480988,
 'record_id': '9956151',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2016, 10, 25, 0, 0),
 'start_mth': 10,
 'start_year': 2016,
 'survey_id': '1923672',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bee4'),
 'comm_name': 'Hooded Plover',
 'lat': -38.428901700000004,
 'long': 144.18099980000002,
 'record_id': '99558

 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bf2e'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3139,
 'long': 144.35839840000003,
 'record_id': '9855078',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2015, 12, 13, 0, 0),
 'start_mth': 12,
 'start_year': 2015,
 'survey_id': '1915389',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bf2f'),
 'comm_name': 'Hooded Plover',
 'lat': -38.3139,
 'long': 144.35839840000003,
 'record_id': '9855098',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2015, 12, 13, 0, 0),
 'start_mth': 12,
 'start_year': 2015,
 'survey_id': '1915390',
 'taxon_id': '10138',
 'taxon_type': 'Waders',
 'totalcount': 2}
{'_id': ObjectId('5f695a1d145902aefb85bf30'),
 'comm_name': 'Hooded Plover',
 'lat': -38.2837982,
 'long': 144.4264984,
 'record_id': '9851193',
 'sci_name': 'Thinornis cucullatus',
 'start_date': datetime.datetime(2015

In [43]:
for record in scraped_fauna.find():
    pprint(record)

{'_id': ObjectId('5f695a1d145902aefb85bf75'),
 'animal_name': 'Giant Burrowing Frog',
 'image_alternative': 'Giant Burrowing Frog on wet rocks side view. The frog '
                      'is dark brown with yellow lips and spots on its side.',
 'image_url': 'https://www.zoo.org.au/media/2056/23479_giant_burrowing_frog_-_credit_required_offsite1.jpg?anchor=center&mode=crop&quality=75&width=2000&height=570&rnd=132131643480000000',
 'introduction': 'Although we know that populations of the Giant Burrowing '
                 'Frog are in decline, this elusive digger is one of '
                 'Victoria’s most poorly understood species.',
 'threat_paragraph': "['Over the next five years, Zoos Victoria will carry out "
                     'important on-the-ground surveys to discover more about '
                     'the Giant Burrowing Frog and its remote habitat. In the '
                     'meantime, we are securing the wellbeing of the '
                     "individual frogs in our

In [44]:
type(vba_fauna.find())

pymongo.cursor.Cursor

In [45]:
type(scraped_fauna.find())

pymongo.cursor.Cursor

## Test aggregations by animal names

In [46]:
# Aggregate total sightings by each animal (represented in common names, science names, taxon ids and taxon types) over 5 years
metadata = list(
    vba_fauna.aggregate(
    [
          {
                  "$group" : {
                      "_id" :"$comm_name",
                      "scientific_name": { "$first": "$sci_name" },
                      "taxon_id": { "$first": "$taxon_id" },
                      "taxon_type": { "$first": "$taxon_type" },
                      "total_sightings": { "$sum": "$totalcount" },
                  }
          }
    ]))

metadata

[{'_id': 'Mallee Emu-wren',
  'science_name': 'Stipiturus mallee',
  'taxon_id': '10527',
  'taxon_type': 'Passerine birds',
  'totalSightings': 722},
 {'_id': 'Swift Parrot',
  'science_name': 'Lathamus discolor',
  'taxon_id': '10309',
  'taxon_type': 'Non-passerine birds',
  'totalSightings': 2695},
 {'_id': "Leadbeater's Possum",
  'science_name': 'Gymnobelideus leadbeateri',
  'taxon_id': '11141',
  'taxon_type': 'Mammals',
  'totalSightings': 654},
 {'_id': 'Regent Honeyeater',
  'science_name': 'Anthochaera phrygia',
  'taxon_id': '10603',
  'taxon_type': 'Passerine birds',
  'totalSightings': 41},
 {'_id': 'Mountain Pygmy-possum',
  'science_name': 'Burramys parvus',
  'taxon_id': '11156',
  'taxon_type': 'Mammals',
  'totalSightings': 823},
 {'_id': 'Greater Glider',
  'science_name': 'fam. Pseudocheiridae gen. Petauroides',
  'taxon_id': '527397',
  'taxon_type': 'Mammals',
  'totalSightings': 1},
 {'_id': 'New Holland Mouse',
  'science_name': 'Pseudomys novaehollandiae',
  

In [47]:
# Aggregate records by animal name
records_by_animal = list(vba_fauna.aggregate([
    {
        "$group" : {
            "_id" : "$comm_name",
            "record_id": { "$push": "$record_id" },
            "survey_id": { "$push": "$survey_id" },
            "number_sightings": { "$push": "$totalcount" },
            "long": { "$push": "$long" },
            "lat": { "$push": "$lat" },
            "start_year": { "$push": "$start_year" },
            "start_mth": { "$push": "$start_mth" },
            "start_date": { "$push": "$start_date" }
        }
    }
]))

records_by_animal[2]

{'_id': "Leadbeater's Possum",
 'record_id': ['10150668',
  '10150669',
  '10150670',
  '10150672',
  '10150674',
  '10150678',
  '10150675',
  '10179333',
  '10183271',
  '10183274',
  '10183275',
  '10183276',
  '10183278',
  '10183285',
  '10183288',
  '10183294',
  '10183222',
  '10183234',
  '10149640',
  '10149642',
  '10149644',
  '10149648',
  '10149649',
  '10149651',
  '10149657',
  '10142465',
  '10142463',
  '10183077',
  '10183174',
  '10183175',
  '10183177',
  '10183067',
  '10183069',
  '10183074',
  '10149683',
  '10149698',
  '10132689',
  '10132688',
  '10183058',
  '10142462',
  '10134715',
  '10134774',
  '10134639',
  '10134643',
  '10134646',
  '10134649',
  '10134651',
  '10134655',
  '10134662',
  '10134733',
  '10134739',
  '10134724',
  '10134729',
  '10134731',
  '10134743',
  '10134747',
  '10134776',
  '10134778',
  '10134780',
  '10134803',
  '10134811',
  '10134824',
  '10132234',
  '10132227',
  '10132229',
  '10142474',
  '10132790',
  '10132799',
  '1