In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load only the header row (nrows=0) of the fauna CSV so the column names
# are available without reading any data rows.
col_names = pd.read_csv(
    "../1_data/VBA_2015_2020.csv",
    nrows=0,
).columns
col_names

Index(['RECORD_ID', 'SITE_ID', 'SURVEY_ID', 'PROJECT_ID', 'TAXON_ID',
       'SCI_NAME', 'COMM_NAME', 'RECORDTYPE', 'RELIABILTY', 'TOTALCOUNT',
       'STARTDATE', 'START_YEAR', 'START_MTH', 'ENDDATE', 'END_YEAR',
       'END_MTH', 'LOCN_DESC', 'TAXON_TYPE', 'LONG_DD94', 'LAT_DD94'],
      dtype='object')

In [3]:
# Columns that should not be treated as strings, grouped by target type:
# the count/year/month columns are integers, the two coordinate columns are floats.
dtypes_dict = {
    **dict.fromkeys(
        ["TOTALCOUNT", "START_YEAR", "START_MTH", "END_YEAR", "END_MTH"], int
    ),
    **dict.fromkeys(["LONG_DD94", "LAT_DD94"], float),
}

In [4]:
# Read in the Victorian fauna CSV.
# Every column gets an explicit dtype: the numeric types declared in
# dtypes_dict where present, and str for everything else (IDs, names,
# descriptions). Previously dtypes_dict was defined but never passed to
# read_csv, so the numeric columns were left to pandas' type inference.
# STARTDATE and ENDDATE are additionally parsed into datetimes.
fauna_data = pd.read_csv(
    "../1_data/VBA_2015_2020.csv",
    parse_dates = ["STARTDATE", "ENDDATE"],
    dtype = {col: dtypes_dict.get(col, str) for col in col_names})
fauna_data.head()

Unnamed: 0,RECORD_ID,SITE_ID,SURVEY_ID,PROJECT_ID,TAXON_ID,SCI_NAME,COMM_NAME,RECORDTYPE,RELIABILTY,TOTALCOUNT,STARTDATE,START_YEAR,START_MTH,ENDDATE,END_YEAR,END_MTH,LOCN_DESC,TAXON_TYPE,LONG_DD94,LAT_DD94
0,8597419,947931,1405903,4377,1557,Paratya australiensis,Common Freshwater Shrimp,Observation,Confirmed,0,2016-12-16,2016,12,NaT,0,0,McCallum Creek-4_7-TR-16-333,"Mussels, decopod crustacea",143.649002,-37.283901
1,9067844,1084677,1776514,5326,10408,Colluricincla harmonica,Grey Shrike-thrush,Observation with supporting evidence,Acceptable,0,2018-08-28,2018,8,2018-09-25,2018,9,345-513-0003 FSQ1,Passerine birds,145.768997,-37.782501
2,8218590,771970,1221401,4366,10991,Turdus merula,Common Blackbird,Seen,Acceptable,0,2015-04-12,2015,4,NaT,0,0,Ocean Acres Bush Park Nature Reserve,Passerine birds,144.287399,-38.315601
3,9047388,1070861,1760792,5326,11242,Wallabia bicolor,Black-tailed Wallaby,Observation with supporting evidence,Acceptable,0,2018-11-14,2018,11,2018-12-14,2018,12,833-518-0004 BUQ1,Mammals,148.848099,-37.601601
4,9359539,1116727,1809370,5543,10525,Cisticola exilis,Golden-headed Cisticola,Seen,Acceptable,1,2015-05-07,2015,5,2015-05-07,2015,5,MANNIBADAR (581481),Passerine birds,143.481903,-37.781799


In [5]:
# Give every column a lowercase name. Most names are simply lower-cased;
# a few are also tidied: STARTDATE/ENDDATE gain an underscore, the source's
# misspelt RELIABILTY becomes "reliability", LOCN_DESC becomes
# "location_desc", and the coordinate columns are shortened to "long"/"lat".
fauna_df = fauna_data.rename(
    columns=dict(
        RECORD_ID="record_id",
        SITE_ID="site_id",
        SURVEY_ID="survey_id",
        PROJECT_ID="project_id",
        TAXON_ID="taxon_id",
        SCI_NAME="sci_name",
        COMM_NAME="comm_name",
        RECORDTYPE="recordtype",
        RELIABILTY="reliability",
        TOTALCOUNT="totalcount",
        STARTDATE="start_date",
        START_YEAR="start_year",
        START_MTH="start_mth",
        ENDDATE="end_date",
        END_YEAR="end_year",
        END_MTH="end_mth",
        LOCN_DESC="location_desc",
        TAXON_TYPE="taxon_type",
        LONG_DD94="long",
        LAT_DD94="lat",
    )
)

In [6]:
# Check whether every row carries a distinct record_id
fauna_df["record_id"].is_unique

True

In [7]:
# Report how many distinct record ids the data contains
print(f"Number of unique record ids: {fauna_df['record_id'].nunique()}")

Number of unique record ids: 346829


In [8]:
# Check whether every row carries a distinct survey_id
fauna_df["survey_id"].is_unique

False

In [9]:
# Report how many distinct survey ids the data contains
print(f"Number of unique survey ids: {fauna_df['survey_id'].nunique()}")

Number of unique survey ids: 97240


In [10]:
# Check whether every row carries a distinct site_id
fauna_df["site_id"].is_unique

False

In [11]:
# Report how many distinct site ids the data contains
print(f"Number of unique site ids: {fauna_df['site_id'].nunique()}")

Number of unique site ids: 55164


In [12]:
# Check whether every row carries a distinct project_id
fauna_df["project_id"].is_unique

False

In [13]:
# Report how many distinct project ids the data contains
print(f"Number of unique project ids: {fauna_df['project_id'].nunique()}")

Number of unique project ids: 522


In [14]:
# Check whether every row carries a distinct taxon_id
fauna_df["taxon_id"].is_unique

False

In [15]:
# Report how many distinct taxon ids the data contains
print(f"Number of unique taxon ids: {fauna_df['taxon_id'].nunique()}")

Number of unique taxon ids: 941


In [16]:
# Report how many distinct taxon types the data contains
print(f"Number of unique taxon types: {fauna_df['taxon_type'].nunique()}")

Number of unique taxon types: 14


In [17]:
# Put the columns into a reading order grouped by theme. Selecting with a
# list of names returns the same columns in the listed order.
fauna_df = fauna_df[[
    # identifiers
    "record_id", "survey_id", "site_id", "project_id", "taxon_id",
    # what was recorded
    "taxon_type", "comm_name", "sci_name", "totalcount",
    # where
    "location_desc", "long", "lat",
    # when (end fields first, then start fields)
    "end_year", "end_mth", "end_date",
    "start_year", "start_mth", "start_date",
    # record quality
    "recordtype", "reliability",
]]
fauna_df.head()

Unnamed: 0,record_id,survey_id,site_id,project_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,location_desc,long,lat,end_year,end_mth,end_date,start_year,start_mth,start_date,recordtype,reliability
0,8597419,1405903,947931,4377,1557,"Mussels, decopod crustacea",Common Freshwater Shrimp,Paratya australiensis,0,McCallum Creek-4_7-TR-16-333,143.649002,-37.283901,0,0,NaT,2016,12,2016-12-16,Observation,Confirmed
1,9067844,1776514,1084677,5326,10408,Passerine birds,Grey Shrike-thrush,Colluricincla harmonica,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
2,8218590,1221401,771970,4366,10991,Passerine birds,Common Blackbird,Turdus merula,0,Ocean Acres Bush Park Nature Reserve,144.287399,-38.315601,0,0,NaT,2015,4,2015-04-12,Seen,Acceptable
3,9047388,1760792,1070861,5326,11242,Mammals,Black-tailed Wallaby,Wallabia bicolor,0,833-518-0004 BUQ1,148.848099,-37.601601,2018,12,2018-12-14,2018,11,2018-11-14,Observation with supporting evidence,Acceptable
4,9359539,1809370,1116727,5543,10525,Passerine birds,Golden-headed Cisticola,Cisticola exilis,1,MANNIBADAR (581481),143.481903,-37.781799,2015,5,2015-05-07,2015,5,2015-05-07,Seen,Acceptable


In [18]:
# Overview of the fauna data: prints the index range, each column's
# non-null count and dtype, so missing values and column types can be
# checked at a glance.
fauna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346829 entries, 0 to 346828
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   record_id      346829 non-null  object        
 1   survey_id      346829 non-null  object        
 2   site_id        346829 non-null  object        
 3   project_id     346829 non-null  object        
 4   taxon_id       346829 non-null  object        
 5   taxon_type     346829 non-null  object        
 6   comm_name      346829 non-null  object        
 7   sci_name       346829 non-null  object        
 8   totalcount     346829 non-null  int64         
 9   location_desc  346829 non-null  object        
 10  long           346829 non-null  float64       
 11  lat            346829 non-null  float64       
 12  end_year       346829 non-null  int64         
 13  end_mth        346829 non-null  int64         
 14  end_date       52465 non-null   datetime64[ns]
 15  

In [21]:
# Some records have a totalcount of 0. Pull those rows out into their own
# frame and preview the first ten to see what kind of records they are.
zero_totalcount = fauna_df.loc[fauna_df["totalcount"].eq(0)]
zero_totalcount.head(10)

Unnamed: 0,record_id,survey_id,site_id,project_id,taxon_id,taxon_type,comm_name,sci_name,totalcount,location_desc,long,lat,end_year,end_mth,end_date,start_year,start_mth,start_date,recordtype,reliability
0,8597419,1405903,947931,4377,1557,"Mussels, decopod crustacea",Common Freshwater Shrimp,Paratya australiensis,0,McCallum Creek-4_7-TR-16-333,143.649002,-37.283901,0,0,NaT,2016,12,2016-12-16,Observation,Confirmed
1,9067844,1776514,1084677,5326,10408,Passerine birds,Grey Shrike-thrush,Colluricincla harmonica,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
2,8218590,1221401,771970,4366,10991,Passerine birds,Common Blackbird,Turdus merula,0,Ocean Acres Bush Park Nature Reserve,144.287399,-38.315601,0,0,NaT,2015,4,2015-04-12,Seen,Acceptable
3,9047388,1760792,1070861,5326,11242,Mammals,Black-tailed Wallaby,Wallabia bicolor,0,833-518-0004 BUQ1,148.848099,-37.601601,2018,12,2018-12-14,2018,11,2018-11-14,Observation with supporting evidence,Acceptable
6,9067841,1776514,1084677,5326,10488,Passerine birds,White-browed Scrubwren,Sericornis frontalis,0,345-513-0003 FSQ1,145.768997,-37.782501,2018,9,2018-09-25,2018,8,2018-08-28,Observation with supporting evidence,Acceptable
9,8897604,1686135,1000965,4335,528552,Mammals,Red Fox,Vulpes vulpes,0,Basalt 10,144.098206,-37.2705,2018,2,2018-02-07,2018,1,2018-01-17,Observation with supporting evidence,Acceptable
10,9345216,1799930,1107289,2936,5140,Fish,Dry waterbody,Misc Dry,0,Pig and Whistle Creek-2_23-TR-18-421,147.853394,-37.532299,0,0,NaT,2018,11,2018-11-16,Observation,Confirmed
12,8432377,1292811,840246,4551,10364,Passerine birds,Willie Wagtail,Rhipidura leucophrys,0,Bulla Hill and School Hill,144.8022,-37.634102,0,0,NaT,2015,5,2015-05-18,Observation,Acceptable
13,8956790,1715391,1027944,4836,11003,Mammals,Short-beaked Echidna,Tachyglossus aculeatus,0,New Holland Mouse camera survey_Site_NHM423,147.533798,-38.084,2018,3,2018-03-20,2018,3,2018-03-07,Observation with supporting evidence,Acceptable
15,8994856,1741637,1052493,5326,11115,Mammals,Mountain Brush-tailed Possum,Trichosurus cunninghami,0,298-516-0003,145.521393,-37.442902,2018,8,2018-08-23,2018,7,2018-07-27,Observation with supporting evidence,Acceptable


In [23]:
# Number of rows whose totalcount is 0
len(zero_totalcount)

69147

In [24]:
# Share of rows with totalcount equal to 0, as a percentage of all rows in the dataframe
len(zero_totalcount) / len(fauna_df) * 100

19.936914156544002

In [None]:
# Overview of unique taxon types