# Merging together the raw iNaturalist exports

## README:  
This notebook is pretty straightforward: I had 12 raw exports from the iNaturalist "Export Observations" tool (https://www.inaturalist.org/observations/export), and here I simply merge them into a single usable CSV file: `'../data/combined_raw_inaturalist_export.csv'`  

### imports

In [1]:
import pandas as pd
import os

# Merge the multiple iNaturalist export files (each export is capped at 200,000 observations)

### specify input files (raw csv.zip exports from the iNaturalist export tool)

In [2]:
# Directory that holds the raw export archives read by this notebook
raw_obs_directory = "../../raw_inaturalist_exports/"

In [3]:
# Input file names, exactly as downloaded from iNaturalist. Every export is
# named 'observations-<export id>.csv.zip', so the list is built from the ids
# (kept in the original order).
filenames = [
    f'observations-{export_id}.csv.zip'
    for export_id in (
        365743,
        365693,
        365799,
        365778,
        365833,
        366053,
        365710,
        365581,
        365843,
        365660,
        365727,
        365820,
    )
]

In [4]:
# Prefix every export file name with the raw-export directory
file_paths = [
    os.path.join(raw_obs_directory, export_name)
    for export_name in filenames
]

### read in input files, concatenate them and write out one big file

In [5]:
# Destination of the single merged CSV written by this notebook
output_path = "../data/combined_raw_inaturalist_export.csv"

In [6]:
# Read each export into its own DataFrame and collect them in a list.
# Concatenating once at the end avoids re-copying every previously loaded row
# on each iteration, which is what calling pd.concat inside the loop does.
frames = []

for file_path in file_paths:
    try:
        # pandas reads the zipped CSV directly; low_memory=False parses each
        # file in one pass instead of in chunks.
        df = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except pd.errors.EmptyDataError:
        print(f"No data in file: {file_path}")
    except pd.errors.ParserError:
        print(f"Error parsing data from file: {file_path}")
    except Exception as e:
        # Best-effort merge: report the problem (including which file caused
        # it) and carry on with the remaining exports.
        print(f"An unexpected error occurred while reading {file_path}: {e}")
    else:
        frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when no file could be read.
if frames:
    all_data = pd.concat(frames, ignore_index=True)
else:
    all_data = pd.DataFrame()

# Write the concatenated data to a new CSV file (without the row index)
all_data.to_csv(output_path, index=False)


# Check some basic info on our combined data

In [7]:
# List the column labels of the combined table
all_data.columns

Index(['id', 'observed_on_string', 'observed_on', 'time_observed_at',
       'time_zone', 'user_id', 'user_login', 'user_name', 'created_at',
       'updated_at', 'quality_grade', 'license', 'url', 'image_url',
       'sound_url', 'tag_list', 'description', 'num_identification_agreements',
       'num_identification_disagreements', 'captive_cultivated',
       'oauth_application_id', 'place_guess', 'latitude', 'longitude',
       'positional_accuracy', 'private_place_guess', 'private_latitude',
       'private_longitude', 'public_positional_accuracy', 'geoprivacy',
       'taxon_geoprivacy', 'coordinates_obscured', 'positioning_method',
       'positioning_device', 'species_guess', 'scientific_name', 'common_name',
       'iconic_taxon_name', 'taxon_id'],
      dtype='object')

In [8]:
# Total number of observations = number of rows in the combined table
all_data.shape[0]

1763821

In [9]:
# what is the lat/lon spread of the data?
import toyplot

# Randomly subsampling rows makes the plot lighter to draw. DataFrame.sample
# raises ValueError when asked for more rows than exist (possible if some
# exports failed to load), so cap the request at the number of rows available.
subsample = all_data.sample(min(5000, len(all_data)))

# Longitude on the x axis, latitude on the y axis; the trailing semicolon
# keeps the notebook from echoing the call's return value.
toyplot.scatterplot(subsample.longitude, subsample.latitude, width=1000, height=600);