<h3> R snippet: </h3>

- remove duplicates based on lat,long,species,year 
- add seasons info

Use R kernel to run

In [None]:
library(tidyverse)

# Raw GBIF export: tab-separated despite the .csv extension.
total <- read.csv("GBif_Original.csv", stringsAsFactors = FALSE, sep = "\t")

# Season flags are computed per (species, year, latitude, longitude) group from
# every month recorded for that group.
#
# The previous version collapsed the months of a group into one string
# (e.g. "3, 4") and then tested `c(3:5) %in% "3, 4"`, which is FALSE for every
# element, so any group observed in more than one month ended up with all four
# flags set to 0. It also attached the flags by row position after sorting two
# separate tables on a latitude/longitude key pasted together without a
# separator. Computing the flags with a grouped summary and joining them back
# on the real key columns avoids both problems.
season_flags <- total %>%
  group_by(species, year, decimalLatitude, decimalLongitude) %>%
  summarise(
    Winter = as.numeric(any(month %in% c(12, 1, 2))),
    Spring = as.numeric(any(month %in% 3:5)),
    Summer = as.numeric(any(month %in% 6:8)),
    Fall = as.numeric(any(month %in% 9:11))
  ) %>%
  ungroup()

# Keep the first record of every (species, year, latitude, longitude) group,
# attach its season flags and sort for a stable output order.
total_unq <- total %>%
  distinct(species, year, decimalLatitude, decimalLongitude,
           .keep_all = TRUE) %>%
  left_join(
    season_flags,
    by = c("species", "year", "decimalLatitude", "decimalLongitude")
  ) %>%
  arrange(species, year, decimalLatitude, decimalLongitude)

# Row names are written as the first (unnamed) column, as before.
write.csv(total_unq, "GBif_R.csv")


<h3> Python snippets: </h3>

- convert dataset keys to dataset names
- add redList designation
- add common names from ITIS
- drop unnecessary columns
- add geometry from lat, long
- convert data to species per year in geojson

Use Python2 kernel to run

In [59]:
import json
import os
import pickle
import random
import sqlite3

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from shapely.geometry import Point
# from geojson import GeometryCollection

from python_helpers import column_helpers as ch

In [9]:
# Load the CSV written by the R snippet ("GBif_R.csv") and enrich it.
# NOTE: applyTransformations is defined in a later cell of this notebook, so
# that cell has to be executed before this one.
df = pd.read_csv("GBif_R.csv", low_memory=False)
df = applyTransformations(df)

In [60]:
def yearSpeciesCount(df):
    """Count how many rows of ``df`` fall in each year.

    Rows whose ``year`` is missing are ignored and years are truncated to
    integers. The input frame is not modified.

    Returns:
        A list of ``(year, row_count)`` tuples sorted by year. The previous
        version built this list but never returned it.
    """
    years = df['year'].dropna().astype(int)
    per_year = years.value_counts().sort_index()
    return [(int(year), int(count)) for year, count in zip(per_year.index, per_year)]

def some(df, n):
    """Return ``n`` rows of ``df`` picked at random without replacement.

    Uses ``DataFrame.sample`` instead of ``random.sample(df.index, n)``: the
    latter needs the ``random`` module and rejects a pandas Index on current
    Python versions because it is not a sequence type. Like the original, this
    raises ``ValueError`` when ``n`` exceeds the number of rows.
    """
    return df.sample(n=n)
    
def testingFilter(df):
    df = df[(df['year'] == 2017) | (df['year'] == 2016) | (df['year'] == 2015)]
    df = df[(df["species"] == "Corvus caurinus") | (df["species"] == "Turdus migratorius") |
           (df["species"] == "Larus glaucescens")]
    return df

def applyTransformations(df):
    """Pass ``df`` through three ``column_helpers`` steps and return the result.

    The steps run in this order: ``dataSetNamesFromKey`` (with the
    "datasetKeyNames.pickle" file), ``addCommonNames`` (with "ITIS.sqlite") and
    ``addRedList`` (with "redlist_assessments.csv").
    NOTE(review): what each helper adds is inferred from its name — presumably
    dataset names, common names and red-list status; confirm in python_helpers.
    """
    with_dataset_names = ch.dataSetNamesFromKey(df, read_pickle="datasetKeyNames.pickle")
    with_common_names = ch.addCommonNames(with_dataset_names, "ITIS.sqlite")
    return ch.addRedList(with_common_names, "redlist_assessments.csv")

def pointCSVtoJSONPerYear(df):
    """Write one GeoJSON file per year to ``leaflet/gbif/<year>.geojson``.

    Only the season flags, dataset name, common name, red-list status, species,
    year and coordinates are kept. The coordinate columns are dropped after
    ``ch.toGEOdf`` has run; the ``year`` column stays in every file.

    NOTE(review): a recorded run elsewhere in this notebook failed with
    "'module' object has no attribute 'toGEOdf'" — confirm the helper exists
    in python_helpers.column_helpers.

    Returns:
        The geo data frame covering all years.
    """
    df = df[['species', 'Winter', 'Spring', 'Summer', 'Fall', 'datasetName', 'common',
             'redList', 'decimalLatitude', 'decimalLongitude', 'year']]
    geo_df = ch.toGEOdf(df)
    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    geo_df = geo_df.drop(['decimalLatitude', 'decimalLongitude'], axis=1)
    # Group by the bare column name so the key is a scalar; a one-element list
    # yields 1-tuples on pandas >= 2.0 and int(k) would then fail.
    for k, gp in geo_df.groupby('year'):
        gp.to_file("leaflet/gbif/" + str(int(k)) + ".geojson", driver="GeoJSON")
    return geo_df

def pointCSVtoJSONPerYearPerSpecies(df):
    """Write one GeoJSON file per (year, species) pair.

    Files go to
    ``map_django/biodivmap/static/biodivmap/gbif_year_species/<year>/<species>.geojson``.

    The previous version created ``leaflet/gbif_year_species/<year>`` but then
    wrote to ``map_django/.../gbif_year_species<year>/`` (missing slash), a
    directory it never created. The directory that is created is now the one
    that is written to.
    NOTE(review): the map_django location was kept because it is where the
    files were being written; switch ``out_root`` if the leaflet tree was the
    intended target.

    Returns:
        The geo data frame covering all years and species.
    """
    df = df[['species', 'Winter', 'Spring', 'Summer', 'Fall', 'datasetName', 'common',
             'redList', 'decimalLatitude', 'decimalLongitude', 'year']]
    geo_df = ch.toGEOdf(df)
    geo_df = geo_df.drop(['decimalLatitude', 'decimalLongitude'], axis=1)
    out_root = "map_django/biodivmap/static/biodivmap/gbif_year_species"
    # Split into species per year as geojsons. Grouping by a bare column name
    # keeps the keys scalar (a one-element list yields tuples on pandas >= 2.0).
    for k_year, gp_year in geo_df.groupby('year'):
        year_label = str(int(k_year))
        gp_year = gp_year.drop(['year'], axis=1)
        year_dir = os.path.join(out_root, year_label)
        if not os.path.isdir(year_dir):
            os.makedirs(year_dir)
        for k_species, gp_one_species in gp_year.groupby('species'):
            gp_one_species.to_file(os.path.join(year_dir, k_species + ".geojson"),
                                   driver="GeoJSON")
        print(year_label + " done!")
    return geo_df



In [61]:
# Root directory for generated files; the writer functions in this cell create
# a 'gbif/' tree underneath it and store paths relative to it in the database.
BASE_PATH = 'map_django/biodivmap/static/biodivmap/'
# Levels walked by the recursive writer, broadest first. The recursion starts
# at index 0 ('gbif') and always groups by the *next* entry, so 'gbif' itself
# is never used as a column name.
taxonLevels = ['gbif','kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
# NOTE(review): not referenced by any function visible in this notebook.
file_count = 0

def pointCSVtoJSONPerYearPerTaxonLimited(df, lim=1000):
    """Write a per-year directory tree of GeoJSON files split by taxon.

    For every year a directory ``BASE_PATH/gbif/<year>/`` is created and
    ``perTaxonWriterYear`` splits that year's rows down the taxon levels until
    a group has at most ``lim`` rows. Rows without a species are dropped first.
    The writer also updates ``species_year`` in ``map_django/db.sqlite3``; the
    updates are committed only if every year succeeds.

    ``os.mkdir`` is used on purpose-neutral terms: it raises if
    ``BASE_PATH/gbif/`` already exists, as in the original code.

    Args:
        df: Data frame with species, year, coordinate and taxon-level columns.
        lim: Maximum number of rows written to a single file before the group
            is split by the next taxon level.

    Returns:
        ``(geo_df, master_count)``: the geo data frame for all years and the
        number of GeoJSON files written.
    """
    df = df.dropna(axis=0, subset=['species'])
    geo_df = ch.toGEOdf(df)
    # `axis` by keyword: the positional form was removed in pandas 2.0.
    geo_df = geo_df.drop(['decimalLatitude', 'decimalLongitude'], axis=1)
    os.mkdir(BASE_PATH + 'gbif/')
    master_count = 0
    dbCon = sqlite3.connect("map_django/db.sqlite3")
    try:
        cur = dbCon.cursor()
        # Split into species per year as geojsons. Grouping by the bare column
        # name keeps the key scalar; a one-element list yields 1-tuples on
        # pandas >= 2.0 and int(k_year) would fail.
        for k_year, gp_year in geo_df.groupby('year'):
            year_label = str(int(k_year))
            gp_year = gp_year.drop(['year'], axis=1)
            year_path = BASE_PATH + 'gbif/' + year_label + "/"
            os.mkdir(year_path)
            master_count += perTaxonWriterYear(0, gp_year, year_path, lim, cur, year_label)
            print(year_label + " done!")
        dbCon.commit()
    finally:
        # Always release the database, even when a write fails part-way.
        dbCon.close()
    return geo_df, master_count

def perTaxonWriterYear(taxonIndex, df, path, lim, cur, year):
    """Recursively write ``df`` as GeoJSON, splitting by taxon level as needed.

    If the current level is "species" or ``df`` has at most ``lim`` rows, the
    rows are written to ``<path>points.geojson`` and every species in them gets
    its ``year_<year>`` column in ``species_year`` set to ``path`` relative to
    ``BASE_PATH``. Otherwise ``df`` is grouped by the next taxon level, one
    sub-directory is created per group, and the function recurses into it.

    Args:
        taxonIndex: Index into ``taxonLevels`` of the level ``path`` stands for.
        df: Geo data frame holding the rows for this node.
        path: Directory of this node, ending in "/".
        lim: Row limit below which a node is written as a single file.
        cur: sqlite3 cursor used for the UPDATE statements (not committed here).
        year: Year as a string; used to build the column name.

    Returns:
        Number of GeoJSON files written underneath ``path``.
    """
    if (taxonLevels[taxonIndex] == "species" or df.shape[0] <= lim):
        # Column names cannot be bound as SQL parameters, hence the string
        # formatting; `year` comes from str(int(...)) in the caller.
        update_sql = "UPDATE species_year SET (%s) = ?  WHERE species = ?" % ("year_" + year)
        relative_path = path.replace(BASE_PATH, "")
        for spec in list(df.species.unique()):
            cur.execute(update_sql, (relative_path, spec,))
        df.to_file(path + "points" + ".geojson", driver="GeoJSON")
        return 1

    # Group by the bare column name: a one-element list yields 1-tuple keys on
    # pandas >= 2.0, which cannot be concatenated to the path string.
    # NOTE(review): groupby skips rows whose next-level taxon is NaN, so such
    # rows are not written anywhere — confirm that is acceptable.
    inner_count = 0
    for k_nextTaxon, gp_nextTaxon in df.groupby(taxonLevels[taxonIndex + 1]):
        next_path = path + k_nextTaxon + "/"
        os.mkdir(next_path)
        inner_count += perTaxonWriterYear(taxonIndex + 1, gp_nextTaxon, next_path, lim, cur, year)
    return inner_count
    

In [62]:
# Re-declares the same three constants as the per-year cell earlier in the
# notebook (identical values).
# Root directory for generated files; the recency writers create 'gbif/' below it.
BASE_PATH = 'map_django/biodivmap/static/biodivmap/'
# Levels walked by the recursive writer, broadest first; index 0 ('gbif') is
# only the starting point and is never used as a column name.
taxonLevels = ['gbif', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
# NOTE(review): not referenced by any function visible in this notebook.
file_count = 0

def pointCSVtoJSONPerRecencyPerTaxonLimited(df, lim=1000):
    """Write a per-recency directory tree of GeoJSON files split by taxon.

    For every value of the ``recency`` column a directory
    ``BASE_PATH/gbif/<recency>/`` is created and ``perTaxonWriterRecency``
    splits that group's rows down the taxon levels until a group has at most
    ``lim`` rows. Rows without a species are dropped first. The writer also
    updates ``biodivmap_speciesrecency`` in ``map_django/db.sqlite3``; the
    updates are committed only if every group succeeds.

    As in the original code, ``os.mkdir`` raises if ``BASE_PATH/gbif/``
    already exists.

    NOTE(review): the recorded output of this notebook shows a call to this
    function failing with "'module' object has no attribute 'toGEOdf'" —
    confirm the helper exists in python_helpers.column_helpers.

    Args:
        df: Data frame with species, recency, coordinate and taxon columns.
        lim: Maximum number of rows written to a single file before the group
            is split by the next taxon level.

    Returns:
        ``(geo_df, master_count)``: the geo data frame for all groups and the
        number of GeoJSON files written.
    """
    df = df.dropna(axis=0, subset=['species'])
    geo_df = ch.toGEOdf(df)
    # `axis` by keyword: the positional form was removed in pandas 2.0.
    geo_df = geo_df.drop(['decimalLatitude', 'decimalLongitude'], axis=1)
    os.mkdir(BASE_PATH + 'gbif/')
    master_count = 0
    dbCon = sqlite3.connect("map_django/db.sqlite3")
    try:
        cur = dbCon.cursor()
        # Split into species per recency as geojsons. Grouping by the bare
        # column name keeps the key a plain string; a one-element list yields
        # 1-tuples on pandas >= 2.0, which cannot be concatenated to the path.
        for k_rec, gp_rec in geo_df.groupby('recency'):
            rec_path = BASE_PATH + 'gbif/' + k_rec + "/"
            os.mkdir(rec_path)
            master_count += perTaxonWriterRecency(0, gp_rec, rec_path, lim, cur, k_rec)
            print(k_rec + " done!")
        dbCon.commit()
    finally:
        # Always release the database, even when a write fails part-way.
        dbCon.close()
    return geo_df, master_count


def perTaxonWriterRecency(taxonIndex, df, path, lim, cur, rec):
    """Recursively write ``df`` as GeoJSON, splitting by taxon level as needed.

    If the current level is "species" or ``df`` has at most ``lim`` rows, the
    rows are written to ``<path>points.geojson`` and every species in them gets
    the column named ``rec`` in ``biodivmap_speciesrecency`` set to ``path``
    relative to ``BASE_PATH``. Otherwise ``df`` is grouped by the next taxon
    level, one sub-directory is created per group, and the function recurses.

    Args:
        taxonIndex: Index into ``taxonLevels`` of the level ``path`` stands for.
        df: Geo data frame holding the rows for this node.
        path: Directory of this node, ending in "/".
        lim: Row limit below which a node is written as a single file.
        cur: sqlite3 cursor used for the UPDATE statements (not committed here).
        rec: Recency label; used verbatim as the column name to update.

    Returns:
        Number of GeoJSON files written underneath ``path``.
    """
    if (taxonLevels[taxonIndex] == "species" or df.shape[0] <= lim):
        # Column names cannot be bound as SQL parameters, hence the string
        # formatting; `rec` is a group key of the `recency` column.
        update_sql = "UPDATE biodivmap_speciesrecency SET (%s) = ?  WHERE species = ?" % (rec)
        relative_path = path.replace(BASE_PATH, "")
        for spec in list(df.species.unique()):
            cur.execute(update_sql, (relative_path, spec,))
        df.to_file(path + "points" + ".geojson", driver="GeoJSON")
        return 1

    # Group by the bare column name: a one-element list yields 1-tuple keys on
    # pandas >= 2.0, which cannot be concatenated to the path string.
    # NOTE(review): groupby skips rows whose next-level taxon is NaN, so such
    # rows are not written anywhere — confirm that is acceptable.
    inner_count = 0
    for k_nextTaxon, gp_nextTaxon in df.groupby(taxonLevels[taxonIndex + 1]):
        next_path = path + k_nextTaxon + "/"
        os.mkdir(next_path)
        inner_count += perTaxonWriterRecency(taxonIndex + 1, gp_nextTaxon, next_path, lim, cur, rec)
    return inner_count

In [63]:
# Load the "GBif June27.csv" export; the cells below read its `recency` column.
df =pd.read_csv("GBif June27.csv")

In [64]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (typically a saved row index). `axis` is passed by keyword because the
# positional form was removed in pandas 2.0.
df = df.drop(['Unnamed: 0'], axis=1)

In [65]:
# Show the distinct labels in the `recency` column.
df.recency.unique()

array(['recent', 'old', 'both'], dtype=object)

In [70]:
# Write the per-recency GeoJSON tree and record each species' path in the DB.
# NOTE(review): the output recorded for this cell is an AttributeError saying
# the column_helpers module has no `toGEOdf` — confirm the helper exists.
pointCSVtoJSONPerRecencyPerTaxonLimited(df)

AttributeError: 'module' object has no attribute 'toGEOdf'

In [68]:
def makeYearsColsDB(years, db_path):
    """Create the ``species_year`` table with one ``year_<YYYY>`` column per year.

    The table has a ``species`` text primary key followed by an
    ``INTEGER DEFAULT 0`` column for every entry of ``years``.

    The column list is joined with commas, so the statement is valid for any
    list of years. The previous version decided where to omit the trailing
    comma by testing ``year == 2019``, which produced invalid SQL whenever
    2019 was not the last element.

    Args:
        years: Iterable of years (anything ``int()`` accepts).
        db_path: Path of the SQLite database file.

    Raises:
        sqlite3.OperationalError: if the table already exists.
    """
    columns = ["species TEXT NOT NULL PRIMARY KEY"]
    columns.extend("year_%d INTEGER DEFAULT 0" % int(year) for year in years)
    col_query = "CREATE TABLE species_year (\n    " + ",\n    ".join(columns) + "\n);"

    dbCon = sqlite3.connect(db_path)
    try:
        cur = dbCon.cursor()
        cur.execute(col_query)
        dbCon.commit()
    finally:
        # Close the connection even when the CREATE TABLE fails.
        dbCon.close()


def fillSpeciesIntoDB(df, db_path):
    """Insert one row per distinct species into ``biodivmap_speciesrecency``.

    Missing species values are skipped: they would be stored as NULL and could
    never be matched by the ``WHERE species = ?`` updates used elsewhere in
    this notebook.

    Args:
        df: Data frame with a ``species`` column.
        db_path: Path of the SQLite database file; the table must already exist.
    """
    species_rows = [(spec,) for spec in df.species.dropna().unique()]
    dbCon = sqlite3.connect(db_path)
    try:
        cur = dbCon.cursor()
        cur.executemany("INSERT INTO biodivmap_speciesrecency (species) VALUES (?)",
                        species_rows)
        dbCon.commit()
    finally:
        # Close the connection even when an insert fails.
        dbCon.close()

In [69]:
# Seed the Django SQLite database with one row per species in df.
fillSpeciesIntoDB(df, "map_django/db.sqlite3")

In [None]:
# Add up, over all years, the number of distinct species seen in that year.
# The accumulator is no longer called `sum`, which shadowed the builtin for
# the rest of the notebook session.
# NOTE(review): no visible cell assigns `geo_df`; it is presumably the first
# value returned by one of the pointCSVtoJSON... functions.
species_year_total = 0
for _, gp in geo_df.groupby("year"):
    species_year_total += gp.species.unique().shape[0]
print(species_year_total)

In [14]:
# Load "GBif_recency.csv"; the cells below filter it on its `recency` column.
df = pd.read_csv("GBif_recency.csv")

In [17]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (typically a saved row index). `axis` is passed by keyword because the
# positional form was removed in pandas 2.0.
df = df.drop(["Unnamed: 0"], axis=1)

In [42]:
# Keep only the rows labelled 'recent'.
df_rec = df[df['recency'] == 'recent']

<h3> SHAPE FILES </h3>

In [None]:
# SHP data to geoJSON
def fix_crs(map_ob):
    """Return ``map_ob`` reprojected to EPSG:4326 (longitude/latitude).

    The target is given through the ``epsg`` keyword rather than the
    ``{'init': 'epsg:4326'}`` dictionary, which current pyproj releases flag
    as deprecated.
    """
    return map_ob.to_crs(epsg=4326)
# str_map = gpd.read_file("ecological_reserves/BC_Eco_Reserves.shp")
str_map = gpd.read_file("MVSEI2014/MVSEI2014.shp")

str_map = fix_crs(str_map)

In [None]:
# # GeoJSON does not support multipolygon. Doesn't work
# str_map.to_file("leaflet/UBC_poly.geojson", driver="GeoJSON")

# # fiona doesn't work
# import fiona
# import json

# with fiona.open('ecological_reserves/BC_Eco_Reserves.shp') as source:
#     records = list(source)
# geo_json = {"type": "FeatureCollection","features": records}
# with open('leaflet/UBC_poly.geojson', 'w') as fp:
#     json.dump(geo_json, fp)

In [None]:
# Convert multipolygon to single polygons

In [None]:
# Take the geometry column on its own; the following cells split it into parts.
geom_series = str_map.geometry

In [None]:
def geom_apply(x):
    """Return the parts of ``x`` as a list; a single-part value becomes ``[x]``.

    Multi-part geometries expose their members through ``.geoms``. That
    attribute is checked first because Shapely 2.0 removed direct iteration
    over multi-part geometries: ``list(x)`` then raises ``TypeError`` and the
    geometry would silently be returned unsplit. Other iterables are still
    expanded with ``list`` as before.
    """
    parts = getattr(x, "geoms", None)
    if parts is not None:
        return list(parts)
    try:
        return list(x)
    except TypeError:
        return [x]

In [None]:
# Turn every geometry into a list of its parts.
geom_series = geom_series.apply(geom_apply)

In [None]:
# Spread each list over columns and stack them, giving one part per entry with
# a two-level index: (original row label, position of the part in its list).
geom_series = geom_series.apply(pd.Series).stack()

In [None]:
# # Quick but loses properties
# list_poly = list(geom_series)
# geoms = GeometryCollection(list_poly)
# geo_file = geojson.dumps(geoms)
# with open("leaflet/UBC_poly.geojson", "w") as text_file:
#     text_file.write(geo_file)

In [None]:
# Build one output row per polygon part: the attributes of the source row are
# copied and its geometry is replaced by the single part.
# Careful! Deep copy required here to avoid chaining.
#
# Rows are collected in a list and turned into a frame once. The previous
# version called DataFrame.append inside the loop and Series.iteritems, both
# of which were removed in pandas 2.0; appending per row is also quadratic.
part_rows = []
for ind, poly in geom_series.items():
    # ind is (source row label, part position); only the label is needed.
    curr_row = str_map.loc[ind[0]].copy(deep=True)
    curr_row['geometry'] = poly
    part_rows.append(curr_row)
df = pd.DataFrame(part_rows, columns=str_map.columns)

# str_map was already reprojected by fix_crs, so its CRS is reused as-is
# instead of spelling out the deprecated {'init': ...} dictionary again.
df_gpd = gpd.GeoDataFrame(df, geometry='geometry', crs=str_map.crs)
df_gpd.to_file("leaflet/SEI.geojson", driver="GeoJSON")

In [None]:
# Show the column names of the single-polygon GeoDataFrame.
df_gpd.columns

In [None]:
# SHP data to geoJSON
def fix_crs(map_ob):
    """Return ``map_ob`` reprojected to EPSG:4326 (longitude/latitude).

    The target is given through the ``epsg`` keyword rather than the
    ``{'init': 'epsg:4326'}`` dictionary, which current pyproj releases flag
    as deprecated.
    """
    return map_ob.to_crs(epsg=4326)
# str_map = gpd.read_file("ecological_reserves/BC_Eco_Reserves.shp")
str_map = gpd.read_file("MVSEI2014/MVSEI2014.shp")

str_map = fix_crs(str_map)

In [None]:
# Show the attribute columns available in the shapefile.
str_map.columns

In [None]:
# Show the values of the `Location` column.
str_map.Location

In [None]:
# Keep only the two attribute columns that are exported, plus the geometry.
str_map = str_map[['SEI_PolyNb', 'Comp1Lgnd', 'geometry']]

In [None]:
# Display the reduced frame.
str_map

In [None]:
# Export the reduced frame. This is the same path the single-polygon export
# earlier in the notebook writes to, so that file is replaced.
str_map.to_file("leaflet/SEI.geojson", driver="GeoJSON")

In [None]:
def flatten_gdf_geometry(gdf, geom_type):
    """Collect every geometry of type ``geom_type`` from ``gdf``, splitting multi-parts.

    Multi-part geometries (GeometryCollection, MultiPoint, MultiLineString,
    MultiPolygon) are expanded one level into their members; members and
    single-part geometries are kept only if their type equals ``geom_type``.
    Missing geometries are skipped. Attribute columns are not carried over.

    ``geom_type`` and ``.geoms`` are used instead of ``.type`` and direct
    iteration, which Shapely 2.0 deprecated and removed respectively.

    Args:
        gdf: GeoDataFrame to flatten.
        geom_type: Geometry type name to keep, e.g. ``'Polygon'``.

    Returns:
        A new GeoDataFrame holding only a geometry column, in the CRS of
        ``gdf`` (the previous version returned it without a CRS).
    """
    multi_part_types = ('GeometryCollection', 'MultiPoint', 'MultiLineString', 'MultiPolygon')
    flattened_geometry = []

    for geom in gdf.geometry:
        if geom is None:
            continue
        if geom.geom_type in multi_part_types:
            candidates = geom.geoms
        else:
            candidates = [geom]
        flattened_geometry.extend(
            part for part in candidates if part.geom_type == geom_type)

    return gpd.GeoDataFrame(geometry=flattened_geometry, crs=gdf.crs)

In [None]:
# Keep only the Polygon parts of the shapefile geometries.
new_df = flatten_gdf_geometry(str_map, 'Polygon')

In [None]:
# df = pd.read_csv("GBif_R.csv", low_memory=False)
#     #drop nan species
# df = df.dropna(axis=0, subset=['species'])
# #keep num_rows if num_rows > 0
# #convert dataset keys to dataset names    
# df = dataSetNamesFromKey(df)
# print("Converted dataset keys to names!")
# #add redList designation
# df['redList'] = df.apply(lambda x: redList(x), 1).values
# print("Added redlist designation!")
# #add common names from ITIS
# df = addCommonNames(df)
# print("Added common names!")
# df = df[['Winter', 'Spring', 'Summer', 'Fall', 'datasetName', 'common', 
#                  'redList', 'decimalLatitude', 'decimalLongitude', 'year',
#         'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']]

# df.to_csv("Gbif_Gabe.csv", encoding="latin1")