<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/Vegetation_Biomass_Grass_Forb_Pooled_Wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# README
* [Readme vegetation biomass](https://docs.google.com/document/d/1AF8qh7YRqEJ_5DjX0GrA3Z52RfGp2f22Jc2uLonGa2I/edit?usp=sharing)
* [Readme vegetation biomass CARTO STAMEN](https://docs.google.com/document/d/1DIsO1yvobDKynRgoSvQ22HI4f8XYy5TPipa2vokGydQ/edit)

# Proposed Schema

* grid_point:INT
* date:DATE (ISO)
* year:DATE (“yyyy”)
* season:STRING (“spring”, “fall”)
  * Derived from DATE
* Headers for biomass of known vegetation types:
  * grass_g:NUMERIC
* In data from 2016-2018
  * forb_g:NUMERIC
* In data from 2016-2018
  * pooled_g:NUMERIC
  * Pooled is the amount for years when the vegetation types were not weighed separately. It is NOT the same as a sum of the biomass in vegetation categories.
Vegetation were pooled in 2010-2015, and again in 2019
* add other vegetation types in the future as needed, for example `exotic_forb_g`, `native_grass_g`
Note that we can produce sums of all vegetation biomass, or biomass in categories (forb, grass, native, exotic) in queries, views, or local environments. 


# Import Tools

In [0]:
# load pandas package for dataframes
import pandas as pd
import numpy as np

# Source Data

In [0]:
## 2020 Spring
# '2020-04-21_spring_biomass_data'
#src_2020_spring = 'https://drive.google.com/open?id=1zof71gBRqVLudM8e1SnLMCQZ7RK3FWSoFBEs-EWFRiM'
# '2020-04-21_spring_biomass_data - Sheet1.csv'
src_2020_spring = 'https://drive.google.com/uc?id=1Q1SvzROHySWxIjG7qSFYK9luwJB6RIJ0'

## 2019 
# '2019 Biomass Data.xlsx'
src_2019 = 'https://drive.google.com/uc?id=1dd5F8R-mD6l7H2morac1r9ongf80xpff'

## 2018
# MPG vegetation biomass 2010-2018 - 2018 Spring.csv
src_2018_spring = 'https://drive.google.com/uc?id=1Ple5R852yoJVA9WfDNS8TWVdQ1kxqFrr'

# MPG vegetation biomass 2010-2018 - 2018 Fall.csv
src_2018_fall = 'https://drive.google.com/uc?id=1kLyoAx0WR_hmHa4tgJbaj4lpdOILhFcj'

## 2017
# MPG vegetation biomass 2010-2018 - 2017 Spring.csv
src_2017_spring = 'https://drive.google.com/uc?id=1YVVc60cXFc9BBSF391TXyTFvXM4KJkcO'

# MPG vegetation biomass 2010-2018 - 2017 Fall.csv
src_2017_fall = 'https://drive.google.com/uc?id=1cZfqSDf_geURgzF6v1vO7n1QeWvJM1z4'

## 2016
# MPG vegetation biomass 2010-2018 - 2016 Spring.csv
src_2016_spring = 'https://drive.google.com/uc?id=1SovIlvFKY2FsOK_ibmm6UMQAsBk7hKEQ'

# MPG vegetation biomass 2010-2018 - 2016 Fall.csv
src_2016_fall = 'https://drive.google.com/uc?id=17W9h7rdUAhsk_vckpoZ8kGIwxSPPxpQl'

## 2010 - 2015
# MPG vegetation biomass 2010-2018 - 2010-2015.csv
# this doesn't look like it contains grass/forb
src_2010_15 = 'https://drive.google.com/uc?id=1Jxj71lx0Wm_uaJmnQTvBE5KNLMlyH4CO'

# Load and Wrangle Data by Year

## 2020

### Spring

In [0]:
df_2020_spring = pd.read_csv(src_2020_spring)

In [0]:
df_2020_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Plot           162 non-null    int64  
 1   year           162 non-null    int64  
 2   season         162 non-null    object 
 3   biomass_100_g  162 non-null    float64
 4   biomass_245_g  162 non-null    float64
 5   biomass_300_g  162 non-null    float64
 6   robel_N        162 non-null    int64  
 7   robel_E        162 non-null    int64  
 8   robel_S        162 non-null    int64  
 9   robel_W        162 non-null    int64  
 10  pellets_Deer   162 non-null    int64  
 11  pellets_Elk    162 non-null    int64  
 12  pellets_Horse  162 non-null    int64  
dtypes: float64(3), int64(9), object(1)
memory usage: 16.6+ KB


In [0]:
# select columns pertinent to biomass
df_2020_spring = df_2020_spring.loc[:, ['Plot', 'biomass_100_g', 'biomass_245_g', 'biomass_300_g']]

In [0]:
# average biomass columns
df_2020_spring['mean'] = df_2020_spring.loc[:, ['biomass_100_g', 'biomass_245_g', 'biomass_300_g']].mean(axis=1)

In [0]:
df_2020_spring.head(2)

Unnamed: 0,Plot,biomass_100_g,biomass_245_g,biomass_300_g,mean
0,9,0.76,4.32,10.11,5.063333
1,10,5.22,1.83,1.56,2.87


In [0]:
# select only grid_point and average columns
df_2020_spring = df_2020_spring.loc[:, ['Plot', 'mean']]

In [0]:
# rename columns
df_2020_spring.columns = ['grid_point', 'pooled_g']

In [0]:
# insert NaT values for missing dates
df_2020_spring['date'] = np.nan

In [0]:
# set year variable
df_2020_spring['year'] = 2020

In [0]:
# set season variable
df_2020_spring['season'] = 'spring'

In [0]:
# set grass biomass
df_2020_spring['grass_g'] = 0

# set forb biomass
df_2020_spring['forb_g'] = 0

In [0]:
# reorder columns
df_2020_spring = df_2020_spring.loc[:, ['grid_point', 'date', 'year', 'season', 'grass_g', 'forb_g', 'pooled_g']]

In [0]:
df_2020_spring.head(2)

Unnamed: 0,grid_point,date,year,season,grass_g,forb_g,pooled_g
0,9,,2020,spring,0,0,5.063333
1,10,,2020,spring,0,0,2.87


## 2019

In [0]:
df_2019 = pd.read_excel(src_2019, sheet_name='All Data - redone')

In [0]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 17 columns):
 #   Column                                               Non-Null Count  Dtype         
---  ------                                               --------------  -----         
 0   Plot                                                 347 non-null    int64         
 1   Date                                                 325 non-null    datetime64[ns]
 2   Robel Reading                                        319 non-null    float64       
 3   Unnamed: 3                                           319 non-null    float64       
 4   Unnamed: 4                                           319 non-null    float64       
 5   Unnamed: 5                                           318 non-null    float64       
 6      Biomass Weight (g-spring)(pennyweight for fall)   297 non-null    float64       
 7   Unnamed: 7                                           296 non-null    float64       
 8   

In [0]:
df_2019.head(2)

Unnamed: 0,Plot,Date,Robel Reading,Unnamed: 3,Unnamed: 4,Unnamed: 5,Biomass Weight (g-spring)(pennyweight for fall),Unnamed: 7,Unnamed: 8,Deer,Elk,Horse,Biomass,Unnamed: 13,Robel,Unnamed: 15,Unnamed: 16
0,9,2019-09-24,4.0,8.0,6.0,14.0,6.82,5.78,4.98,2.0,3.0,2.0,23.44,36.451544,8.0,,23.44
1,10,2019-09-24,6.0,4.0,2.0,4.0,15.95,1.36,17.3,0.0,1.0,2.0,46.146667,71.762681,4.0,,93.76


In [0]:
df_2019.columns

Index(['Plot', 'Date', 'Robel Reading', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', '   Biomass Weight (g-spring)(pennyweight for fall) ',
       'Unnamed: 7', 'Unnamed: 8', 'Deer', 'Elk', 'Horse', 'Biomass',
       'Unnamed: 13', 'Robel', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [0]:
# Select Biomass columns
df_2019 = df_2019.loc[:, ["Plot", "Date", "   Biomass Weight (g-spring)(pennyweight for fall) ", "Unnamed: 7", "Unnamed: 8"]]

In [0]:
# Display Sample Dates
df_2019.Date.unique()

array(['2019-09-24T00:00:00.000000000',                           'NaT',
       '2019-04-25T00:00:00.000000000', '2019-04-29T00:00:00.000000000',
       '2019-04-18T00:00:00.000000000', '2019-04-30T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [0]:
# Look for row with valid Biomass figures and NaT date
df_2019[df_2019.Date.isnull()]

Unnamed: 0,Plot,Date,Biomass Weight (g-spring)(pennyweight for fall),Unnamed: 7,Unnamed: 8
25,72,NaT,,,
26,72,NaT,,,
48,103,NaT,,,
49,103,NaT,,,
102,197,NaT,29.22,26.35,29.56
148,347,NaT,,,
150,348,NaT,,,
154,353,NaT,,,
156,354,NaT,,,
187,31,NaT,,,


In [0]:
# set date found from data source spreadsheet
df_2019.loc[102, "Date"] = pd.Timestamp('2019-04-29')

In [0]:
df_2019.columns

Index(['Plot', 'Date', '   Biomass Weight (g-spring)(pennyweight for fall) ',
       'Unnamed: 7', 'Unnamed: 8'],
      dtype='object')

In [0]:
# rename biomass measurement columns for mean calculation
df_2019.columns = ['grid_point', 'date', 'measurement_1', 'measurement_2', 'measurement_3']

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,measurement_1,measurement_2,measurement_3
0,9,2019-09-24,6.82,5.78,4.98
1,10,2019-09-24,15.95,1.36,17.3


In [0]:
# average measurement values
df_2019['mean'] = df_2019.loc[:, ['measurement_1', 'measurement_2', 'measurement_3']].mean(axis=1)

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,measurement_1,measurement_2,measurement_3,mean
0,9,2019-09-24,6.82,5.78,4.98,5.86
1,10,2019-09-24,15.95,1.36,17.3,11.536667


In [0]:
# drop individual measurement values
df_2019 = df_2019.loc[:, ["grid_point", "date", "mean"]]

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,mean
0,9,2019-09-24,5.86
1,10,2019-09-24,11.536667


In [0]:
# rename mean to pooled
df_2019.columns = ['grid_point', 'date', 'pooled']

In [0]:
# set year
df_2019['year'] = 2019

### Spring

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,pooled,year
0,9,2019-09-24,5.86,2019
1,10,2019-09-24,11.536667,2019


In [0]:
# set season column to spring
df_2019.loc[df_2019.date < '2019-06', 'season'] = 'spring'

In [0]:
df_2019.loc[df_2019.season == 'spring'].head(2)

Unnamed: 0,grid_point,date,pooled,year,season
102,197,2019-04-29,28.376667,2019,spring
174,9,2019-04-25,,2019,spring


### Fall

In [0]:
# set season column to fall
df_2019.loc[df_2019.date > '2019-06', 'season'] = 'fall'

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,pooled,year,season
0,9,2019-09-24,5.86,2019,fall
1,10,2019-09-24,11.536667,2019,fall


In [0]:
# convert fall values from pennyweight to grams (pennyweight*1.55517) = grams
df_2019.loc[df_2019.season == 'fall', 'pooled'] = df_2019.loc[df_2019.season == 'fall', 'pooled'] * 1.55517

In [0]:
# update pooled column name to pooled_g post pennyweight to grams conversion
df_2019.columns = ['grid_point', 'date', 'pooled_g', 'year', 'season']

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,pooled_g,year,season
0,9,2019-09-24,9.113296,2019,fall
1,10,2019-09-24,17.941478,2019,fall


### Full Year

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,pooled_g,year,season
0,9,2019-09-24,9.113296,2019,fall
1,10,2019-09-24,17.941478,2019,fall


## 2018

### Spring

In [0]:
# load data
df_2018_spring = pd.read_csv(src_2018_spring)

In [0]:
# retrieve top level column names
df_2018_spring.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants',
       'Unnamed: 10', 'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14',
       '60G', '60F', '180G', '180F', '330G', '330F', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')

In [0]:
# reload data skipping first row
df_2018_spring = pd.read_csv(src_2018_spring, skiprows=1)

In [0]:
df_2018_spring.head(2)

Unnamed: 0,Plot,Date,N,E,S,W,60,180,330,Plot.1,Species,% cover,Deer,Elk,Horse,grams,grams.1,grams.2,grams.3,grams.4,grams.5,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,3,4/10/2018,2.0,4.0,2.0,2.0,G,G,G,,,,2.0,0.0,1.0,9.33,0.0,20.55,0.0,10.44,0.0,,,,,
1,4,4/10/2018,2.0,2.0,4.0,2.0,G,G,G,,,,2.0,6.0,0.0,2.31,0.0,3.44,0.0,12.21,0.0,,,,,


In [0]:
# select columns to keep
df_2018_spring = df_2018_spring[['Plot', 'Date', 'grams', 'grams.1', 'grams.2', 'grams.3', 'grams.4', 'grams.5']]

In [0]:
# rename columns
df_2018_spring.columns = ['grid_point', 'date', '60grass', '60forb', '180grass', '180forb', '330grass', '330forb']

In [0]:
df_2018_spring.tail(2)

Unnamed: 0,grid_point,date,60grass,60forb,180grass,180forb,330grass,330forb
112,484,4/9/2018,0.6,0.61,10.54,2.92,8.45,0.0
113,571,4/9/2018,5.37,0.0,1.86,1.36,2.25,2.25


In [0]:
# average grass
df_2018_spring['grass'] = df_2018_spring[['60grass', '180grass', '330grass']].mean(axis=1)

In [0]:
# average forb
df_2018_spring['forb'] = df_2018_spring[['60forb', '180forb', '330forb']].mean(axis=1)

In [0]:
df_2018_spring.tail(2)

Unnamed: 0,grid_point,date,60grass,60forb,180grass,180forb,330grass,330forb,grass,forb
112,484,4/9/2018,0.6,0.61,10.54,2.92,8.45,0.0,6.53,1.176667
113,571,4/9/2018,5.37,0.0,1.86,1.36,2.25,2.25,3.16,1.203333


In [0]:
# pare DataFrame down to grass, forb averages
df_2018_spring = df_2018_spring[['grid_point', 'date', 'grass', 'forb']]

In [0]:
# coerce 'date' to datetime object
df_2018_spring['date'] = pd.to_datetime(df_2018_spring['date'])

In [0]:
# set 'year' column to 2018
df_2018_spring['year'] = 2018

In [0]:
# set 'season' to spring
df_2018_spring['season'] = 'spring'

In [0]:
df_2018_spring.head(2)

Unnamed: 0,grid_point,date,grass,forb,year,season
0,3,2018-04-10,13.44,0.0,2018,spring
1,4,2018-04-10,5.986667,0.0,2018,spring


In [0]:
# reorder columns
df_2018_spring = df_2018_spring[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2018_spring.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,3,2018-04-10,2018,spring,13.44,0.0
1,4,2018-04-10,2018,spring,5.986667,0.0


In [0]:
df_2018_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  114 non-null    int64         
 1   date        113 non-null    datetime64[ns]
 2   year        114 non-null    int64         
 3   season      114 non-null    object        
 4   grass       114 non-null    float64       
 5   forb        114 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 5.5+ KB


### Fall

In [0]:
# load data
df_2018_fall = pd.read_csv(src_2018_fall)

In [0]:
# retrieve top level column names
df_2018_fall.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants',
       'Unnamed: 10', 'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14',
       '110G', '110F', '250G', '250F', '350G', '350F', 'Unnamed: 21'],
      dtype='object')

In [0]:
# reload data skipping first row
df_2018_fall = pd.read_csv(src_2018_fall, skiprows=1)

In [0]:
df_2018_fall.head(2)

Unnamed: 0,Plot,Date,N,E,S,W,110,250,350,Plot.1,Species,% cover,Deer,Elk,Horse,grams,grams.1,grams.2,grams.3,grams.4,grams.5,Unnamed: 21
0,70,10/15/2018,14,7,8,3,F/G,G,F/G,0,0,0,6,8,0,18.23,7.05,35.37,0.0,6.7,6.48,
1,71,10/15/2018,1,1,1,2,F/G,F/G,F/G,0,0,0,1,6,0,3.18,5.35,3.56,10.93,1.04,7.52,


In [0]:
# select columns to keep
df_2018_fall = df_2018_fall.loc[:, ['Plot', 'Date', 'grams', 'grams.1', 'grams.2', 'grams.3', 'grams.4', 'grams.5']]

In [0]:
# rename columns
df_2018_fall.columns = ['grid_point', 'date', '110grass', '110forb', '250grass', '250forb', '350grass', '350forb']

In [0]:
df_2018_fall.head(2)

Unnamed: 0,grid_point,date,110grass,110forb,250grass,250forb,350grass,350forb
0,70,10/15/2018,18.23,7.05,35.37,0.0,6.7,6.48
1,71,10/15/2018,3.18,5.35,3.56,10.93,1.04,7.52


In [0]:
# average grass
df_2018_fall['grass'] = df_2018_fall[['110grass', '250grass', '350grass']].mean(axis=1)

In [0]:
# average forb
df_2018_fall['forb'] = df_2018_fall[['110forb', '250forb', '350forb']].mean(axis=1)

In [0]:
# pare DataFrame down to grass, forb averages
df_2018_fall = df_2018_fall[['grid_point', 'date', 'grass', 'forb']]

In [0]:
df_2018_fall.head(2)

Unnamed: 0,grid_point,date,grass,forb
0,70,10/15/2018,20.1,4.51
1,71,10/15/2018,2.593333,7.933333


In [0]:
# coerce 'date' to datetime object
df_2018_fall['date'] = pd.to_datetime(df_2018_fall['date'])

In [0]:
# select year from 'date' to make 'year' column
df_2018_fall['year'] = df_2018_fall.date.dt.year

In [0]:
# set season in 'season' column
df_2018_fall['season'] = 'fall'

In [0]:
# reorder columns
df_2018_fall = df_2018_fall[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2018_fall.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,70,2018-10-15,2018,fall,20.1,4.51
1,71,2018-10-15,2018,fall,2.593333,7.933333


In [0]:
df_2018_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  47 non-null     int64         
 1   date        47 non-null     datetime64[ns]
 2   year        47 non-null     int64         
 3   season      47 non-null     object        
 4   grass       47 non-null     float64       
 5   forb        47 non-null     float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 2.3+ KB


### Full Year

In [0]:
# concatenate spring and fall dataframes
df_2018_full_year = pd.concat([df_2018_spring, df_2018_fall], ignore_index=True, sort=False)

In [0]:
df_2018_full_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  161 non-null    int64         
 1   date        160 non-null    datetime64[ns]
 2   year        161 non-null    int64         
 3   season      161 non-null    object        
 4   grass       161 non-null    float64       
 5   forb        161 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 7.7+ KB


## 2017

### Spring

In [0]:
# load data
df_2017_spring = pd.read_csv(src_2017_spring)

In [0]:
# retrieve top level column names
df_2017_spring.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants',
       'Unnamed: 10', 'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14',
       '90 G', '90 F', '200 G', '200 F', '280 G', '280 F'],
      dtype='object')

In [0]:
df_2017_spring.head(2)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Robel,Unnamed: 3,Unnamed: 4,Unnamed: 5,Biomass,Unnamed: 7,Unnamed: 8,Woody plants,Unnamed: 10,Unnamed: 11,Scat,Unnamed: 13,Unnamed: 14,90 G,90 F,200 G,200 F,280 G,280 F
0,Plot,Date,N,E,S,W,90,200,280,Plot,Species,% cover,Deer,Elk,Horse,,,,,,
1,2,5-Apr,2,1,1,1,F/G,F/G,F/G,,,,2,4,2,1.16,0.96,0.65,1.26,3.33,2.62


In [0]:
# reload data skipping first row
df_2017_spring = pd.read_csv(src_2017_spring, skiprows=1)

In [0]:
df_2017_spring.head(2)

Unnamed: 0,Plot,Date,N,E,S,W,90,200,280,Plot.1,Species,% cover,Deer,Elk,Horse,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,2,5-Apr,2.0,1.0,1.0,1.0,F/G,F/G,F/G,,,,2.0,4.0,2.0,1.16,0.96,0.65,1.26,3.33,2.62
1,3,5-Apr,2.0,3.0,2.0,16.0,G,F/G,F/G,3-?,SAGE,40.0,0.0,3.0,1.0,2.84,0.0,6.21,12.78,4.26,0.25


In [0]:
# select columns to keep
df_2017_spring = df_2017_spring[['Plot', 'Date', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20']]

In [0]:
# rename columns
df_2017_spring.columns = ['grid_point', 'date', '90grass', '90forb', '200grass', '200forb', '280grass', '280forb']

In [0]:
df_2017_spring.head(2)

Unnamed: 0,grid_point,date,90grass,90forb,200grass,200forb,280grass,280forb
0,2,5-Apr,1.16,0.96,0.65,1.26,3.33,2.62
1,3,5-Apr,2.84,0.0,6.21,12.78,4.26,0.25


In [0]:
# average grass
df_2017_spring['grass'] = df_2017_spring[['90grass', '200grass', '280grass']].mean(axis=1)

In [0]:
# average forb
df_2017_spring['forb'] = df_2017_spring[['90forb', '200forb', '280forb']].mean(axis=1)

In [0]:
# pare DataFrame down to grass, forb averages
df_2017_spring = df_2017_spring[['grid_point', 'date', 'grass', 'forb']]

In [0]:
df_2017_spring.head(2)

Unnamed: 0,grid_point,date,grass,forb
0,2,5-Apr,1.713333,1.613333
1,3,5-Apr,4.436667,4.343333


In [0]:
# set value for 'year' column
df_2017_spring['year'] = 2017

In [0]:
# set value for 'season' column
df_2017_spring['season'] = 'spring'

In [0]:
df_2017_spring.head(2)

Unnamed: 0,grid_point,date,grass,forb,year,season
0,2,5-Apr,1.713333,1.613333,2017,spring
1,3,5-Apr,4.436667,4.343333,2017,spring


In [0]:
# restructure date format
df_2017_spring['date'] = pd.to_datetime(df_2017_spring.date + '-2017')

In [0]:
df_2017_spring.head(2)

Unnamed: 0,grid_point,date,grass,forb,year,season
0,2,2017-04-05,1.713333,1.613333,2017,spring
1,3,2017-04-05,4.436667,4.343333,2017,spring


In [0]:
# reorder columns
df_2017_spring = df_2017_spring[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2017_spring.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,2,2017-04-05,2017,spring,1.713333,1.613333
1,3,2017-04-05,2017,spring,4.436667,4.343333


In [0]:
df_2017_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  135 non-null    int64         
 1   date        133 non-null    datetime64[ns]
 2   year        135 non-null    int64         
 3   season      135 non-null    object        
 4   grass       135 non-null    float64       
 5   forb        135 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 6.5+ KB


### Fall

In [0]:
# reload data skipping first row
df_2017_fall = pd.read_csv(src_2017_fall, skiprows=1)

In [0]:
# retrieve top level column names
df_2017_fall.columns

Index(['Plot', 'Date', 'N', 'E', 'S', 'W', '0', '170', '270', 'Plot.1',
       'Species', '% cover', 'Deer', 'Elk', 'Horse', 'grams', 'grams.1',
       'grams.2', 'grams.3', 'grams.4', 'grams.5'],
      dtype='object')

In [0]:
df_2017_fall.head(2)

Unnamed: 0,Plot,Date,N,E,S,W,0,170,270,Plot.1,Species,% cover,Deer,Elk,Horse,grams,grams.1,grams.2,grams.3,grams.4,grams.5
0,3,9-Oct,4.0,2.0,2.0,2.0,G,G,G,,,,1.0,2.0,0.0,2.9,0.0,,0.0,,0.0
1,4,9-Oct,6.0,3.0,2.0,4.0,FG,FG,G,,,,7.0,0.0,0.0,5.92,0.57,10.56,0.01,9.39,0.0


In [0]:
# select columns to keep
df_2017_fall = df_2017_fall[['Plot', 'Date', 'grams', 'grams.1', 'grams.2', 'grams.3', 'grams.4', 'grams.5']]

In [0]:
# rename columns
df_2017_fall.columns = ['grid_point', 'date', '0grass', '0forb', '170grass', '170forb', '270grass', '270forb']

In [0]:
df_2017_fall.head(2)

Unnamed: 0,grid_point,date,0grass,0forb,170grass,170forb,270grass,270forb
0,3,9-Oct,2.9,0.0,,0.0,,0.0
1,4,9-Oct,5.92,0.57,10.56,0.01,9.39,0.0


In [0]:
# average grass
df_2017_fall['grass'] = df_2017_fall[['0grass', '170grass', '270grass']].mean(axis=1)

In [0]:
# average forb
df_2017_fall['forb'] = df_2017_fall[['0forb', '170forb', '270forb']].mean(axis=1)

In [0]:
df_2017_fall.head(2)

Unnamed: 0,grid_point,date,0grass,0forb,170grass,170forb,270grass,270forb,grass,forb
0,3,9-Oct,2.9,0.0,,0.0,,0.0,2.9,0.0
1,4,9-Oct,5.92,0.57,10.56,0.01,9.39,0.0,8.623333,0.005


In [0]:
# pare DataFrame down to grass, forb averages
df_2017_fall = df_2017_fall[['grid_point', 'date', 'grass', 'forb']]

In [0]:
# set value for 'year'
df_2017_fall['year'] = 2017

In [0]:
# set value for 'season'
df_2017_fall['season'] = 'fall'

In [0]:
# restructure 'date' format
df_2017_fall['date'] = pd.to_datetime(df_2017_fall.date + '-2017')

In [0]:
df_2017_fall.head(2)

Unnamed: 0,grid_point,date,grass,forb,year,season
0,3,2017-10-09,2.9,0.0,2017,fall
1,4,2017-10-09,8.623333,0.005,2017,fall


In [0]:
# reorder columns
df_2017_fall = df_2017_fall[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2017_fall.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,3,2017-10-09,2017,fall,2.9,0.0
1,4,2017-10-09,2017,fall,8.623333,0.005


In [0]:
df_2017_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  114 non-null    int64         
 1   date        114 non-null    datetime64[ns]
 2   year        114 non-null    int64         
 3   season      114 non-null    object        
 4   grass       113 non-null    float64       
 5   forb        112 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 5.5+ KB


### Full Year

In [0]:
df_2017_full_year = pd.concat([df_2017_spring, df_2017_fall], ignore_index=True, sort=False)

In [0]:
df_2017_full_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  249 non-null    int64         
 1   date        247 non-null    datetime64[ns]
 2   year        249 non-null    int64         
 3   season      249 non-null    object        
 4   grass       248 non-null    float64       
 5   forb        247 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 11.8+ KB


## 2016

### Spring

In [0]:
# load data
df_2016_spring = pd.read_csv(src_2016_spring)

In [0]:
df_2016_spring.head(2)

Unnamed: 0,Date,Plot,Degrees,Type,Weight,Notes
0,3/28/2016,2,50,forb,0.0,
1,3/28/2016,2,50,grass,2.58,


In [0]:
df_2016_spring.Degrees.unique()

array([ 50, 140, 260])

In [0]:
# remove 'Notes' column
df_2016_spring = df_2016_spring[['Date', 'Plot', 'Degrees', 'Type', 'Weight']]

In [0]:
df_2016_spring.head(2)

Unnamed: 0,Date,Plot,Degrees,Type,Weight
0,3/28/2016,2,50,forb,0.0
1,3/28/2016,2,50,grass,2.58


In [0]:
# average sample Degrees for each plot and date
# restructure to wide DataFrame
df_2016_spring = df_2016_spring.groupby(['Date', 'Plot', 'Type'])['Weight'].mean().unstack('Type').reset_index()

In [0]:
df_2016_spring.head(2)

Type,Date,Plot,forb,grass
0,3/28/2016,2,0.22,2.56
1,3/28/2016,3,0.683333,3.323333


In [0]:
# set 'year' column
df_2016_spring['year'] = 2016

# set 'season' column
df_2016_spring['season'] = 'spring'

In [0]:
df_2016_spring.head(2)

Type,Date,Plot,forb,grass,year,season
0,3/28/2016,2,0.22,2.56,2016,spring
1,3/28/2016,3,0.683333,3.323333,2016,spring


In [0]:
# coerce 'date' to datetime object
df_2016_spring['Date'] = pd.to_datetime(df_2016_spring['Date'])

In [0]:
# rename columns
df_2016_spring.columns = ['date', 'grid_point', 'forb', 'grass', 'year', 'season']

In [0]:
# reorder columns
df_2016_spring = df_2016_spring[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2016_spring.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,2,2016-03-28,2016,spring,2.56,0.22
1,3,2016-03-28,2016,spring,3.323333,0.683333


In [0]:
df_2016_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  134 non-null    int64         
 1   date        134 non-null    datetime64[ns]
 2   year        134 non-null    int64         
 3   season      134 non-null    object        
 4   grass       131 non-null    float64       
 5   forb        133 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 6.4+ KB


### Fall

In [0]:
df_2016_fall = pd.read_csv(src_2016_fall)

In [0]:
df_2016_fall.head(2)

Unnamed: 0,Date,Plot,Degrees,Type,Weight
0,10/11/2016,2,10,forb,1.14
1,10/11/2016,2,10,grass,7.75


In [0]:
# average sample Degrees for each plot and date
# restructure to wide DataFrame
df_2016_fall = df_2016_fall.groupby(['Date', 'Plot', 'Type'])['Weight'].mean().unstack('Type').reset_index()

In [0]:
df_2016_fall.head(2)

Type,Date,Plot,forb,grass
0,10/10/2016,87,3.996667,7.046667
1,10/10/2016,98,0.0,9.086667


In [0]:
# coerce 'Date' to datetime object
df_2016_fall['Date'] = pd.to_datetime(df_2016_fall['Date'])

In [0]:
# set 'year' column
df_2016_fall['year'] = 2016

In [0]:
# set 'season' column
df_2016_fall['season'] = 'fall'

In [0]:
df_2016_fall.head(2)

Type,Date,Plot,forb,grass,year,season
0,2016-10-10,87,3.996667,7.046667,2016,fall
1,2016-10-10,98,0.0,9.086667,2016,fall


In [0]:
# rename columns
df_2016_fall.columns = ['date', 'grid_point', 'forb', 'grass', 'year', 'season']

In [0]:
df_2016_fall = df_2016_fall[['grid_point', 'date', 'year', 'season', 'grass', 'forb']]

In [0]:
df_2016_fall.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,87,2016-10-10,2016,fall,7.046667,3.996667
1,98,2016-10-10,2016,fall,9.086667,0.0


### Full Year 

In [0]:
df_2016_spring.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,2,2016-03-28,2016,spring,2.56,0.22
1,3,2016-03-28,2016,spring,3.323333,0.683333


In [0]:
df_2016_fall.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,87,2016-10-10,2016,fall,7.046667,3.996667
1,98,2016-10-10,2016,fall,9.086667,0.0


In [0]:
df_2016_full_year = pd.concat([df_2016_spring, df_2016_fall], ignore_index=True, sort=False)

In [0]:
df_2016_full_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  273 non-null    int64         
 1   date        273 non-null    datetime64[ns]
 2   year        273 non-null    int64         
 3   season      273 non-null    object        
 4   grass       265 non-null    float64       
 5   forb        269 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 12.9+ KB


## 2015

In [0]:
df_2010_15 = pd.read_csv(src_2010_15)

### Spring

In [0]:
df_2010_15.columns

Index(['GridPt', 'F10_1', 'F10_2', 'F10_3', 'F10_4', 'S11_1', 'S11_2', 'S11_3',
       'F11_1', 'F11_2', 'F11_3', 'S12_1', 'S12_2', 'S12_3', 'F12_1', 'F12_2',
       'F12_3', 'S13_1', 'S13_2', 'S13_3', 'F13_1', 'F13_2', 'F13_3', 'S14_1',
       'S14_2', 'S14_3', 'F14_1', 'F14_2', 'F14_3', 'S15_1', 'S15_2', 'S15_3',
       'F15_1', 'F15_2', 'F15_3'],
      dtype='object')

In [0]:
df_2015_spring = df_2010_15.loc[:, ['GridPt', 'S15_1', 'S15_2', 'S15_3']]

In [0]:
df_2015_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S15_1   114 non-null    float64
 2   S15_2   114 non-null    float64
 3   S15_3   114 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2015_spring['mean'] = df_2015_spring.loc[:, ['S15_1', 'S15_2', 'S15_3']].mean(axis=1)

In [0]:
# set season
df_2015_spring['season'] = 'spring'

In [0]:
# slice out individual measurements
df_2015_spring = df_2015_spring.loc[:, ['GridPt', 'mean', 'season']]

In [0]:
df_2015_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.903333,spring
1,3,1.083333,spring


### Fall

In [0]:
df_2010_15.columns

Index(['GridPt', 'F10_1', 'F10_2', 'F10_3', 'F10_4', 'S11_1', 'S11_2', 'S11_3',
       'F11_1', 'F11_2', 'F11_3', 'S12_1', 'S12_2', 'S12_3', 'F12_1', 'F12_2',
       'F12_3', 'S13_1', 'S13_2', 'S13_3', 'F13_1', 'F13_2', 'F13_3', 'S14_1',
       'S14_2', 'S14_3', 'F14_1', 'F14_2', 'F14_3', 'S15_1', 'S15_2', 'S15_3',
       'F15_1', 'F15_2', 'F15_3'],
      dtype='object')

In [0]:
df_2015_fall = df_2010_15.loc[:, ['GridPt', 'F15_1', 'F15_2', 'F15_3']]

In [0]:
df_2015_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   F15_1   119 non-null    float64
 2   F15_2   119 non-null    float64
 3   F15_3   116 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
# average grid_point measurement values
df_2015_fall['mean'] = df_2015_fall.loc[:, ['F15_1', 'F15_2', 'F15_3']].mean(axis=1)

In [0]:
# set season
df_2015_fall['season'] = 'fall'

In [0]:
# slice out individual measurements
df_2015_fall = df_2015_fall.loc[:, ['GridPt', 'mean', 'season']]

In [0]:
df_2015_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,6.873333,fall
1,3,5.91,fall


### Full Year

In [0]:
df_2015_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.903333,spring
1,3,1.083333,spring


In [0]:
df_2015_fall.head(2)


Unnamed: 0,GridPt,mean,season
0,2,6.873333,fall
1,3,5.91,fall


In [0]:
# combine fall and spring dataframes
df_2015 = pd.concat([df_2015_spring, df_2015_fall], ignore_index=True, sort=False)

In [0]:
# set 'date' to NaT
df_2015['date'] = pd.NaT

# set 'year'
df_2015['year'] = 2015

In [0]:
df_2015.head()

Unnamed: 0,GridPt,mean,season,date,year
0,2,1.903333,spring,NaT,2015
1,3,1.083333,spring,NaT,2015
2,4,1.106667,spring,NaT,2015
3,5,2.426667,spring,NaT,2015
4,6,4.586667,spring,NaT,2015


## 2014

### Spring

In [0]:
df_2014_spring = df_2010_15.loc[:, ['GridPt', 'S14_1', 'S14_2', 'S14_3']]

In [0]:
df_2014_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S14_1   32 non-null     float64
 2   S14_2   32 non-null     float64
 3   S14_3   32 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
# average individual measurements
df_2014_spring['mean'] = df_2014_spring.loc[:, ['S14_1', 'S14_2', 'S14_3']].mean(axis=1)

In [0]:
# slice out individual measurements
df_2014_spring = df_2014_spring.loc[:, ['GridPt', 'mean']]

In [0]:
# set season
df_2014_spring['season'] = 'spring'

In [0]:
df_2014_spring.dropna().head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.59,spring
5,7,4.866667,spring


### Fall

In [0]:
df_2014_fall = df_2010_15.loc[:, ['GridPt', 'F14_1', 'F14_2', 'F14_3']]

In [0]:
df_2014_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   F14_1   63 non-null     float64
 2   F14_2   63 non-null     float64
 3   F14_3   63 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
# average individual measurements
df_2014_fall['mean'] = df_2014_fall.loc[:, ['F14_1', 'F14_2', 'F14_3']].mean(axis=1)

In [0]:
# slice to keep desired columns
df_2014_fall = df_2014_fall.loc[:, ['GridPt', 'mean']]

In [0]:
# set season
df_2014_fall['season'] = 'fall'

In [0]:
df_2014_fall.dropna().head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.003333,fall
5,7,23.86,fall


### Full Year

In [0]:
df_2014_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.003333,fall
1,3,,fall


In [0]:
df_2014_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.59,spring
1,3,,spring


In [0]:
# combine fall and spring dataframes
df_2014 = pd.concat([df_2014_spring, df_2014_fall], ignore_index=True, sort=False)

In [0]:
# set 'date'
df_2014['date'] = pd.NaT

# set 'year'
df_2014['year'] = 2014

In [0]:
df_2014.dropna(subset=['mean']).head(2)

Unnamed: 0,GridPt,mean,season,date,year
0,2,1.59,spring,NaT,2014
5,7,4.866667,spring,NaT,2014


## 2013

### Spring

In [0]:
df_2013_spring = df_2010_15.loc[:, ['GridPt', 'S13_1', 'S13_2', 'S13_3']]

In [0]:
df_2013_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S13_1   126 non-null    float64
 2   S13_2   128 non-null    float64
 3   S13_3   128 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2013_spring['mean'] = df_2013_spring.loc[:, ['S13_1', 'S13_2', 'S13_3']].mean(axis=1)

In [0]:
# subselect to remove individual measurements
df_2013_spring = df_2013_spring.loc[:, ['GridPt', 'mean']]

In [0]:
# set season
df_2013_spring['season'] = 'spring'

In [0]:
df_2013_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.913333,spring
1,3,2.02,spring


### Fall

In [0]:
df_2013_fall = df_2010_15.loc[:, ['GridPt', 'F13_1', 'F13_2', 'F13_3']]

In [0]:
df_2013_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   F13_1   132 non-null    float64
 2   F13_2   129 non-null    float64
 3   F13_3   131 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2013_fall['mean'] = df_2013_fall.loc[:, ['F13_1', 'F13_2', 'F13_3']].mean(axis=1)

In [0]:
df_2013_fall = df_2013_fall.loc[:, ['GridPt', 'mean']]

In [0]:
# set 'season'
df_2013_fall['season'] = 'fall'

In [0]:
df_2013_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,6.176667,fall
1,3,21.676667,fall


### Full Year

In [0]:
df_2013_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,6.176667,fall
1,3,21.676667,fall


In [0]:
df_2013_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,1.913333,spring
1,3,2.02,spring


In [0]:
# combine fall and spring dataframes
df_2013 = pd.concat([df_2013_spring, df_2013_fall], ignore_index=True, sort=False)

In [0]:
df_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  596 non-null    int64  
 1   mean    261 non-null    float64
 2   season  596 non-null    object 
dtypes: float64(1), int64(1), object(1)
memory usage: 14.1+ KB


In [0]:
# set 'date'
df_2013['date'] = pd.NaT

# set 'year'
df_2013['year'] = 2013

In [0]:
df_2013.head(2)

Unnamed: 0,GridPt,mean,season,date,year
0,2,1.913333,spring,NaT,2013
1,3,2.02,spring,NaT,2013


## 2012

### Spring

In [0]:
df_2012_spring = df_2010_15.loc[:, ['GridPt', 'S12_1', 'S12_2', 'S12_3']]

In [0]:
# this show that 'S12_3' did not read in as a float datatype
df_2012_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S12_1   148 non-null    float64
 2   S12_2   148 non-null    float64
 3   S12_3   147 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 9.4+ KB


In [0]:
# this reveals a string '  -'
df_2012_spring.S12_3.unique()

array([nan, '7.30', '0.61', '0.00', '10.78', '3.54', '1.86', '4.66',
       '6.45', '0.90', '5.92', '2.14', '0.53', '2.72', '2.27', '2.10',
       '9.03', '7.29', '4.12', '1.16', '0.81', '4.23', '1.94', '1.88',
       '7.41', '1.29', '3.85', '20.26', '5.52', '7.42', '2.12', '2.22',
       '0.44', '0.75', '0.66', '6.27', '1.39', '6.77', '10.27', '10.48',
       '7.09', '14.22', '3.06', '7.25', '0.14', '3.95', '4.11', '6.59',
       '3.83', '1.45', '22.97', '2.16', '2.60', '5.90', '11.75', '5.29',
       '4.20', '9.99', '2.59', '6.31', '1.08', '4.73', '20.19', '2.54',
       '10.40', '4.42', '0.54', '20.57', '3.20', '8.90', '7.02', '13.87',
       '  -', '8.57', '22.71', '12.96', '2.25', '5.24', '1.96', '8.05',
       '0.56', '2.68', '7.11', '13.09', '6.25', '7.23', '4.39', '0.67',
       '2.55', '10.04', '4.75', '8.39', '1.40', '2.01', '2.11', '3.13',
       '9.10', '0.89', '1.84', '5.11', '1.63', '8.30', '17.29', '0.52',
       '0.70', '18.43', '12.11', '16.81', '12.29', '3.33', '27.45

In [0]:
# find the row with ' -'
df_2012_spring[df_2012_spring.S12_3 == '  -']

Unnamed: 0,GridPt,S12_1,S12_2,S12_3
165,211,2.99,2.85,-


In [0]:
# replace the ' -' in the row above with NaN
df_2012_spring.loc[165, 'S12_3'] = np.nan

In [0]:
# Update datatype to float from object
df_2012_spring = df_2012_spring.astype({'S12_3': 'float'})

In [0]:
# Now the data types look right
df_2012_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S12_1   148 non-null    float64
 2   S12_2   148 non-null    float64
 3   S12_3   146 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2012_spring['mean'] = df_2012_spring[['S12_1', 'S12_2', 'S12_3']].mean(axis=1)

In [0]:
df_2012_spring = df_2012_spring.loc[:, ['GridPt', 'mean']]

In [0]:
# set 'season' to spring
df_2012_spring['season'] = 'spring'

In [0]:
df_2012_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.875,spring
1,3,4.726667,spring


### Fall

In [0]:
df_2012_fall = df_2010_15[['GridPt', 'F12_1', 'F12_2', 'F12_3']]

In [0]:
df_2012_fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   F12_1   148 non-null    float64
 2   F12_2   148 non-null    float64
 3   F12_3   148 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2012_fall['mean'] = df_2012_fall.loc[:, ['F12_1', 'F12_2', 'F12_3']].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
df_2012_fall = df_2012_fall.loc[:, ['GridPt', 'mean']]

In [0]:
# set 'season' to fall
df_2012_fall['season'] = 'fall'

In [0]:
df_2012_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,13.003333,fall
1,3,11.276667,fall


### Full Year

In [0]:
df_2012_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.875,spring
1,3,4.726667,spring


In [0]:
df_2012_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,13.003333,fall
1,3,11.276667,fall


In [0]:
df_2012 = pd.concat([df_2012_spring, df_2012_fall], ignore_index=True, sort=False)

In [0]:
# set 'date'
df_2012['date'] = pd.NaT

# set 'year'
df_2012['year'] = 2012

In [0]:
df_2012.head(2)

Unnamed: 0,GridPt,mean,season,date,year
0,2,7.875,spring,NaT,2012
1,3,4.726667,spring,NaT,2012


## 2011

### Spring

In [0]:
df_2011_spring = df_2010_15.loc[:, ['GridPt', 'S11_1', 'S11_2', 'S11_3']]

In [0]:
# this shows that S11_3 is typed as an object
# further inspection shows a string value in "S11_3" of "BB"
df_2011_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S11_1   69 non-null     float64
 2   S11_2   69 non-null     float64
 3   S11_3   69 non-null     object 
dtypes: float64(2), int64(1), object(1)
memory usage: 9.4+ KB


In [0]:
# The row with "BB"
df_2011_spring.loc[227, "S11_3"] = np.nan

In [0]:
# Update datatype to float from object
df_2011_spring = df_2011_spring.astype({'S11_3': 'float'})

In [0]:
df_2011_spring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GridPt  298 non-null    int64  
 1   S11_1   69 non-null     float64
 2   S11_2   69 non-null     float64
 3   S11_3   68 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 9.4 KB


In [0]:
df_2011_spring['mean'] = df_2011_spring[['S11_1', 'S11_2', 'S11_3']].mean(axis=1)

In [0]:
df_2011_spring = df_2011_spring.loc[:, ['GridPt', 'mean']]

In [0]:
# set 'season' to spring
df_2011_spring['season'] = 'spring'

In [0]:
df_2011_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.096667,spring
1,3,3.836667,spring


### Fall

In [0]:
df_2011_fall = df_2010_15.loc[:, ['GridPt', 'F11_1', 'F11_2', 'F11_3']]

In [0]:
df_2011_fall['mean'] = df_2011_fall[['F11_1', 'F11_2', 'F11_3']].mean(axis=1)

In [0]:
df_2011_fall = df_2011_fall.loc[:, ['GridPt', 'mean']]

In [0]:
df_2011_fall['season'] = 'fall'

In [0]:
df_2011_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,8.99,fall
1,3,7.823333,fall


### Full Year

In [0]:
df_2011_spring.head(2)

Unnamed: 0,GridPt,mean,season
0,2,7.096667,spring
1,3,3.836667,spring


In [0]:
df_2011_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,8.99,fall
1,3,7.823333,fall


In [0]:
df_2011 = pd.concat([df_2011_spring, df_2011_fall], ignore_index=True, sort=False)

In [0]:
# set 'date'
df_2011['date'] = pd.NaT

# set 'year'
df_2011['year'] = 2011

In [0]:
df_2011.head(2)

Unnamed: 0,GridPt,mean,season,date,year
0,2,7.096667,spring,NaT,2011
1,3,3.836667,spring,NaT,2011


## 2010

### Fall

In [0]:
# df_2010 - Fall
df_2010_fall = df_2010_15.loc[:, ['GridPt', 'F10_1', 'F10_2', 'F10_3', 'F10_4']]

In [0]:
# df_2010_fall['mean']
df_2010_fall['mean'] = df_2010_fall[['F10_1', 'F10_2', 'F10_3', 'F10_4']].mean(axis=1)

In [0]:
df_2010_fall = df_2010_fall.loc[:, ['GridPt', 'mean']]

In [0]:
# set 'season' to fall
df_2010_fall['season'] = 'fall'

In [0]:
df_2010_fall.head(2)

Unnamed: 0,GridPt,mean,season
0,2,,fall
1,3,7.305,fall


### Full Year
* 2010 limited to Fall measurements

In [0]:
df_2010 = df_2010_fall.loc[:]

In [0]:
# set 'date'
df_2010['date'] = pd.NaT

# set 'year'
df_2010['year'] = 2010

In [0]:
df_2010.head(2)

Unnamed: 0,GridPt,mean,season,date,year
0,2,,fall,NaT,2010
1,3,7.305,fall,NaT,2010


# Concatenate Date Ranges

## 2019

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,pooled_g,year,season
0,9,2019-09-24,9.113296,2019,fall
1,10,2019-09-24,17.941478,2019,fall


In [0]:
# set grass and forb to NaN
df_2019['grass'] = np.nan
df_2019['forb'] = np.nan

In [0]:
# reorder columns to match specification
df_2019 = df_2019.loc[:, ['grid_point', 'date', 'year', 'season', 'grass', 'forb', 'pooled_g']]

In [0]:
# remove rows with nan in 'pooled_g'
df_2019 = df_2019.loc[df_2019.pooled_g.notnull()]

In [0]:
# update column names
df_2019.columns = ['grid_point', 'date', 'year', 'season', 'grass_g', 'forb_g', 'pooled_g']

In [0]:
df_2019.head(2)

Unnamed: 0,grid_point,date,year,season,grass_g,forb_g,pooled_g
0,9,2019-09-24,2019,fall,,,9.113296
1,10,2019-09-24,2019,fall,,,17.941478


## 2016 - 2018

In [0]:
df_2018_full_year.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,3,2018-04-10,2018,spring,13.44,0.0
1,4,2018-04-10,2018,spring,5.986667,0.0


In [0]:
df_2017_full_year.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,2,2017-04-05,2017,spring,1.713333,1.613333
1,3,2017-04-05,2017,spring,4.436667,4.343333


In [0]:
df_2016_full_year.head(2)

Unnamed: 0,grid_point,date,year,season,grass,forb
0,2,2016-03-28,2016,spring,2.56,0.22
1,3,2016-03-28,2016,spring,3.323333,0.683333


In [0]:
# combine annual DataFrames
df_2016_2018 = pd.concat([df_2018_full_year, df_2017_full_year, df_2016_full_year], ignore_index=True, sort=False)

In [0]:
# expand to include pooled_g column and set to NaN
df_2016_2018['pooled_g'] = np.nan

In [0]:
# update column names
df_2016_2018.columns = ['grid_point', 'date', 'year', 'season', 'grass_g', 'forb_g', 'pooled_g']

In [0]:
df_2016_2018.head(2)

Unnamed: 0,grid_point,date,year,season,grass_g,forb_g,pooled_g
0,3,2018-04-10,2018,spring,13.44,0.0,
1,4,2018-04-10,2018,spring,5.986667,0.0,


In [0]:
df_2016_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  683 non-null    int64         
 1   date        680 non-null    datetime64[ns]
 2   year        683 non-null    int64         
 3   season      683 non-null    object        
 4   grass_g     674 non-null    float64       
 5   forb_g      677 non-null    float64       
 6   pooled_g    0 non-null      float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 37.5+ KB


## 2010 - 2015

In [0]:
df_2010_2015 = pd.concat([df_2010, df_2011, df_2012, df_2013, df_2014, df_2015], ignore_index=True, sort=False)

In [0]:
# rename 'mean' to 'pooled_g'
df_2010_2015.columns = ['grid_point', 'pooled_g', 'season', 'date', 'year']

In [0]:
# set grass to NaN
df_2010_2015['grass'] = np.nan

# set forb to Nan
df_2010_2015['forb'] = np.nan

In [0]:
# reorder columns to match template schema
df_2010_2015 = df_2010_2015[['grid_point', 'date', 'year', 'season', 'grass', 'forb', 'pooled_g']]

In [0]:
# remove rows with 'pooled_g' NaN
df_2010_2015 = df_2010_2015.loc[df_2010_2015.pooled_g.notnull()]

In [0]:
df_2010_2015.columns = ['grid_point', 'date', 'year', 'season', 'grass_g', 'forb_g', 'pooled_g']

In [0]:
df_2010_2015.head(2)

Unnamed: 0,grid_point,date,year,season,grass_g,forb_g,pooled_g
1,3,NaT,2010,fall,,,7.305
2,4,NaT,2010,fall,,,1.5125


# 2010 - 2020

In [0]:
df_2010_2020 = pd.concat([df_2020_spring, df_2019, df_2016_2018, df_2010_2015], ignore_index=True, sort=False)

In [0]:
df_2010_2020.head()

Unnamed: 0,grid_point,date,year,season,grass_g,forb_g,pooled_g
0,9,,2020,spring,0.0,0.0,5.063333
1,10,,2020,spring,0.0,0.0,2.87
2,11,,2020,spring,0.0,0.0,5.806667
3,12,,2020,spring,0.0,0.0,7.173333
4,19,,2020,spring,0.0,0.0,5.35


In [0]:
df_2010_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2458 entries, 0 to 2457
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   grid_point  2458 non-null   int64  
 1   date        989 non-null    object 
 2   year        2458 non-null   int64  
 3   season      2458 non-null   object 
 4   grass_g     836 non-null    float64
 5   forb_g      839 non-null    float64
 6   pooled_g    1775 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 134.5+ KB


# Push to BigQuery

[Documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html)

In [0]:
# BE CAREFUL RUNNING THIS CELL
# df_2010_2020.to_gbq('vegetation_biomass.vegetation_biomass', 'mpg-data-warehouse', if_exists='replace')

1it [00:10, 10.36s/it]
