<a href="https://colab.research.google.com/github/samsoe/mpg_notebooks/blob/master/horizontal_cover_robel_wrangle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description

## Documentation
* [Readme vegetation biomass](https://docs.google.com/document/d/1AF8qh7YRqEJ_5DjX0GrA3Z52RfGp2f22Jc2uLonGa2I/edit#heading=h.4py25fcybipz)

## Schema

* grid_point: INT
* date: DATETIME
* year: INT ("yyyy")
* season: STRING ("spring", "fall")
* robel_avg: FLOAT

# Load Tools

In [0]:
import pandas as pd

# Data Source

## 2020

In [0]:
# Source file name: '2020-04-21_spring_biomass_data - Sheet1.csv'
# Google Drive `uc?id=` link to that file; it is read with pd.read_csv in the Wrangle Data section.
src_2020_spring = 'https://drive.google.com/uc?id=1Q1SvzROHySWxIjG7qSFYK9luwJB6RIJ0'

## 2019

In [0]:
# Source workbook name: '2019 Biomass Data.xlsx'
# Drive share link (same file id): https://drive.google.com/open?id=1dd5F8R-mD6l7H2morac1r9ongf80xpff
# `uc?id=` form of the link, which is what pd.read_excel is given
src_2019 = 'https://drive.google.com/uc?id=1dd5F8R-mD6l7H2morac1r9ongf80xpff'
# worksheet of the workbook that is loaded
sheet_2019 = 'All Data - redone'

## 2018

In [0]:
# Source workbook name: 'MPG vegetation biomass 2010-1018.xlsx'
# NOTE(review): '1018' is the name as recorded here; possibly a typo for 2018 - confirm against Drive
# Drive share link (same file id): https://drive.google.com/open?id=1a9Kzg6DF8c4KEkCzBVFaBOjzW5zp2B0L
# `uc?id=` form of the link (the same URL is used for src_2017)
src_2018 = 'https://drive.google.com/uc?id=1a9Kzg6DF8c4KEkCzBVFaBOjzW5zp2B0L'
# one worksheet per season
sheet_2018_spring = '2018 Spring'
sheet_2018_fall = '2018 Fall'

## 2017

In [0]:
# Source workbook name: 'MPG vegetation biomass 2010-1018.xlsx'
# NOTE(review): '1018' is the name as recorded here; possibly a typo for 2018 - confirm against Drive
# Drive share link (same file id): https://drive.google.com/open?id=1a9Kzg6DF8c4KEkCzBVFaBOjzW5zp2B0L
# `uc?id=` form of the link (the same URL is used for src_2018)
src_2017 = 'https://drive.google.com/uc?id=1a9Kzg6DF8c4KEkCzBVFaBOjzW5zp2B0L'
# one worksheet per season
sheet_2017_spring = '2017 Spring'
sheet_2017_fall = '2017 Fall'

## 2016

In [0]:
# Source workbook name: MPGBiomass_2016_013017.xlsx
# Google Drive `uc?id=` link to that workbook
src_2016 = 'https://drive.google.com/uc?id=1t2JdFItRJRY8PjjSGcm3FaPgB64qMLG3'
# one worksheet per season
sheet_2016_spring = 'Spring16_Pellets-Robel'
sheet_2016_fall = 'Fall16_Pellets-Robel'

## 2011 - 2015

In [0]:
# Source workbook name: 'MPGBiomass_2010-2015_MASTER_031116.xlsx'
# Google Drive `uc?id=` link to that workbook
src_2011_2015 = 'https://drive.google.com/uc?id=1xk-TCLUmxDL0DcVe8KDBhi8ldn1S9_ni'
# worksheet of the workbook that is loaded
sheet_2011_2015 = 'Non-RestorationPts-Avg_SPSS'

# Wrangle Data

## 2020

### Spring

In [0]:
# load the spring 2020 CSV from its Drive link
df_2020_spring = pd.read_csv(filepath_or_buffer=src_2020_spring)

In [0]:
# keep the plot id, the year/season labels and the four robel_* readings
df_2020_spring = df_2020_spring.loc[
    :,
    ['Plot', 'year', 'season', 'robel_N', 'robel_E', 'robel_S', 'robel_W'],
]

In [0]:
# row-wise mean of the four robel_* readings
df_2020_spring['robel_avg'] = (
    df_2020_spring[['robel_N', 'robel_E', 'robel_S', 'robel_W']].mean(axis='columns')
)

In [0]:
# the individual robel_* readings are no longer needed once averaged
df_2020_spring = df_2020_spring.loc[
    :,
    ['Plot', 'year', 'season', 'robel_avg'],
]

In [0]:
# rename columns to conform to schema
# (labels are assigned by position: 'Plot' becomes 'grid_point', the other three names are unchanged)
df_2020_spring.columns = ['grid_point', 'year', 'season', 'robel_avg']

In [0]:
# dates are not available for this file; add the schema's 'date' column filled with NaT
df_2020_spring['date'] = pd.NaT

In [0]:
# put the columns in schema order
df_2020_spring = df_2020_spring.loc[
    :,
    ['grid_point', 'date', 'year', 'season', 'robel_avg'],
]

In [108]:
# preview the first two wrangled rows
df_2020_spring.head(n=2)

Unnamed: 0,grid_point,date,year,season,robel_avg
0,9,NaT,2020,spring,1.25
1,10,NaT,2020,spring,7.75


## 2019

In [0]:
# load the 2019 worksheet named by sheet_2019
df_2019 = pd.read_excel(io=src_2019, sheet_name=sheet_2019)

In [110]:
# preview the raw 2019 sheet
df_2019.head(n=2)

Unnamed: 0,Plot,Date,Robel Reading,Unnamed: 3,Unnamed: 4,Unnamed: 5,Biomass Weight (g-spring)(pennyweight for fall),Unnamed: 7,Unnamed: 8,Deer,Elk,Horse,Biomass,Unnamed: 13,Robel,Unnamed: 15,Unnamed: 16
0,9,2019-09-24,4.0,8.0,6.0,14.0,6.82,5.78,4.98,2.0,3.0,2.0,23.44,36.451544,8.0,,23.44
1,10,2019-09-24,6.0,4.0,2.0,4.0,15.95,1.36,17.3,0.0,1.0,2.0,46.146667,71.762681,4.0,,93.76


In [111]:
# list the raw header labels (several columns are labelled 'Unnamed: N')
df_2019.columns

Index(['Plot', 'Date', 'Robel Reading', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', '   Biomass Weight (g-spring)(pennyweight for fall) ',
       'Unnamed: 7', 'Unnamed: 8', 'Deer', 'Elk', 'Horse', 'Biomass',
       'Unnamed: 13', 'Robel', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [0]:
# keep the plot id, the survey date and the 'Robel' column
df_2019 = df_2019.loc[
    :,
    ['Plot', 'Date', 'Robel'],
]

In [113]:
# preview the three retained columns
df_2019.head(n=2)

Unnamed: 0,Plot,Date,Robel
0,9,2019-09-24,8.0
1,10,2019-09-24,4.0


In [0]:
# standardize column names to the schema
# (labels are assigned by position: Plot -> grid_point, Date -> date, Robel -> robel_avg)
df_2019.columns = ['grid_point', 'date', 'robel_avg']

In [115]:
# inspect dtypes and non-null counts (some dates and some Robel values are missing)
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  347 non-null    int64         
 1   date        325 non-null    datetime64[ns]
 2   robel_avg   319 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 8.3 KB


In [116]:
# One row has a Robel value but no date; in the sheet it sits in the
# midst of a consistent stream of 9/24/2019 entries.
df_2019.loc[df_2019['date'].isna() & ~df_2019['robel_avg'].isna()]

Unnamed: 0,grid_point,date,robel_avg
102,197,NaT,14.0


In [0]:
# give rows that have a Robel value but no date the date 2019-09-24
df_2019.loc[
    df_2019['date'].isna() & ~df_2019['robel_avg'].isna(),
    'date',
] = pd.Timestamp('2019-09-24')

In [118]:
# confirm the date is now filled in on row 102 (the row whose date was missing)
df_2019.iloc[102]

grid_point                    197
date          2019-09-24 00:00:00
robel_avg                      14
Name: 102, dtype: object

In [0]:
# set seasons from the survey date, split at the start of June 2019
# (the string '2019-06' is compared as 2019-06-01 00:00; rows whose date is NaT
# match neither comparison and keep a null season)
## spring: strictly before 1 June
df_2019.loc[(df_2019.date < '2019-06'), 'season'] = 'spring'

## fall: 1 June onwards (>= so a reading dated exactly 2019-06-01 is not left unlabelled)
df_2019.loc[(df_2019.date >= '2019-06'), 'season'] = 'fall'

In [0]:
# set year: every row of this frame comes from the 2019 workbook
df_2019['year'] = 2019

In [0]:
# arrange the columns in schema order
df_2019 = df_2019[
    ['grid_point', 'date', 'year', 'season', 'robel_avg']
]

In [122]:
# preview the schema-conformant 2019 frame
df_2019.head(n=2)

Unnamed: 0,grid_point,date,year,season,robel_avg
0,9,2019-09-24,2019,fall,8.0
1,10,2019-09-24,2019,fall,4.0


## 2018

### Spring

In [0]:
# load the spring 2018 worksheet; skiprows=1 skips the sheet's first row
df_2018_spring = pd.read_excel(
    io=src_2018,
    sheet_name=sheet_2018_spring,
    skiprows=1,
)

In [124]:
# preview the raw spring 2018 sheet
df_2018_spring.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W,60,180,330,Plot.1,Species,% cover,Deer,Elk,Horse,grams,grams.1,grams.2,grams.3,grams.4,grams.5,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,3,2018-04-10,2.0,4.0,2.0,2.0,G,G,G,,,,2.0,0.0,1.0,9.33,0.0,20.55,0.0,10.44,0.0,,,,,
1,4,2018-04-10,2.0,2.0,4.0,2.0,G,G,G,,,,2.0,6.0,0.0,2.31,0.0,3.44,0.0,12.21,0.0,,,,,


In [0]:
# keep the plot id, the date and the N/E/S/W reading columns
df_2018_spring = df_2018_spring.loc[
    :,
    ['Plot', 'Date', 'N', 'E', 'S', 'W'],
]

In [126]:
# preview the retained spring 2018 columns
df_2018_spring.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W
0,3,2018-04-10,2.0,4.0,2.0,2.0
1,4,2018-04-10,2.0,2.0,4.0,2.0


### Fall

In [0]:
# load the fall 2018 worksheet; skiprows=1 skips the sheet's first row
df_2018_fall = pd.read_excel(
    io=src_2018,
    sheet_name=sheet_2018_fall,
    skiprows=1,
)

In [0]:
# keep the plot id, the date and the N/E/S/W reading columns
df_2018_fall = df_2018_fall.loc[
    :,
    ['Plot', 'Date', 'N', 'E', 'S', 'W'],
]

In [129]:
# preview the retained fall 2018 columns
df_2018_fall.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W
0,70,2018-10-15,14,7,8,3
1,71,2018-10-15,1,1,1,2


### Full Year

In [0]:
# stack the spring rows on top of the fall rows under a fresh 0..n-1 index
df_2018 = pd.concat(
    [df_2018_spring, df_2018_fall],
    ignore_index=True,
    sort=False,
)

In [0]:
# row-wise mean of the N/E/S/W readings
df_2018['robel_avg'] = (
    df_2018[['N', 'E', 'S', 'W']].mean(axis='columns')
)

In [0]:
# the individual readings are no longer needed once averaged
df_2018 = df_2018.loc[
    :,
    ['Plot', 'Date', 'robel_avg'],
]

In [0]:
# standardize column names to the schema
# (labels are assigned by position: Plot -> grid_point, Date -> date)
df_2018.columns = ['grid_point', 'date', 'robel_avg']

In [0]:
# set column 'year' value: both concatenated sheets are 2018 sheets
df_2018['year'] = 2018

In [0]:
# set column 'season' from the survey date, split at the start of June 2018
# (the string '2018-06' is compared as 2018-06-01 00:00; rows whose date is NaT
# match neither comparison and keep a null season)
## spring: strictly before 1 June
df_2018.loc[(df_2018.date < '2018-06'), 'season'] = 'spring'

## fall: 1 June onwards (>= so a reading dated exactly 2018-06-01 is not left unlabelled)
df_2018.loc[(df_2018.date >= '2018-06'), 'season'] = 'fall'

In [0]:
# arrange the columns in schema order
df_2018 = df_2018[
    ['grid_point', 'date', 'year', 'season', 'robel_avg']
]

In [137]:
# preview the schema-conformant 2018 frame
df_2018.head(n=2)

Unnamed: 0,grid_point,date,year,season,robel_avg
0,3,2018-04-10,2018,spring,3.0
1,4,2018-04-10,2018,spring,2.0


## 2017

### Spring

In [0]:
# load the spring 2017 worksheet; skiprows=1 skips the sheet's first row
df_2017_spring = pd.read_excel(
    io=src_2017,
    sheet_name=sheet_2017_spring,
    skiprows=1,
)

In [0]:
# keep the plot id, the date and the N/E/S/W reading columns
df_2017_spring = df_2017_spring.loc[
    :,
    ['Plot', 'Date', 'N', 'E', 'S', 'W'],
]

In [140]:
# preview the retained spring 2017 columns
df_2017_spring.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W
0,2,2017-04-05,2.0,1.0,1.0,1.0
1,3,2017-04-05,2.0,3.0,2.0,16.0


### Fall

In [0]:
# load the fall 2017 worksheet; skiprows=1 skips the sheet's first row
df_2017_fall = pd.read_excel(
    io=src_2017,
    sheet_name=sheet_2017_fall,
    skiprows=1,
)

In [0]:
# keep the plot id, the date and the N/E/S/W reading columns
df_2017_fall = df_2017_fall.loc[
    :,
    ['Plot', 'Date', 'N', 'E', 'S', 'W'],
]

In [143]:
# preview the retained fall 2017 columns
df_2017_fall.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W
0,3,2017-10-09,4.0,2.0,2.0,2.0
1,4,2017-10-09,6.0,3.0,2.0,4.0


### Full Year

In [0]:
# stack the spring rows on top of the fall rows under a fresh 0..n-1 index
df_2017 = pd.concat(
    [df_2017_spring, df_2017_fall],
    ignore_index=True,
    sort=False,
)

In [145]:
# preview the combined 2017 frame
df_2017.head(n=2)

Unnamed: 0,Plot,Date,N,E,S,W
0,2,2017-04-05,2.0,1.0,1.0,1.0
1,3,2017-04-05,2.0,3.0,2.0,16.0


In [0]:
# remove rows that have no date
# .copy() gives df_2017 its own data, so the column assignments in the following
# cells act on an independent frame rather than on a boolean-filtered selection
# of the concatenated one (the situation that raises SettingWithCopyWarning)
df_2017 = df_2017.loc[~df_2017.Date.isnull()].copy()

In [0]:
# set 'year' column: both concatenated sheets are 2017 sheets
df_2017['year'] = 2017

In [0]:
# set 'season' column from the survey date, split at the start of June 2017
# (the string '2017-06' is compared as 2017-06-01 00:00)

## spring: strictly before 1 June
df_2017.loc[df_2017.Date < '2017-06', 'season'] = 'spring'
## fall: 1 June onwards (>= so a reading dated exactly 2017-06-01 is not left unlabelled)
df_2017.loc[df_2017.Date >= '2017-06', 'season'] = 'fall'

In [0]:
# row-wise mean of the N/E/S/W readings
df_2017['robel_avg'] = (
    df_2017[['N', 'E', 'S', 'W']].mean(axis='columns')
)

In [0]:
# drop the individual readings and put the remaining columns in schema order
df_2017 = df_2017[
    ['Plot', 'Date', 'year', 'season', 'robel_avg']
]

In [0]:
# rename columns to the schema names
# (labels are assigned by position: Plot -> grid_point, Date -> date; the rest are unchanged)
df_2017.columns = ['grid_point', 'date', 'year', 'season', 'robel_avg']

In [152]:
# preview the schema-conformant 2017 frame
df_2017.head(n=2)

Unnamed: 0,grid_point,date,year,season,robel_avg
0,2,2017-04-05,2017,spring,1.25
1,3,2017-04-05,2017,spring,5.75


## 2016

### Spring

In [0]:
# read the spring 2016 worksheet into a DataFrame
df_2016_spring = pd.read_excel(io=src_2016, sheet_name=sheet_2016_spring)

In [0]:
# keep only the rows that have a 'Point' value
df_2016_spring = df_2016_spring.loc[df_2016_spring['Point'].notnull()]

In [0]:
# keep the point id and the sheet's 'AvgRobel' column
df_2016_spring = df_2016_spring.loc[
    :,
    ['Point', 'AvgRobel'],
]

In [0]:
# rename columns to the schema names
# (labels are assigned by position: Point -> grid_point, AvgRobel -> robel_avg)
df_2016_spring.columns = ['grid_point', 'robel_avg']

In [0]:
# set value for date: no date column is taken from this sheet, so fill with NaT
df_2016_spring['date'] = pd.NaT

# set value for season: every row here comes from the spring worksheet
df_2016_spring['season'] = 'spring'

# set value for year
df_2016_spring['year'] = 2016

In [0]:
# put the columns in schema order
df_2016_spring = df_2016_spring.loc[
    :,
    ['grid_point', 'date', 'year', 'season', 'robel_avg'],
]

In [0]:
# cast grid_point from float to a 32-bit integer
df_2016_spring = df_2016_spring.astype(dtype={'grid_point': 'int32'})

In [160]:
# confirm grid_point is now int32 and that the date column holds no values
df_2016_spring.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134 entries, 0 to 299
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  134 non-null    int32         
 1   date        0 non-null      datetime64[ns]
 2   year        134 non-null    int64         
 3   season      134 non-null    object        
 4   robel_avg   134 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(1)
memory usage: 5.8+ KB


### Fall

In [0]:
# load the fall 2016 worksheet into a DataFrame
# (sheet passed by keyword, consistent with every other read_excel call in this notebook)
df_2016_fall = pd.read_excel(src_2016, sheet_name=sheet_2016_fall)

In [0]:
# keep only the rows that have a 'Point' value
df_2016_fall = df_2016_fall.loc[df_2016_fall['Point'].notnull()]

In [0]:
# keep the point id and the sheet's 'AvgRobel' column; everything else is dropped
df_2016_fall = df_2016_fall.loc[
    :,
    ['Point', 'AvgRobel'],
]

In [0]:
# cast the 'Point' column from float to a 32-bit integer
df_2016_fall = df_2016_fall.astype(dtype={'Point': 'int32'})

In [0]:
# rename columns to the schema names
# (labels are assigned by position: Point -> grid_point, AvgRobel -> robel_avg)
df_2016_fall.columns = ['grid_point', 'robel_avg']

In [0]:
# set value for date: no date column is taken from this sheet, so fill with NaT
df_2016_fall['date'] = pd.NaT

# set value for season: every row here comes from the fall worksheet
df_2016_fall['season'] = 'fall'

# set value for year
df_2016_fall['year'] = 2016

In [0]:
# put the columns in schema order
df_2016_fall = df_2016_fall.loc[
    :,
    ['grid_point', 'date', 'year', 'season', 'robel_avg'],
]

In [168]:
# confirm grid_point is int32 and that the date column holds no values
df_2016_fall.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133 entries, 0 to 299
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  133 non-null    int32         
 1   date        0 non-null      datetime64[ns]
 2   year        133 non-null    int64         
 3   season      133 non-null    object        
 4   robel_avg   133 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(1)
memory usage: 5.7+ KB


### Full Year

In [0]:
# stack the spring rows on top of the fall rows under a fresh 0..n-1 index
df_2016 = pd.concat(
    [df_2016_spring, df_2016_fall],
    ignore_index=True,
    sort=False,
)

In [170]:
# check the combined 2016 frame: row count, dtypes and non-null counts
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  267 non-null    int32         
 1   date        0 non-null      datetime64[ns]
 2   year        267 non-null    int64         
 3   season      267 non-null    object        
 4   robel_avg   267 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(1)
memory usage: 9.5+ KB


## 2011 - 2015

In [0]:
# read the 2011-2015 robel worksheet into a DataFrame
df_2011_2015 = pd.read_excel(
    io=src_2011_2015,
    sheet_name=sheet_2011_2015,
)

In [0]:
# keep the grid point, the season/year labels and the Robel value
df_2011_2015 = df_2011_2015.loc[
    :,
    ['GridPt', 'Season', 'Year', 'Robel'],
]

In [0]:
# 2010 has no robel data, so keep only the rows after 2010
df_2011_2015 = df_2011_2015[df_2011_2015['Year'] > 2010]

In [0]:
# subselect to rows with Robel data
# .copy() gives df_2011_2015 its own data, so the column additions and label
# updates in the following cells act on an independent frame rather than on a
# boolean-filtered selection (the situation that raises SettingWithCopyWarning)
df_2011_2015 = df_2011_2015[df_2011_2015.Robel.notnull()].copy()

In [175]:
# preview the filtered 2011-2015 rows
df_2011_2015.head(n=2)

Unnamed: 0,GridPt,Season,Year,Robel
218,2,Fall,2011,0.0
219,3,Fall,2011,3.333333


In [0]:
# rename to the schema names, keeping the frame's current column order
# (target schema order is 'grid_point', 'date', 'year', 'season', 'robel_avg';
#  the 'date' column is added and the columns are reordered in later cells)
df_2011_2015.columns = ['grid_point', 'season', 'year', 'robel_avg']

In [0]:
# add the schema's 'date' column, filled with NaT
df_2011_2015['date'] = pd.NaT

In [0]:
# lower-case the two season labels so they match the schema spelling
df_2011_2015['season'] = df_2011_2015['season'].replace(
    {'Fall': 'fall', 'Spring': 'spring'}
)

In [0]:
# put the columns in schema order
df_2011_2015 = df_2011_2015.loc[
    :,
    ['grid_point', 'date', 'year', 'season', 'robel_avg'],
]

In [180]:
# check dtypes and non-null counts (the date column holds no values for these years)
df_2011_2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584 entries, 218 to 2397
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  584 non-null    int64         
 1   date        0 non-null      datetime64[ns]
 2   year        584 non-null    int64         
 3   season      584 non-null    object        
 4   robel_avg   584 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 27.4+ KB


# Concatenate all Years

In [0]:
# stack every per-period frame, newest first, under a fresh 0..n-1 index
df_2011_2020 = pd.concat(
    [
        df_2020_spring,
        df_2019,
        df_2018,
        df_2017,
        df_2016,
        df_2011_2015,
    ],
    ignore_index=True,
    sort=False,
)

In [186]:
# list the distinct years present in the combined frame
df_2011_2020['year'].unique()

array([2020, 2019, 2018, 2017, 2016, 2011, 2012, 2013, 2014, 2015])

In [187]:
# inspect the combined frame; season and robel_avg still contain nulls at this point
df_2011_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1768 entries, 0 to 1767
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  1768 non-null   int64         
 1   date        733 non-null    datetime64[ns]
 2   year        1768 non-null   int64         
 3   season      1746 non-null   object        
 4   robel_avg   1738 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 69.2+ KB


In [0]:
# keep only the rows that have a robel_avg value
df_2011_2020 = df_2011_2020[df_2011_2020['robel_avg'].notnull()]

In [189]:
# confirm that no null season or robel_avg values remain
df_2011_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1738 entries, 0 to 1767
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   grid_point  1738 non-null   int64         
 1   date        725 non-null    datetime64[ns]
 2   year        1738 non-null   int64         
 3   season      1738 non-null   object        
 4   robel_avg   1738 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 81.5+ KB


# Push to BigQuery

In [190]:
# write the combined frame to BigQuery, replacing the table if it already exists
df_2011_2020.to_gbq(
    destination_table='vegetation_biomass.horizontal_cover_robel',
    project_id='mpg-data-warehouse',
    if_exists='replace',
)

1it [00:21, 21.29s/it]
