In [36]:
# concatenate biomass for all years in long format with columns ['gridpt', 'year', 'season', 'pt_avg']
# pt_avg values are in grams (g)

# Load Tools

In [532]:
import pandas as pd
import glob
import numpy as np
from functools import reduce

# Collect Data Sheets

In [12]:
path = '../src/biomass'
# The cells below index the loaded frames by position (li[0] is 2017 Spring,
# li[1] is 2018 Spring, ...). glob.glob() returns files in arbitrary,
# platform-dependent order, so the sheet order is pinned explicitly here to
# keep those positional lookups reproducible on any machine.
sheets = ['2017 Spring', '2018 Spring', '2017 Fall', '2016 Fall',
          '2018 Fall', '2016 Spring', '2010-2015']
all_files = [path + '/MPG vegetation biomass 2010-2018 - ' + sheet + '.csv'
             for sheet in sheets]

In [13]:
# display the CSV paths; `li` is filled in this order and indexed by position later
all_files

['../src/biomass/MPG vegetation biomass 2010-2018 - 2017 Spring.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2018 Spring.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2017 Fall.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2016 Fall.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2018 Fall.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2016 Spring.csv',
 '../src/biomass/MPG vegetation biomass 2010-2018 - 2010-2015.csv']

# Read into DataFrames

In [15]:
# accumulator for the raw DataFrames, one per path in all_files
li = []

In [16]:
# Read every sheet into its own DataFrame, in all_files order.
# Building the list in one expression keeps this cell idempotent: re-running
# it no longer appends a second copy of every sheet to an existing list.
li = [pd.read_csv(filename) for filename in all_files]

# Process DataFrames

### 2017 Spring Sheet

In [554]:
# drop the columns that are not needed for the per-site grass/forb totals
df = li[0].drop(
    ['Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
     'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants', 'Unnamed: 10',
     'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14'],
    axis='columns',
)

In [555]:
# the first (unnamed) column holds the grid point identifier
df = df.rename({'Unnamed: 0': 'gridpt'}, axis='columns')

In [556]:
# the row labelled 0 is a second header line, not data
df = df.drop(index=0)

In [557]:
# coerce every column to a numeric dtype; entries that cannot be parsed become NaN
cols = df.columns
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [558]:
# total biomass per site = G (grass) + F (forbs); stored under the site number
sites = [90, 200, 280]
for site in sites:
    df[site] = df['{} G'.format(site)] + df['{} F'.format(site)]

In [559]:
# the per-type columns are no longer needed once the site totals exist
df = df.drop(columns=[str(site) + kind for site in sites for kind in (' G', ' F')])

In [560]:
# long format: one row per (gridpt, site) with the total in `value`
df = df.melt(id_vars=['gridpt'])

In [561]:
# calculate average for the 3 sites at each gridpt
# Only `value` is averaged: after melt() the `variable` column holds the site
# labels, and averaging those is meaningless (and version-dependent in pandas).
grouped = df.groupby('gridpt')[['value']].mean()

In [562]:
# move gridpt from the group index back into a regular column
grouped = grouped.reset_index()

In [563]:
# tag the sheet with its survey year and season before it joins the other sheets
grouped = grouped.assign(year=2017, season='spring')

In [565]:
# Select the output columns in their final order and rename the averaged value.
# The rename is chained so it returns a new frame; calling rename(inplace=True)
# on a column subset can raise SettingWithCopyWarning.
df = grouped[['gridpt', 'year', 'season', 'value']].rename(columns={'value': 'pt_avg'})

In [285]:
# build new year archive: list of reshaped per-sheet frames, starting with
# this one; the following sections append theirs
new_li = [df]

### 2018 Spring

In [286]:
# drop the columns that are not needed for the per-site grass/forb totals
df = li[1].drop(
    ['Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
     'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants', 'Unnamed: 10',
     'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 21',
     'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'],
    axis='columns',
)

In [287]:
# the first (unnamed) column holds the grid point identifier
df = df.rename({'Unnamed: 0': 'gridpt'}, axis='columns')

In [288]:
# the row labelled 0 is a second header line, not data
df = df.drop(index=0)

In [290]:
# coerce every column to a numeric dtype; entries that cannot be parsed become NaN
cols = df.columns
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [291]:
# total biomass per site = G (grass) + F (forbs); stored under the site number
sites = [60, 180, 330]
for site in sites:
    df[site] = df['{}G'.format(site)] + df['{}F'.format(site)]

In [293]:
# the per-type columns are no longer needed once the site totals exist
df = df.drop(columns=[str(site) + kind for site in sites for kind in ('G', 'F')])

In [296]:
# long format: one row per (gridpt, site) with the total in `value`
df = df.melt(id_vars=['gridpt'])

In [298]:
# Average the site totals at each gridpt. Only `value` is averaged: after
# melt() the `variable` column holds the site labels, and averaging those is
# meaningless (and version-dependent in pandas).
grouped = df.groupby('gridpt')[['value']].mean()

In [300]:
# move gridpt from the group index back into a regular column
grouped = grouped.reset_index()

In [301]:
# tag the sheet with its survey year and season before it joins the other sheets
grouped = grouped.assign(year=2018, season='spring')

In [302]:
# Select the output columns in their final order and rename the averaged value.
# The rename is chained so it returns a new frame; calling rename(inplace=True)
# on a column subset can raise SettingWithCopyWarning.
df = grouped[['gridpt', 'year', 'season', 'value']].rename(columns={'value': 'pt_avg'})

In [304]:
# add the reshaped 2018 spring frame to the archive
# (re-running this cell appends a duplicate)
new_li.append(df)

### 2017 Fall

In [312]:
# drop the columns that are not needed for the per-site grass/forb totals
df = li[2].drop(
    ['Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
     'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants', 'Unnamed: 10',
     'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14'],
    axis='columns',
)

In [313]:
# the row labelled 0 is a second header line, not data
df = df.drop(index=0)

In [322]:
# the first (unnamed) column holds the grid point identifier
df = df.rename({'Unnamed: 0': 'gridpt'}, axis='columns')

In [315]:
# coerce every column to a numeric dtype; entries that cannot be parsed become NaN
cols = df.columns
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [318]:
# total biomass per site = G (grass) + F (forbs); stored under the site number
sites = [0, 170, 270]
for site in sites:
    df[site] = df['{}G'.format(site)] + df['{}F'.format(site)]

In [320]:
# the per-type columns are no longer needed once the site totals exist
df = df.drop(columns=[str(site) + kind for site in sites for kind in ('G', 'F')])

In [324]:
# long format: one row per (gridpt, site) with the total in `value`
df = df.melt(id_vars=['gridpt'])

In [326]:
# Average the site totals at each gridpt. Only `value` is averaged: after
# melt() the `variable` column holds the site labels, and averaging those is
# meaningless (and version-dependent in pandas).
grouped = df.groupby('gridpt')[['value']].mean()

In [330]:
# move gridpt from the group index back into a regular column
grouped = grouped.reset_index()

In [331]:
# tag the sheet with its survey year and season before it joins the other sheets
grouped = grouped.assign(year=2017, season='fall')

In [332]:
# Select the output columns in their final order and rename the averaged value.
# The rename is chained so it returns a new frame; calling rename(inplace=True)
# on a column subset can raise SettingWithCopyWarning.
df = grouped[['gridpt', 'year', 'season', 'value']].rename(columns={'value': 'pt_avg'})

In [334]:
# add the reshaped 2017 fall frame to the archive
# (re-running this cell appends a duplicate)
new_li.append(df)

### 2016 Fall

In [356]:
# peek at the raw sheet: it is already long format (one row per plot, bearing and type)
li[3].head(2)

Unnamed: 0,Date,Plot,Degrees,Type,Weight
0,10/11/2016,2,10,forb,1.14
1,10/11/2016,2,10,grass,7.75


In [336]:
# the sampling date is not used in the aggregation
df = li[3].drop('Date', axis='columns')

In [338]:
# ensure weight values are all numeric; entries that cannot be parsed become NaN
# (pd.to_numeric is applied to the whole column at once rather than
# element by element through Series.apply)
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce')

In [341]:
# sum forb and grass weights for each sample site (plot + bearing)
# Only `Weight` is aggregated: summing the remaining columns (e.g. the string
# `Type` column) is meaningless and makes the later mean() fail on pandas >= 2.
# NOTE(review): sum() counts NaN weights as 0; pass min_count=1 if a site with
# no valid weight should stay NaN - confirm the intended treatment.
df = df.groupby(['Plot', 'Degrees'])[['Weight']].sum()

In [345]:
# average sample sites for each plot
# `Weight` is selected explicitly so non-numeric leftovers from earlier steps
# (such as `Type`) cannot break or pollute the mean.
df = df.groupby(['Plot'])[['Weight']].mean()

In [347]:
# tag the sheet with its survey year and season
df = df.assign(year=2016, season='fall')

In [349]:
# move Plot from the group index back into a regular column
df = df.reset_index()

In [353]:
# Select the output columns in their final order and rename them to the shared
# schema. The rename is chained so it returns a new frame; calling
# rename(inplace=True) on a column subset can raise SettingWithCopyWarning.
df = df[['Plot', 'year', 'season', 'Weight']].rename(
    columns={'Plot': 'gridpt', 'Weight': 'pt_avg'})

In [357]:
# add the reshaped 2016 fall frame to the archive
# (re-running this cell appends a duplicate)
new_li.append(df)

### 2018 Fall

In [361]:
# drop the columns that are not needed for the per-site grass/forb totals
df = li[4].drop(
    ['Unnamed: 1', 'Robel', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
     'Biomass', 'Unnamed: 7', 'Unnamed: 8', 'Woody plants', 'Unnamed: 10',
     'Unnamed: 11', 'Scat', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 21'],
    axis='columns',
)

In [363]:
# the row labelled 0 is a second header line, not data
df = df.drop(index=0)

In [364]:
# the first (unnamed) column holds the grid point identifier
df = df.rename({'Unnamed: 0': 'gridpt'}, axis='columns')

In [369]:
# coerce every column to a numeric dtype; entries that cannot be parsed become NaN
cols = df.columns
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [371]:
# total biomass per site = G (grass) + F (forbs); stored under the site number
sites = [110, 250, 350]
for site in sites:
    df[site] = df['{}G'.format(site)] + df['{}F'.format(site)]

In [373]:
# the per-type columns are no longer needed once the site totals exist
df = df.drop(columns=[str(site) + kind for site in sites for kind in ('G', 'F')])

In [375]:
# long format: one row per (gridpt, site) with the total in `value`
df = df.melt(id_vars=['gridpt'])

In [380]:
# Average the site totals at each gridpt. Only `value` is averaged: after
# melt() the `variable` column holds the site labels, and averaging those is
# meaningless (and version-dependent in pandas).
df = df.groupby('gridpt')[['value']].mean()

In [381]:
# move gridpt from the group index back into a regular column
df = df.reset_index()

In [383]:
# tag the sheet with its survey year and season
df = df.assign(year=2018, season='fall')

In [385]:
# Select the output columns in their final order and rename the averaged value.
# The rename is chained so it returns a new frame; calling rename(inplace=True)
# on a column subset can raise SettingWithCopyWarning.
df = df[['gridpt', 'year', 'season', 'value']].rename(columns={'value': 'pt_avg'})

In [387]:
# add the reshaped 2018 fall frame to the archive
# (re-running this cell appends a duplicate)
new_li.append(df)

### 2016 Spring

In [390]:
# the sampling date and free-text notes are not used in the aggregation
df = li[5].drop(['Date', 'Notes'], axis='columns')

In [392]:
# ensure weight values are all numeric; entries that cannot be parsed become NaN
# (pd.to_numeric is applied to the whole column at once rather than
# element by element through Series.apply)
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce')

In [395]:
# sum forb and grass weights for each sample site (plot + bearing)
# Only `Weight` is aggregated: summing any remaining non-weight columns is
# meaningless, and string columns make the later mean() fail on pandas >= 2.
# NOTE(review): sum() counts NaN weights as 0; pass min_count=1 if a site with
# no valid weight should stay NaN - confirm the intended treatment.
df = df.groupby(['Plot', 'Degrees'])[['Weight']].sum()

In [396]:
# average sample sites for each plot
# `Weight` is selected explicitly so non-numeric leftovers from earlier steps
# cannot break or pollute the mean.
df = df.groupby(['Plot'])[['Weight']].mean()

In [397]:
# tag the sheet with its survey year and season
df = df.assign(year=2016, season='spring')

In [399]:
# move Plot from the group index back into a regular column
df = df.reset_index()

In [400]:
# Select the output columns in their final order and rename them to the shared
# schema. The rename is chained so it returns a new frame; calling
# rename(inplace=True) on a column subset can raise SettingWithCopyWarning.
df = df[['Plot', 'year', 'season', 'Weight']].rename(
    columns={'Plot': 'gridpt', 'Weight': 'pt_avg'})

In [404]:
# Add the reshaped 2016 spring frame to the list of per-sheet frames
# (re-running this cell appends a duplicate)
new_li.append(df)

### 2010 - 2015

In [416]:
# load csv from 2010 - 2015 sheet
# (read directly by path here rather than taken from `li` by position)
src = '../src/biomass/MPG vegetation biomass 2010-2018 - 2010-2015.csv'
df = pd.read_csv(src)

In [417]:
# long format: the former column label goes to `variable`, its cell to `value`
df = df.melt(id_vars=['GridPt'])

In [418]:
# initialize year column with integer
# 0 is a placeholder; the year-populating loop overwrites it wherever the
# column label contains a two-digit year from 10 to 15
df['year'] = 0

In [419]:
# derive the season from the original column label: "S" -> spring, "F" -> fall
for marker, season_name in (("S", "spring"), ("F", "fall")):
    df.loc[df['variable'].str.contains(marker), 'season'] = season_name

In [420]:
# derive the year from the two-digit year embedded in the original column label
for x in range(10, 16):
    label_has_year = df['variable'].str.contains(str(x))
    df.loc[label_has_year, 'year'] = 2000 + x

In [421]:
# change column to all numeric values; entries that cannot be parsed become NaN
# (pd.to_numeric is applied to the whole column at once rather than
# element by element through Series.apply)
df['value'] = pd.to_numeric(df['value'], errors='coerce')

In [422]:
# keep only the key columns and the measurement, in output order
df = df.loc[:, ['GridPt', 'year', 'season', 'value']]

In [423]:
# average the samples of each GridPt per year and season, then flatten the
# group index back into columns and order the rows by year and grid point
df = (
    df.groupby(['GridPt', 'year', 'season'])
    .mean()
    .reset_index()
    .sort_values(['year', 'GridPt'])
)

In [424]:
# format columns to match the other sheets: gridpt, year, season, pt_avg
# The rename is chained so it returns a new frame; calling rename(inplace=True)
# on the result of a column selection can raise SettingWithCopyWarning.
df = df[['GridPt', 'year', 'season', 'value']].rename(
    columns={'GridPt': 'gridpt', 'value': 'pt_avg'})

In [431]:
# Add the reshaped 2010-2015 frame to the list of per-sheet frames
# (re-running this cell appends a duplicate)
new_li.append(df)

### Concatenate DataFrame array
* combine sheets from all years

In [446]:
# combine all reshaped sheets into one table; ignore_index discards the
# per-sheet row labels and numbers the rows 0..n-1
big_frame = pd.concat(new_li, ignore_index=True)

In [452]:
# order rows chronologically, then by grid point (season is not part of the sort key)
big_frame = big_frame.sort_values(['year', 'gridpt'])

# Output

In [460]:
# write the full combined table without the row index; rows with missing values are kept
output = '../output/biomass/MPG vegetation biomass 2010-2018_xyz.csv'
big_frame.to_csv(output, index=False)

In [622]:
# write a second copy that excludes every row containing a missing value
output = '../output/biomass/MPG vegetation biomass 2010-2018_dropna.csv'
big_frame.dropna().to_csv(output, index=False)

# Explore all years

In [570]:
# explore on a copy so big_frame itself stays untouched
df = big_frame.copy()

### How many years of records?

In [575]:
# number of distinct survey years in the combined table
print(df['year'].nunique(dropna=False), 'years')

9 years


### Which years are included?

In [577]:
# distinct years in order of first appearance
df['year'].drop_duplicates().tolist()

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

### How many non-null records are there for each season and year?

In [596]:
# report the number of complete (no missing value) records per season for each year
for x in range(2010, 2019):
    in_year = df[df['year'] == x].dropna()
    print('***', str(x), '***')
    print('spring:', len(in_year[in_year['season'] == 'spring']))
    print('fall  :', len(in_year[in_year['season'] == 'fall']))
    print('total :', len(in_year))
    print('\n')

*** 2010 ***
spring: 0
fall  : 227
total : 227


*** 2011 ***
spring: 69
fall  : 123
total : 192


*** 2012 ***
spring: 148
fall  : 148
total : 296


*** 2013 ***
spring: 128
fall  : 133
total : 261


*** 2014 ***
spring: 32
fall  : 63
total : 95


*** 2015 ***
spring: 114
fall  : 119
total : 233


*** 2016 ***
spring: 134
fall  : 133
total : 267


*** 2017 ***
spring: 135
fall  : 113
total : 248


*** 2018 ***
spring: 114
fall  : 47
total : 161




### Which points have sample data for the full history?
* restrict to fall samples, because 2010 contains no spring data

In [610]:
# one list per year (2010-2018) of the grid points with a complete fall record
fall_complete = df[df['season'] == 'fall'].dropna()
unique_pts = [
    fall_complete.loc[fall_complete['year'] == year, 'gridpt'].unique().tolist()
    for year in range(2010, 2019)
]

In [611]:
# find common points between all years: fold np.intersect1d across the
# per-year lists so only points present in every list remain
shared_pts = reduce(np.intersect1d, unique_pts)

In [618]:
# grid points with a complete fall record in every year
shared_pts

array([ 71,  86,  87, 107, 108, 121, 189, 199, 225, 240, 241, 243, 245,
       304, 328])

In [619]:
# how many grid points have the full fall history
len(shared_pts)

15

# Questions
* How many points were sampled each year?
* Do we need to find a set of points which were sampled every year?
* Does it make sense to calculate only by season?