<img src="images/panorama-landscape-saguaro-national-park-arizona.jpg">

<img src="images/Dirty data.png">


# Preparation of saguaro data for growth rate analysis

### Import Saguaro Inventory dataset  obtained from Desert Botanical Garden
* Download requested from https://livingcollections.org/dbg/Home.aspx on 10/18/20
* Full history of inventory and measurement data for all saguaros alive and dead.csv

### Clean up and reformat data

### Derive data needed for subsequent analysis

### Export datasets to csv

In [35]:
#Importing necessary dependencies 

import os #module that provides functions for interacting with the operating system  

import pandas as pd #module that allows you to parse and manipulate a data set

import numpy as np #module that is used to process arrays 





#Function that converts inches/centimeters to feet 
def conversion(x):
    if x[3] == "inches":
        return round(float(x[2]) / 12,2)
    elif x[3] == "feet":
        return round(float(x[2]),2)
    elif x[3] =="centimeters": 
        return round(float(x[2]) / 30.4,2)
    

    

        

In [36]:
#importing primary Saguaro Data Set 
full_data = "../Original datafiles/Full history of inventory and measurement data for all saguaros alive and dead.csv"


csv_path = "../Original datafiles/85008_history.csv"
#converting file into a dataframe 
#The file had some special characters so I researched an alternative to utf-8 encoding 
#CP1252 is a windows encoding package that is designed to read through these special characters 
full_df = pd.read_csv(full_data,encoding='utf-8')

full_df


Unnamed: 0,Accession Planting,Inventory Date,Status,Condition,Measurement Date,Height,Height Unit,DBH,DBH Unit,Measurement Note
0,2020-0589-10-0,9/17/2020 0:00,Dormant,Good,,,,,,
1,2020-0589-10-0,9/18/2020 17:27,Dormant,Good,,,,,,
2,2020-0588-10-0,9/17/2020 0:00,Dormant,Good,,,,,,
3,2020-0588-10-0,9/18/2020 17:27,Dormant,Good,,,,,,
4,2020-0587-10-0,9/17/2020 0:00,Dormant,Good,,,,,,
...,...,...,...,...,...,...,...,...,...,...
13850,1947-1548-01-1,6/17/2009 0:00,Alive,,,,,,,
13851,1947-1548-01-1,7/9/2012 0:00,Alive,,,,,,,
13852,1947-1548-01-1,11/28/2012 12:16,Alive,,,,,,,
13853,1947-1548-01-1,12/17/2012 0:00,Dead,,,,,,,


In [37]:
#printing out info to see what what variable types I have in the data set 
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13855 entries, 0 to 13854
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Accession Planting  13855 non-null  object 
 1   Inventory Date      10410 non-null  object 
 2   Status              10410 non-null  object 
 3   Condition           8595 non-null   object 
 4   Measurement Date    3492 non-null   object 
 5   Height              3485 non-null   object 
 6   Height Unit         3445 non-null   object 
 7   DBH                 983 non-null    float64
 8   DBH Unit            3445 non-null   object 
 9   Measurement Note    3369 non-null   object 
dtypes: float64(1), object(9)
memory usage: 1.1+ MB


In [38]:
#Getting an detailed output of what measurement metrics were used 
full_df['Height Unit'].value_counts()

feet           2889
inches          476
centimeters      80
Name: Height Unit, dtype: int64

In [39]:
#I want to isolate the Measurement demographics for each saguaro, so I selected only the specific columns I want to work with 
meas_df = full_df[['Accession Planting','Measurement Date','Height','Height Unit','DBH','DBH Unit','Measurement Note']]

#Creating new data frame for the measurement information
meas_df = pd.DataFrame(meas_df)

#Displaying the data set
meas_df

Unnamed: 0,Accession Planting,Measurement Date,Height,Height Unit,DBH,DBH Unit,Measurement Note
0,2020-0589-10-0,,,,,,
1,2020-0589-10-0,,,,,,
2,2020-0588-10-0,,,,,,
3,2020-0588-10-0,,,,,,
4,2020-0587-10-0,,,,,,
...,...,...,...,...,...,...,...
13850,1947-1548-01-1,,,,,,
13851,1947-1548-01-1,,,,,,
13852,1947-1548-01-1,,,,,,
13853,1947-1548-01-1,,,,,,


In [40]:
#Printing the values out in Measurment Date

#Reason for doing this was because the str.extract function returned an string error. 

#This was due to special characters nested throughout the columns index. 
u = meas_df["Measurement Date"]

#Feel free to put in the print statement to watch the code run. I removed it from the script so that would 
#remove the need to exxcessively scroll through the notebook. 
for x in u: 
    u

In [41]:
#Once the cell above was ran, I found that the following special strings and characters were appended to the columns inex. 

#To remove the values I set the data frame = to a function that did not contain those values

#This returned cleaned up date values for the Measurement Date column 
meas_df = meas_df[meas_df['Measurement Date'] != " dl\""]

meas_df = meas_df[meas_df['Measurement Date'] != " VN"]

meas_df = meas_df[meas_df['Measurement Date'] != " but it has an old tag as \"199202140110\". dl\""]

meas_df = meas_df[meas_df['Measurement Date'] != "nan"]

#Displaying the data set 
meas_df

Unnamed: 0,Accession Planting,Measurement Date,Height,Height Unit,DBH,DBH Unit,Measurement Note
0,2020-0589-10-0,,,,,,
1,2020-0589-10-0,,,,,,
2,2020-0588-10-0,,,,,,
3,2020-0588-10-0,,,,,,
4,2020-0587-10-0,,,,,,
...,...,...,...,...,...,...,...
13850,1947-1548-01-1,,,,,,
13851,1947-1548-01-1,,,,,,
13852,1947-1548-01-1,,,,,,
13853,1947-1548-01-1,,,,,,


In [42]:
#I had trouble trying to find the right approach to extracting specific data from a column

#Zach helped out with this one, and introduced me to a snippet of a regular expression text extraction. 

#This let extract the Measurement Year of each Saguaro ID. 

#regex is looking for anything that starts with 20 then \d\d

#looking for a four digit that starts with 20 
meas_df['Measurement Year']=meas_df['Measurement Date'].str.extract(r'(20\d\d)')

meas_df

Unnamed: 0,Accession Planting,Measurement Date,Height,Height Unit,DBH,DBH Unit,Measurement Note,Measurement Year
0,2020-0589-10-0,,,,,,,
1,2020-0589-10-0,,,,,,,
2,2020-0588-10-0,,,,,,,
3,2020-0588-10-0,,,,,,,
4,2020-0587-10-0,,,,,,,
...,...,...,...,...,...,...,...,...
13850,1947-1548-01-1,,,,,,,
13851,1947-1548-01-1,,,,,,,
13852,1947-1548-01-1,,,,,,,
13853,1947-1548-01-1,,,,,,,


In [43]:
#Renaming my Accession planting column to Saguaro ID 
meas_df = meas_df.rename(columns={'Accession Planting':'Saguaro ID'})

In [44]:
#Dropping unwanted columns from the data frame
meas_df = meas_df.drop(columns=['DBH','DBH Unit'])

In [45]:
#Getting value counts for measurements that were taken for each year. 

#We found that not all Saguaros received measurements on a consistent year to year basis, so the group decided 

#to only take into account years with the highest measurement counts. 

#Another reason for this, is in order for the data to be statistically significant we needed to have a saguaros measurement record 

#for all years that we are using in the data set. 
meas_df['Measurement Year'].value_counts()

2020    1072
2015     991
2013     617
2012     344
2019     189
2018     133
2014      47
2017      44
2003       2
2016       1
2002       1
Name: Measurement Year, dtype: int64

In [46]:
#Changing the Height variable type to a float and checking it's dtype. 

#You can't run functions on strings, so I had to make sure it was a float variable. 
meas_df['Height'] =meas_df['Height'].astype(float)

#Making sure the dtype changed to a float variable 
meas_df.dtypes

Saguaro ID           object
Measurement Date     object
Height              float64
Height Unit          object
Measurement Note     object
Measurement Year     object
dtype: object

In [47]:
#Only displaying information in the data set that was recorded after 2012 
meas_df = meas_df.loc[meas_df['Measurement Year'] > "2012"]

meas_df

Unnamed: 0,Saguaro ID,Measurement Date,Height,Height Unit,Measurement Note,Measurement Year
6,2020-0150-01-8,9/3/2020 0:00,2.50,inches,,2020
9,2020-0150-01-7,3/27/2020 0:00,5.00,inches,,2020
12,2020-0150-01-6,2/26/2020 13:45,0.90,feet,"Measured height during Saguaro Inventory 2020,...",2020
13,2020-0150-01-6,2/19/2020 0:00,10.00,inches,Saguaro inventory 2020,2020
16,2020-0150-01-5,2/26/2020 13:45,0.20,feet,"Measured height during Saguaro Inventory 2020,...",2020
...,...,...,...,...,...,...
13809,1966-8304-01-1,2/25/2015 0:00,28.95,feet,2 arms. 0 buds.,2015
13810,1966-8304-01-1,3/30/2013 15:31,28.15,feet,4 arms. ahp,2013
13831,1966-8295-01-1,2/26/2020 13:53,20.30,feet,"Measured height during Saguaro Inventory 2020,...",2020
13832,1966-8295-01-1,2/23/2015 0:00,19.70,feet,5 arms. 1 bud.,2015


In [48]:
#Calling out the Function Definition that was shown at the beginnning of the script

#It locates and identifies if the string value in the Height Unit columns displays "feet","inches", or "centimeters". 

#Depending on the string value, it converts by divided the value by 12 for inches, dividing by 30.4 for centimeters, 

#and does nothing for strings values labeled feet. 
meas_df['Units in Feet'] = meas_df.apply(conversion,axis=1)

meas_df

Unnamed: 0,Saguaro ID,Measurement Date,Height,Height Unit,Measurement Note,Measurement Year,Units in Feet
6,2020-0150-01-8,9/3/2020 0:00,2.50,inches,,2020,0.21
9,2020-0150-01-7,3/27/2020 0:00,5.00,inches,,2020,0.42
12,2020-0150-01-6,2/26/2020 13:45,0.90,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.90
13,2020-0150-01-6,2/19/2020 0:00,10.00,inches,Saguaro inventory 2020,2020,0.83
16,2020-0150-01-5,2/26/2020 13:45,0.20,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.20
...,...,...,...,...,...,...,...
13809,1966-8304-01-1,2/25/2015 0:00,28.95,feet,2 arms. 0 buds.,2015,28.95
13810,1966-8304-01-1,3/30/2013 15:31,28.15,feet,4 arms. ahp,2013,28.15
13831,1966-8295-01-1,2/26/2020 13:53,20.30,feet,"Measured height during Saguaro Inventory 2020,...",2020,20.30
13832,1966-8295-01-1,2/23/2015 0:00,19.70,feet,5 arms. 1 bud.,2015,19.70


In [49]:
#Creating new columns for measurement units in feet and appending them to their corresponding year
growth_2013 = meas_df.loc[meas_df['Measurement Year'] == "2013",'Units in Feet']

growth_2015 = meas_df.loc[meas_df['Measurement Year'] == "2015",'Units in Feet']

growth_2020 = meas_df.loc[meas_df['Measurement Year'] == "2020",'Units in Feet']

#Creating columns for these values 
meas_df['2013 Measurement'] = growth_2013

meas_df['2015 Measurement'] = growth_2015

meas_df['2020 Measurement'] = growth_2020

#Placing the newly created columns into the dataframe 
meas_df = pd.DataFrame(meas_df)

#displaying the dataframe 
meas_df

Unnamed: 0,Saguaro ID,Measurement Date,Height,Height Unit,Measurement Note,Measurement Year,Units in Feet,2013 Measurement,2015 Measurement,2020 Measurement
6,2020-0150-01-8,9/3/2020 0:00,2.50,inches,,2020,0.21,,,0.21
9,2020-0150-01-7,3/27/2020 0:00,5.00,inches,,2020,0.42,,,0.42
12,2020-0150-01-6,2/26/2020 13:45,0.90,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.90,,,0.90
13,2020-0150-01-6,2/19/2020 0:00,10.00,inches,Saguaro inventory 2020,2020,0.83,,,0.83
16,2020-0150-01-5,2/26/2020 13:45,0.20,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.20,,,0.20
...,...,...,...,...,...,...,...,...,...,...
13809,1966-8304-01-1,2/25/2015 0:00,28.95,feet,2 arms. 0 buds.,2015,28.95,,28.95,
13810,1966-8304-01-1,3/30/2013 15:31,28.15,feet,4 arms. ahp,2013,28.15,28.15,,
13831,1966-8295-01-1,2/26/2020 13:53,20.30,feet,"Measured height during Saguaro Inventory 2020,...",2020,20.30,,,20.30
13832,1966-8295-01-1,2/23/2015 0:00,19.70,feet,5 arms. 1 bud.,2015,19.70,,19.70,


In [50]:
#In order to filter the data set by year, I placed the desired years into a list and used the isin function. 
filter_rec = ["2013","2015","2020"]

#using the isin list function to filter out the data that is in the list created above 
filter_year = meas_df[meas_df['Measurement Year'].isin(filter_rec)]

filter_year = pd.DataFrame(filter_year)

#sorting the values by Saguaro ID 
filter_year.sort_values('Saguaro ID',ascending=False)



Unnamed: 0,Saguaro ID,Measurement Date,Height,Height Unit,Measurement Note,Measurement Year,Units in Feet,2013 Measurement,2015 Measurement,2020 Measurement
6,2020-0150-01-8,9/3/2020 0:00,2.50,inches,,2020,0.21,,,0.21
9,2020-0150-01-7,3/27/2020 0:00,5.00,inches,,2020,0.42,,,0.42
12,2020-0150-01-6,2/26/2020 13:45,0.90,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.90,,,0.90
13,2020-0150-01-6,2/19/2020 0:00,10.00,inches,Saguaro inventory 2020,2020,0.83,,,0.83
16,2020-0150-01-5,2/26/2020 13:45,0.20,feet,"Measured height during Saguaro Inventory 2020,...",2020,0.20,,,0.20
...,...,...,...,...,...,...,...,...,...,...
13790,1970-1021-01-1,1/9/2013 16:43,19.90,feet,6 arms. 2 buds. sg&cs,2013,19.90,19.90,,
13809,1966-8304-01-1,2/25/2015 0:00,28.95,feet,2 arms. 0 buds.,2015,28.95,,28.95,
13810,1966-8304-01-1,3/30/2013 15:31,28.15,feet,4 arms. ahp,2013,28.15,28.15,,
13831,1966-8295-01-1,2/26/2020 13:53,20.30,feet,"Measured height during Saguaro Inventory 2020,...",2020,20.30,,,20.30


In [51]:
#This part was a little tedious. In order to not lose any values within the data set, I had to filter the Saguaro ID by measurement 

#year. Since there was a chance that the Saguaro was measured twice in the same year, I used the last() function, so the data 

#set will only contain the most recent measurement of the Saguaro ID in that year. 
dup_list1 = ["2013"]

filt_dup1 = filter_year[filter_year['Measurement Year'].isin(dup_list1)]

filt_dup1.sort_values('Saguaro ID',ascending=False)

filt_dup1_group = filt_dup1.groupby(['Saguaro ID','Measurement Year']).last()['2013 Measurement']

#I am resetting the index so that they all reside on the same level, so the values are easier to manipulate afterward. 
filt_dup1_group = filt_dup1_group.reset_index()

filt_dup1_group = pd.DataFrame(filt_dup1_group)

filt_dup1_group

Unnamed: 0,Saguaro ID,Measurement Year,2013 Measurement
0,1966-8304-01-1,2013,28.15
1,1970-1021-01-1,2013,19.90
2,1970-9909-01-1,2013,23.70
3,1971-0130-01-1,2013,6.10
4,1972-0180-01-1,2013,5.80
...,...,...,...
606,2013-0266-01-6,2013,4.80
607,2013-0266-01-7,2013,5.70
608,2013-0266-01-8,2013,4.10
609,2013-0266-01-9,2013,17.90


In [52]:
#Same instance but for 2015 
dup_list2 = ["2015"]

filt_dup2 = filter_year[filter_year['Measurement Year'].isin(dup_list2)]

filt_dup2.sort_values('Saguaro ID',ascending=False)

filt_dup2_group = filt_dup2.groupby(['Saguaro ID','Measurement Year']).last()['2015 Measurement']

filt_dup2_group = filt_dup2_group.reset_index()

filt_dup2_group = pd.DataFrame(filt_dup2_group)

filt_dup2_group

Unnamed: 0,Saguaro ID,Measurement Year,2015 Measurement
0,1966-8295-01-1,2015,19.70
1,1966-8304-01-1,2015,28.95
2,1970-1021-01-1,2015,21.50
3,1970-9909-01-1,2015,24.95
4,1971-0130-01-1,2015,6.90
...,...,...,...
981,2015-0151-01-9,2015,0.67
982,2015-0152-01-1,2015,8.60
983,2015-0152-01-2,2015,5.71
984,2015-0152-01-3,2015,1.92


In [53]:
#Same instance but for 2020 
dup_list3 = ["2020"]

filt_dup3 = filter_year[filter_year['Measurement Year'].isin(dup_list3)]

filt_dup3.sort_values('Saguaro ID',ascending=False)

filt_dup3_group = filt_dup3.groupby(['Saguaro ID','Measurement Year']).last()['2020 Measurement']

filt_dup3_group = filt_dup3_group.reset_index()

filt_dup3_group = pd.DataFrame(filt_dup3_group)

filt_dup3_group

Unnamed: 0,Saguaro ID,Measurement Year,2020 Measurement
0,1966-8295-01-1,2020,20.30
1,1970-1021-01-1,2020,22.30
2,1970-9909-01-1,2020,25.10
3,1971-0130-01-1,2020,8.60
4,1976-0079-01-1,2020,17.50
...,...,...,...
1061,2020-0150-01-4,2020,0.17
1062,2020-0150-01-5,2020,0.20
1063,2020-0150-01-6,2020,0.83
1064,2020-0150-01-7,2020,0.42


In [54]:
#using the clean dataframe without duplicate ID measurements from the same year 
clean_list = [filt_dup1_group, filt_dup2_group, filt_dup3_group]


#concatenating these frames 
clean_frame = pd.concat(clean_list)

clean_frame


Unnamed: 0,Saguaro ID,Measurement Year,2013 Measurement,2015 Measurement,2020 Measurement
0,1966-8304-01-1,2013,28.15,,
1,1970-1021-01-1,2013,19.90,,
2,1970-9909-01-1,2013,23.70,,
3,1971-0130-01-1,2013,6.10,,
4,1972-0180-01-1,2013,5.80,,
...,...,...,...,...,...
1061,2020-0150-01-4,2020,,,0.17
1062,2020-0150-01-5,2020,,,0.20
1063,2020-0150-01-6,2020,,,0.83
1064,2020-0150-01-7,2020,,,0.42


In [55]:
#Grouping by the measurement year and grabbing the sum so that the groupby method is initiated

#no values were altered by this. In the cell above we can see that Saguaro ID 1966-8304-01-1 shows only one measurement 

#for 2013, while 2015 and 2020 are left with unkown variables. 

#The concatenated function stacks the values on top of one anther by column name so it didn't place the Saguaro ID values in the 

#same row. 
clean_group = clean_frame.groupby('Saguaro ID')[['2013 Measurement','2015 Measurement','2020 Measurement']].sum()

clean_group = clean_group.reset_index()

clean_group_df = pd.DataFrame(clean_group)

clean_group_df

Unnamed: 0,Saguaro ID,2013 Measurement,2015 Measurement,2020 Measurement
0,1966-8295-01-1,0.00,19.70,20.30
1,1966-8304-01-1,28.15,28.95,0.00
2,1970-1021-01-1,19.90,21.50,22.30
3,1970-9909-01-1,23.70,24.95,25.10
4,1971-0130-01-1,6.10,6.90,8.60
...,...,...,...,...
1161,2020-0150-01-4,0.00,0.00,0.17
1162,2020-0150-01-5,0.00,0.00,0.20
1163,2020-0150-01-6,0.00,0.00,0.83
1164,2020-0150-01-7,0.00,0.00,0.42


In [56]:
#removing any Saguaor that was not measured all three years, or had a zero value for any column in the data set 
timespan = clean_group_df[(clean_group_df['2013 Measurement'] != 0) & (clean_group_df['2015 Measurement'] != 0) & (clean_group['2020 Measurement'] !=0)]

timespan = pd.DataFrame(timespan)

timespan.to_csv("../Data ready for analyses/Measurements_2013_2015_2020.csv")

timespan

Unnamed: 0,Saguaro ID,2013 Measurement,2015 Measurement,2020 Measurement
2,1970-1021-01-1,19.9,21.50,22.3
3,1970-9909-01-1,23.7,24.95,25.1
4,1971-0130-01-1,6.1,6.90,8.6
6,1976-0079-01-1,16.8,16.90,17.5
7,1976-0235-01-1,13.7,14.00,14.7
...,...,...,...,...
955,2013-0266-01-4,17.8,17.80,18.3
956,2013-0266-01-5,5.0,5.00,5.9
957,2013-0266-01-6,4.8,5.50,7.2
958,2013-0266-01-7,5.7,5.70,6.3


In [57]:
# Checking for outliers.
print("2013 max height: " + str(timespan["2013 Measurement"].max()))
print("2015 max height: " + str(timespan["2015 Measurement"].max()))
print("2020 max height: " + str(timespan["2020 Measurement"].max()) + " Outlier!")
print("2013 min height: " + str(timespan["2013 Measurement"].min()))
print("2015 min height: " + str(timespan["2015 Measurement"].min()))
print("2020 min height: " + str(timespan["2020 Measurement"].min()))

2013 max height: 25.9
2015 max height: 25.95
2020 max height: 159.0 Outlier!
2013 min height: 0.25
2015 min height: 0.38
2020 min height: 0.6


In [58]:
# Find the accession number of the saguaro with the outlier measurement.

timespan.loc[timespan["2020 Measurement"] == 159]

Unnamed: 0,Saguaro ID,2013 Measurement,2015 Measurement,2020 Measurement
692,2012-0313-01-13,14.7,15.2,159.0


In [63]:
# After reviewing the photo of saguaro #2012-0313-01-13 in livingcollections.org, the saguaro is clearly between 10-20 feet tall.
# Measurements in 2020 were all taken in feet, therefore we assume there is a typo in the measurement and it should be 15.9 feet.
# Fixing the height typo.

timespan.at['2020 Measurement', '742'] = 15.9

timespan.loc[timespan["2020 Measurement"] == 15.9]

Unnamed: 0,Saguaro ID,2013 Measurement,2015 Measurement,2020 Measurement,742
742,2012-0314-01-13,13.5,15.0,15.9,


In [64]:
# Checking for outliers again, looks good this time.
print("2013 max height: " + str(timespan["2013 Measurement"].max()))
print("2015 max height: " + str(timespan["2015 Measurement"].max()))
print("2020 max height: " + str(timespan["2020 Measurement"].max()))
print("2013 min height: " + str(timespan["2013 Measurement"].min()))
print("2015 min height: " + str(timespan["2015 Measurement"].min()))
print("2020 min height: " + str(timespan["2020 Measurement"].min()))

2013 max height: 25.9
2015 max height: 25.95
2020 max height: 159.0
2013 min height: 0.25
2015 min height: 0.38
2020 min height: 0.6


In [23]:
#Calculating the average growth rate for the Saguaros obtained after the data cleaning 
clean_group_df1 = timespan

#Set variables equal to each column, and proceeded to run a simple calculation on them 
thirteen = timespan['2013 Measurement']

fifteen = timespan['2015 Measurement']

twenty = timespan['2020 Measurement']

clean_group_df1['2013 - 2015 Avg. Growth Rate'] = (fifteen - thirteen) / 2

clean_group_df1['2015 - 2020 Avg. Growth Rate'] = (twenty - fifteen) / 5

clean_group_df1

Unnamed: 0,Saguaro ID,2013 Measurement,2015 Measurement,2020 Measurement,2013 - 2015 Avg. Growth Rate,2015 - 2020 Avg. Growth Rate
2,1970-1021-01-1,19.9,21.50,22.3,0.800,0.16
3,1970-9909-01-1,23.7,24.95,25.1,0.625,0.03
4,1971-0130-01-1,6.1,6.90,8.6,0.400,0.34
6,1976-0079-01-1,16.8,16.90,17.5,0.050,0.12
7,1976-0235-01-1,13.7,14.00,14.7,0.150,0.14
...,...,...,...,...,...,...
955,2013-0266-01-4,17.8,17.80,18.3,0.000,0.10
956,2013-0266-01-5,5.0,5.00,5.9,0.000,0.18
957,2013-0266-01-6,4.8,5.50,7.2,0.350,0.34
958,2013-0266-01-7,5.7,5.70,6.3,0.000,0.12


In [31]:
#Getting the aggregated growth rate of all Saguaros for each timespan. 
timespan1 = round(clean_group_df1['2013 - 2015 Avg. Growth Rate'].sum() / 525,3)


timespan2 = round(clean_group_df1['2015 - 2020 Avg. Growth Rate'].sum() / 525,3)

print(timespan2, timespan1)



0.294 0.322


In [32]:
#Getting the statistical values of mean, median, var, and standard deviation for each year. 
stats_2013 = filt_dup1_group.groupby(['Measurement Year']).agg({"2013 Measurement":
                                                              [np.mean,
                                                              np.median,
                                                              np.var,
                                                              np.std]})

stats_2013.columns = stats_2013.columns.droplevel()

stats_2013 = stats_2013.reset_index()

stats_2013


Unnamed: 0,Measurement Year,mean,median,var,std
0,2013,11.159525,11.95,42.509368,6.519921


In [33]:
#Same instance for growth rates statistics in 2015 
stats_2015 = filt_dup2_group.groupby(['Measurement Year']).agg({"2015 Measurement":
                                                              [np.mean,
                                                              np.median,
                                                              np.var,
                                                              np.std]})

stats_2015.columns = stats_2015.columns.droplevel()

stats_2015 = stats_2015.reset_index()

stats_2015

Unnamed: 0,Measurement Year,mean,median,var,std
0,2015,11.61069,13.0,48.084281,6.934283


In [34]:
#Same instance for growth rates statistics in 2020
stats_2020 = filt_dup3_group.groupby(['Measurement Year']).agg({"2020 Measurement":
                                                              [np.mean,
                                                              np.median,
                                                              np.var,
                                                              np.std]})

stats_2020.columns = stats_2020.columns.droplevel()

stats_2020 = stats_2020.reset_index()

stats_2020

Unnamed: 0,Measurement Year,mean,median,var,std
0,2020,11.572336,12.7,73.757426,8.588214


In [35]:
#using the clean dataframe without duplicate ID measurements from the same year 
stat_list = [stats_2013, stats_2015, stats_2020]


#concatenating these frames 
stat_frame = pd.concat(stat_list)

stat_frame

Unnamed: 0,Measurement Year,mean,median,var,std
0,2013,11.159525,11.95,42.509368,6.519921
0,2015,11.61069,13.0,48.084281,6.934283
0,2020,11.572336,12.7,73.757426,8.588214


In [36]:
#Getting the average growth rate into a data frame, ,and will also be used as a csv export 
growth_rt = {'Timespan':['2013 - 2015','2015 - 2020'],'Average Yearly Growth Rate':[timespan1, timespan2]}

growth_rt = pd.DataFrame(growth_rt)

growth_rt

Unnamed: 0,Timespan,Average Yearly Growth Rate
0,2013 - 2015,0.322
1,2015 - 2020,0.294


In [37]:
#exporting growth rates to csv 
Growth_Rates = growth_rt.to_csv("../Data ready for analyses/GrowthRates_ready_for_analysis.csv",encoding='utf-8')


In [38]:
#exporting timespan measurements to csv 
Measurements = clean_group_df1.to_csv("../Data ready for analyses/Mortality_per_timespan_ready_for_analysis.csv",encoding='utf-8')