# Temp and Humidity Data Preparation
The following need to be done:
1. Set up the environment
2. Load the relevant data sets from file
 * `NestCharacteristic-Static.csv   -> df_nest_static`
 * `NestCharacteristic-Seasonal.csv -> df_nest_seasonal`
 * `BreedingDataCombined.csv        -> df_breeding`
 * `TempData_2_10_2016.txt          -> df_temp`
 * `HumidData_2_10_2016.txt         -> df_humd`
3. Join them into a single data set
4. Add additional computed features to the data
5. Write the prepared data to file

## 1. Set up the environment
### 1.1 Import the required libraries
We need a certain set of common libraries for the tasks to be performed. These are imported below. If an import statement errors, you will need to install the library in your environment using the command line command `pip install <library>`.

In [1]:
print('Setting up environment and variables.', flush=True)
import pandas as pd
import os
import numpy as np
import datetime
import time

Setting up environment and variables.


### 1.2 Set up the variables
You will need to change the values of the variables below to suit the names and directory location of your files to be loaded.

In [80]:
# update these with your file paths
nest_static_file = os.path.normpath('../0_data/NestCharacteristic-Static.csv')
nest_seasonal_file = os.path.normpath('../0_data/NestCharacteristic-Seasonal.csv')
breeding_data_file = os.path.normpath('../0_data/BreedingDataCombined.csv')
temperature_file = os.path.normpath('../0_data/TempData_2_10_2016.txt')
humidity_file = os.path.normpath('../0_data/HumidData_2_10_2016.txt')
joined_data_file = os.path.normpath('TempHumdCombined.csv')

# write intermediate tables to disk for debugging purposes
write_temps = True
df_sensor_data = None

### 1.3 Define helper functions

In [54]:
def read_file_handler_start(file_path, file_description):
    '''
    Prints the user-friendly messages at the start of reading a data file to csv:
        '<timestamp> Loading the <file_description> data file.'
        '<timestamp> <file_description> file is <file size> MB.'
        '<timestamp> Loading into memory. Please be patient.'
    Variables:
        file_path: the location of the file to be read. Requires an os.path.normpath object.
        file_description: String description of what the file is / name of the file.
    '''
    print('{0} - Loading the {1} data file.'.format(str(time.ctime()), file_description), flush=True)

    file_size = os.path.getsize(file_path)
    print('{0} - {1} file is {2:.3f} MB.'.format(str(time.ctime()), file_description, (file_size/1000000)))

    if file_size > 5000000: # over 5mb
        print('{0} - Loading into memory. Please be patient.'.format(str(time.ctime())), flush=True)
    else:
        print('{0} - Loading into memory.'.format(str(time.ctime())), flush=True)

def read_file_handler_end(file_path, file_description, df, df_var_name):
    '''
    Prints the user-friendly messages at the end of reading a data file to csv:
        '<timestamp> Success: loaded <number of records> records.'
        On fail: 
            '<timestamp> ### FAILED! ###'
            '<timestamp> <df_var_name> was not created. Exiting.'
            Exits the script.
    Variables:
        file_path: the location of the file to be read. Requires an os.path.normpath object.
        file_description: String description of what the file is / name of the file.
        df: a Pandas dataframe that should have resulted from the read_csv() call.
        df_var_name: the variable name for the dataframe as a string.
    '''
    if df is not None:
        print('{0} - Success: loaded {1:,} records.'.format(str(time.ctime()), len(df)))
    else:
        print('{0} - ### FAILED! ###'.format(str(time.ctime())))
        print('{0} - {1} was not created. Exiting.'.format(str(time.ctime())), df_var_name)
        sys.exit(0)

def write_temp_file(df, filepath, df_name):
    '''
    If write_temps is true, this function will write the specified Pandas dataframe (df) to csv at the specified location (filepath).
    Variables:
        df: a Pandas dataframe to be written to csv.
        filepath: a string in Unix path format (using / not \) for the csv destination.
        df_name: human readable name or description of the dataframe for logging purposes.
    '''
    if write_temps:
        print('{0} - Writing intermediate table {1} to disk.'.format(str(time.ctime()), df_name, filepath), flush=True)
        df.to_csv(os.path.normpath(filepath))
        if os.path.getsize(filepath) > 0:
            print('{0} - Written {1}: {2:.3f} MB'.format(str(time.ctime()), filepath, os.path.getsize(filepath)/1000000), flush=True)

## 2. Load the data from file
### 2.1.1 Read in the NestCharacteristic-Static data (df_nest_static)
This is the real nest master data to which everything else is joined. Refer to the GitHub Wiki for descriptions of the data fields.

In [39]:
read_file_handler_start(nest_static_file, 'Nest Characteristic (Static)')
data_types = {'nest_id': str,
              'nest_type': str,
              'distance_to_boardwalk_m': np.float32,
              'distance_to_vegetation_m': np.float32,
              'distance_to_landfall': np.float32,
              'entrance_bearing': np.float32,
              'box_height_mm': np.float32,
              'box_length_mm': np.float32,
              'box_width_mm': np.float32,
              'box_wall_width_mm': np.float32,
              'box_lid_depth': np.float32,
              'internal_height_mm': np.float32,
              'internal_width_mm': np.float32,
              'internal_length_mm': np.float32,
              'entrance_height': np.float32,
              'entrance_width': np.float32,
              'entrance_length': np.float32,
              'vents': np.float32,
              'box_has_tunnel': np.float32,
              'shape': str,
              'elevation': np.float32,
              'easting': np.float32,
              'northing': np.float32,
              'aspect': np.float32,
              'slope': np.float32,
              'duration_of_insolation': np.float32,
              'comment': str}
df_nest_static = pd.read_csv(nest_static_file, 
                             header=0,
                             dtype=data_types,
                             encoding='utf-8',
                             error_bad_lines=True,
                             warn_bad_lines=True)
read_file_handler_end(nest_static_file, 'Nest Characteristic (Static)', df_nest_static, 'df_nest_static')

Tue Jan 24 10:39:04 2017 - Loading the Nest Characteristic (Static) data file.
Tue Jan 24 10:39:04 2017 - Nest Characteristic (Static) file is 0.032 MB.
Tue Jan 24 10:39:04 2017 - Loading into memory.
Tue Jan 24 10:39:04 2017 - Success: loaded 241 records.


### 2.1.1 Update and cleanse fields in NestCharacteristic-Static data (df_nest_static)
* Make sure all the nest IDs are uppercase and trimmed
* Create field `box_vol_L`
* Create field `box_area_cm2`

In [16]:
# make sure the nest IDs are all caps
df_nest_static['nest_id'] = df_nest_static['nest_id'].apply(lambda x: x.strip()).apply(lambda x: x.upper())

# calc the volume
# some boxes have only external measurements, not internal (which we are trying to calc). If it has external
# but not internal, then use external measurements
def box_vol_L(row):
    if not (np.isnan(row['internal_length_mm']) or 
            np.isnan(row['internal_width_mm']) or 
            np.isnan(row['internal_height_mm'])
           ):
        # has all internal measurements
        result = row['internal_length_mm'] * row['internal_width_mm'] * row['internal_height_mm'] / 1000000
    else:
        # has >0 missing internal measurements, so we'll use the external measurements.
        # if we don't have all of those we'll get a NaN anyway.
        result = row['box_length_mm'] * row['box_width_mm'] * row['box_height_mm'] / 1000000
    return result

df_nest_static['box_vol_L'] = df_nest_static.apply(box_vol_L, axis=1)
        
# calc the floor area
def box_area_cm2(row):
    if not (np.isnan(row['internal_length_mm']) or np.isnan(row['internal_width_mm'])):
        # have all internal measurements, so use them
        result = row['internal_length_mm'] * row['internal_width_mm'] / 100
    else:
        # hope we have all external measurements, else we're getting a Nan anyway
        result = row['box_length_mm'] * row['box_width_mm'] / 100
    return result

df_nest_static['box_area_cm2'] = df_nest_static.apply(box_area_cm2, axis=1)

print(str(time.ctime()), 'df_nest_static prepared successfully.\n')

Tue Jan 24 09:34:38 2017 df_nest_static prepared successfully.



### 2.2.1 Read in the NestCharacteristic-Seasonal data (as df_nest_seasonal)
Recorded for old boxes and natural nests. Contains seasonal observations of nest vegetation and cover.
New boxes (not recorded) were an experiment in different building methods and their effect on box temperature. 

In [40]:
read_file_handler_start(nest_seasonal_file, 'Nest Characteristic (Seasonal)')

data_types = {'type': str,
              'nest_id': str,
              'BoxSeasYear': str,
              'date': str,
              'year': str,
              'season': str,
              'BoxCoverTotal': np.float32,
              'BoxCoverDead': np.float32,
              'BoxWood': np.float32,
              'BoxWoodDead': np.float32,
              'BoxVeg': np.float32,
              'BoxVegDead': np.float32,
              'QuadCoverTotal': np.float32,
              'QuadCoverDead': np.float32,
              'QuadWood': np.float32,
              'QuadWoodDead': np.float32,
              'QuadVeg': np.float32,
              'QuadVegDead': np.float32,
              # placeholder for the NestDomeCover field that is meant to exist but currently does not
              'comments': str
             }
df_nest_seasonal = pd.read_csv(nest_seasonal_file,
                               header=0,
                               dtype=data_types,
                               encoding='utf-8',
                               parse_dates=['date'],
                               dayfirst=True,
                               error_bad_lines=True,
                               warn_bad_lines=True)

read_file_handler_end(nest_seasonal_file, 'Nest Characteristic (Seasonal)', df_nest_seasonal, 'df_nest_seasonal')

Tue Jan 24 10:40:32 2017 - Loading the Nest Characteristic (Seasonal) data file.
Tue Jan 24 10:40:32 2017 - Nest Characteristic (Seasonal) file is 0.108 MB.
Tue Jan 24 10:40:32 2017 - Loading into memory.
Tue Jan 24 10:40:32 2017 - Success: loaded 1,711 records.


### 2.2.2 Update and cleanse fields in the NestCharacteristic-Seasonal data (df_nest_seasonal)
* Nest IDs to be all uppercase and trimmed
* recalculate the `year` and `season`
* create the unique ID `BoxSeasYear`

In [133]:
# make sure the nest IDs are all caps
df_nest_seasonal['nest_id'] = df_nest_seasonal['nest_id'].apply(lambda x: x.strip()).apply(lambda x: x.upper())

# recalculate year (because was manually created)
df_nest_seasonal['year'] = df_nest_seasonal['date'].apply(lambda x: x.year)

# recalculate season (because was manually created)
def season(date):
    if date.month >= 3 and date.month <= 5:
        return 'AUTUMN'
    elif date.month >= 6 and date.month <= 8:
        return 'WINTER'
    elif date.month >= 9 and date.month <= 11:
        return 'SPRING'
    elif (date.month >= 1 and date.month <= 2) or date.month == 12:
        return 'SUMMER'
    else:
        return None
    
df_nest_seasonal['season'] = df_nest_seasonal['date'].apply(lambda x: season(x))

# calc the unique ID
df_nest_seasonal['BoxSeasYear'] = df_nest_seasonal['nest_id'] + df_nest_seasonal['season'] + df_nest_seasonal['year'].apply(lambda x: str(x))

### 2.3.1 Read in the BreedingDataCombined file (as df_breeding)

In [136]:
read_file_handler_start(breeding_data_file, 'Breeding')
data_types = {'nest_id': str,
              'observation_date': str,
              'ActivityStatus': np.float32,
              'adult': np.float32,
              'clutch': np.float32,
              'eggs': np.float32,
              'ChicksAlive': np.float32,
              'ChicksDead': np.float32,
              'ChicksAge': np.float32,
              'ChicksFledge': np.float32,
              'ChicksMissing': np.float32,
              'ContentsNotVisible': np.float32,
              'EggLayDate': str,
              'IDChick1': np.float32,
              'MassChick1': np.float32,
              'IDChick2': np.float32,
              'MassChick2': np.float32,
              'comments': str
             }
df_breeding = pd.read_csv(breeding_data_file,
                          header=0, 
                          dtype=data_types,
                          encoding='utf-8',
                          parse_dates=['observation_date', 'EggLayDate'],
                          dayfirst=True,
                          error_bad_lines=True,
                          warn_bad_lines=True)
read_file_handler_end(breeding_data_file, 'Breeding', df_breeding, 'df_breeding')

Wed Jan 25 13:27:45 2017 - Loading the Breeding data file.
Wed Jan 25 13:27:45 2017 - Breeding file is 0.137 MB.
Wed Jan 25 13:27:45 2017 - Loading into memory.
Wed Jan 25 13:27:45 2017 - Success: loaded 3,575 records.


### 2.3.2 Update and cleanse fields in the Breeding data (df_breeding)
* Nest IDs to be all uppercase and trimmed
* `year` is year of `observation_date`

In [137]:
# make sure the nest IDs are all caps
df_breeding['nest_id'] = df_breeding['nest_id'].apply(lambda x: x.strip()).apply(lambda x: x.upper())

# create year field
df_breeding['year'] = df_breeding['observation_date'].apply(lambda x: x.year)

### 2.3.3 Aggregate the Breeding data to get annual stats
* **nest_id**
* **year**
* **clutch**
* clutch_count
* egg_count
* chick_count
* fletch_count
* lay_date
* age_at_fletching
* mass_at_fletching_chick1
* mass_at_fletching_chick2
* chick_id1
* chick_id2

Add field:
* `flag_activity_status`: True iff max(ActivityStatus) in year > 0. Note that ActivityStatus was not recorded for the numeric nest_ids, so this field should not be used for 'usage'.

In [138]:
print('{0} - Aggregating breeding data to get annual stats.'.format(str(time.ctime())), flush=True)

# get the clutches per nest and year
# [[chosen columns]] -> groupby -> apply max -> add suffix -> remove multi-index
df_clutch_count = df_breeding[['nest_id', 'year', 'clutch']
                             ].groupby(['nest_id', 'year']).max().add_suffix('_count').reset_index()

# get the annual stats per nest, year and clutch
temp = df_breeding[['nest_id', 'year', 'ActivityStatus', 'clutch', 'eggs', 'ChicksAlive', 'ChicksFledge', 'EggLayDate', 'ChicksAge', 'MassChick1', 'MassChick2', 'IDChick1', 'IDChick2']].copy()
df_breeding_gb = temp.groupby(['nest_id', 'year', 'clutch']).max().reset_index()
df_breeding_gb.rename(columns = {'eggs': 'egg_count', 'ChicksAlive': 'chick_count', 'ChicksFledge': 'fledge_count', 
                     'EggLayDate': 'lay_date', 'ChicksAge': 'age_at_fledging', 'MassChick1': 'mass_at_fletching_chick1', 
                     'MassChick2': 'mass_at_fletching_chick1', 'ActivityStatus': 'flag_activity_satus'}
          , inplace=True)
df_breeding_gb['flag_activity_satus'] = df_breeding_gb['flag_activity_satus'].apply(lambda x: x > 0)

write_temp_file(df_clutch_count, 'df_clutch_count.csv', 'df_clutch_count')
write_temp_file(df_breeding_gb, 'df_breeding_gb.csv', 'df_breeding_gb')
del temp

Wed Jan 25 13:28:01 2017 - Aggregating breeding data to get annual stats.
Wed Jan 25 13:28:01 2017 - Writing intermediate table df_clutch_count to disk.
Wed Jan 25 13:28:01 2017 - Written df_clutch_count.csv: 0.005 MB
Wed Jan 25 13:28:01 2017 - Writing intermediate table df_breeding_gb to disk.
Wed Jan 25 13:28:01 2017 - Written df_breeding_gb.csv: 0.015 MB


### 2.4.1 Load the temperature data
Read the temperature data file into memory and report on success/failure.
We maintain a shortcut: if the joined temp-humidity output file (csv) already exists then skip this step.

In [66]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), '- Combined temp and humidity file found. Skipping the temp data load.', flush=True)
else:
    read_file_handler_start(temperature_file, 'Temperature')
    column_names = ['recnum', 'datetime', 'temp_c', 'nest_id']
    data_types = {'recnum': np.int32, 
                  'datetime': str, 
                  'temp_c': np.float32, 
                  'nest_id': str}
    df_temp = pd.read_csv(temperature_file,
                          names=column_names,
                          usecols=[0,1,2,3],
                          dtype=data_types,
#                           nrows=10000,
                          parse_dates=['datetime'],
                          infer_datetime_format=True,
                          dayfirst=True,
                          encoding='utf-8',
                          error_bad_lines=True,
                          warn_bad_lines=True
                         )
    read_file_handler_end(temperature_file, 'Temperature', df_temp, 'df_temp')
    
    # make sure the nest IDs are all caps
    df_temp['nest_id'] = df_temp['nest_id'].apply(lambda x: x.strip()).apply(lambda x: x.upper())

Tue Jan 24 13:19:58 2017 - Loading the Temperature data file.
Tue Jan 24 13:19:58 2017 - Temperature file is 88.172 MB.
Tue Jan 24 13:19:58 2017 - Loading into memory. Please be patient.
Tue Jan 24 13:20:11 2017 - Success: loaded 2,169,903 records.


### 2.4.1 Load the humidity data
Read the humidity data file into memory and report on success/failure.
We maintain a shortcut: if the joined temp-humidity output file (csv) already exists then skip this step.

In [67]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), '- Combined temp and humidity file found. Skipping the humidity data load.', flush=True)
else:
    read_file_handler_start(humidity_file, 'Humidity')
    column_names = ['recnum', 'datetime', 'humidity', 'nest_id']
    data_types = {'recnum': np.int32, 
                  'datetime': str, 
                  'humidity': np.float32, 
                  'nest_id': str}
    df_humd = pd.read_csv(humidity_file,
                          names=column_names,
                          usecols=[0,1,2,3],
                          dtype=data_types,
#                          nrows=10000,               # for testing only
                          parse_dates=['datetime'],
                          infer_datetime_format=True,
                          dayfirst=True,
                          encoding='utf-8',
                          error_bad_lines=False,
                          warn_bad_lines=True
                         )

    read_file_handler_end(humidity_file, 'Humidity', df_humd, 'df_humd')
    
    # make sure the nest IDs are all caps
    df_humd['nest_id'] = df_humd['nest_id'].apply(lambda x: x.strip()).apply(lambda x: x.upper())

Tue Jan 24 13:33:22 2017 - Loading the Humidity data file.
Tue Jan 24 13:33:22 2017 - Humidity file is 93.235 MB.
Tue Jan 24 13:33:22 2017 - Loading into memory. Please be patient.
Tue Jan 24 13:33:35 2017 - Success: loaded 2,173,732 records.


## 3 Join the loaded data
### 3.1 Join the temperature and humidity data (creates df_sensor data)
We maintain a shortcut: if the joined temp-humidity output file (csv) already exists then skip this step.
Note that the same sensor records temp and humidity simultaneously, so the datetime stamps align and can be used in the join.

In [155]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), '- Combined temp and humidity file found. Skipping the temp and humidity data join.', flush=True)
else:
    print('\n{0} - Joining the temperature and humidity data sets.'.format(str(time.ctime())), flush=True)
    df_sensor_data = pd.merge(left=df_temp,
                            right=df_humd,
                            how='outer',
                            on=['nest_id', 'datetime'], # both have same keys
                            left_on=None, # same key names: don't need to specify R and L
                            right_on=None, # same key names: don't need to specify R and L
                            left_index=False, # dont' use left df index as key
                            right_index=False, # dont' use right df index as key
                            sort=True, # for efficiency do/not sort the df first
                            suffixes=['_temp', '_humd']
                            )[['nest_id', 'datetime', 'temp_c', 'humidity']] # take only these cols

    print('{0} - Join complete. Here are the stats:'.format(str(time.ctime())))
    print('Records in temperature data: {0:>20,}'.format(len(df_temp)))
    print('Records in humidity data:    {0:>20,}'.format(len(df_humd)))
    print('                              -------------------')
    print('Records in joined data:      {0:>20,}'.format(len(df_sensor_data)))
    print('\nOverview:')
    gb = df_sensor_data.groupby(['nest_id'])
    print('Number of nest_ids:          {0:>20,}\n'.format(len(gb)))
    del gb
    write_temp_file(df_sensor_data, joined_data_file, 'df_sensor_data')


Wed Jan 25 14:48:54 2017 - Joining the temperature and humidity data sets.
Wed Jan 25 14:48:55 2017 - Join complete. Here are the stats:
Records in temperature data:            2,169,903
Records in humidity data:               2,173,732
                              -------------------
Records in joined data:                 2,173,738

Overview:
Number of nest_ids:                           138

Wed Jan 25 14:48:56 2017 - Writing intermediate table df_sensor_data to disk.
Wed Jan 25 14:49:20 2017 - Written TempHumdCombined.csv: 148.404 MB


### 3.1.1 Shortcut: Load the joined temp and humidity csv
Do this if the dataframe (df_sensor_data) does not exist but the csv of it (created in a previous run of the script) does. 

In [146]:
# if we don't have a df_sensor_data but we do have the output already on disk, then load it. Else do nothing.
if df_sensor_data is None and os.path.isfile(joined_data_file):
    read_file_handler_start(joined_data_file, 'Combined temp and humidity')
    data_types = {'nest_id': str,
    #               'recnum': np.int32, 
                  'datetime': str, 
                  'temp_c': np.float32,
                  'humidity': np.float32, 
                  'breeding_year': np.float32}
    df_sensor_data = pd.read_csv(joined_data_file,
                                 dtype=data_types,
#                                  nrows=2048,               # for testing only
                                 parse_dates=['datetime'],
                                 dayfirst=True,
                                 encoding='utf-8',
                                 error_bad_lines=False,
                                 warn_bad_lines=True
                                )
    read_file_handler_end(joined_data_file, 'Combined temp and humidity', df_sensor_data, joined_data_file)

### 3.1.2 Calculations per-sensor reading
The following calculations are added per sensor reading:
* The `breeding_year`: same as the calendar year
* `temp_bucket` is a category for each 5C temperature range: <0, 0-5, .., 60+
* `humidity_bucket`: is a category for roughly 20% humidity ranges, based on human comfort zones
* `average_activity_phase`: the average activity conducted at the time of the observation

Note: An `actual_activity_phase` (the current phase of breeding based on per-nest observations) is added later

In [148]:
def temp_bucket(temp_c):
    result = None
    if temp_c < 0:
        result = 'temp_<0'
    elif temp_c >= 100:
        result = 'temp_100+'
    else:
        floor = (temp_c // 10) * 10
        ceiling = floor + 10
        result = 'temp_{0:.0f}-{1:.0f}'.format(floor, ceiling)
    return result

def humidity_bucket(humidity):
    '''
    Returns buckets every 20% from 0 to 160+
    '''
    result = None
    if humidity < 20: # lung & eye irritation in humans
        result = 'RH%_<20'
    elif humidity >= 160: # dripping; probably underwater
        result = 'RH%_160+'
    else:
        floor = (humidity // 20) * 20
        ceiling = floor + 20
        result = 'RH%_{0:.0f}-{1:.0f}'.format(floor, ceiling)
    return result

def average_activity_phase(sensor_datetime):
    '''
    Returns the current phase of breeding based on per-nest observations. Phases are generally:
    1 Jan - 31 Mar: moulting
    1 Apr - 31 May: nest building
    1 Jun - 30 Jun: laying
    1 Jul - 7 Aug: incubating
    8 Aug - 30 Sep: rearing
    1 Oct - 30 Oct: fledging
    1 Nov - 31 Dec: post-fledging
    There can be two lays per season. The second lay is not considered in the average timeframes 
    above.
    '''
    if sensor_datetime is None:
        return None
    elif sensor_datetime.month >= 1 and sensor_datetime.month <= 3:
        return 'moulting'
    elif sensor_datetime.month >= 4 and sensor_datetime.month >= 5:
        return 'nest building'
    elif sensor_datetime.month == 6:
        return 'laying'
    elif sensor_datetime.month == 7:
        return 'incubating'
    elif sensor_datetime.month == 8 and date(sensor_datetime.year, sensor_datetime.month, sensor_datetime.day) <= date(sensor_datetime.year, 8, 7):
        return 'incubating'
    elif sensor_datetime.month == 8 and date(sensor_datetime.year, sensor_datetime.month, sensor_datetime.day) > date(sensor_datetime.year, 8, 7):
        return 'rearing'
    elif sensor_datetime.month >= 9:
        return 'rearing'
    elif sensor_datetime.month >= 10:
        return 'fledging'
    elif sensor_datetime.month in [11, 12]:
        return 'post-fledging'
    else:
        return 'unknown'

In [151]:
# Maintain a shortcut: the output is saved to csv, so if it exists from a previous run, then skip this step and load that instead.
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), '- Combined temp and humidity file found. Skipping calculated fields.', flush=True)
else:
    # add the breeding_year (same as calendar year): 
    print(str(time.ctime()), '- Calculating breeding year.', end='', flush=True)
    df_sensor_data['breeding_year'] = df_sensor_data['datetime'].apply(lambda x: x.year)
    print(' Done.', flush=True)

    # add the average breeding phases 
    print(str(time.ctime()), '- Calculating average activity periods.', end='', flush=True)
    df_sensor_data['average_activity_period'] = df_sensor_data['datetime'].apply(average_activity_phase)
    print(' Done.', flush=True)

    # Add flags for various temperature ranges. 
    # These are summed to give the amount of time in the temp band
    print(str(time.ctime()), '- Calculating temperature buckets.', end='', flush=True)
    df_sensor_data['temp_bucket'] = df_sensor_data['temp_c'].apply(temp_bucket)
    print(' Done.', flush=True)

    # # Add flags for various humidity ranges. 
    # # These are summed to give the amount of time in the humidity band
    print(str(time.ctime()), '- Calculating humidity buckets.', end='', flush=True)
    df_sensor_data['humidity_bucket'] = df_sensor_data['humidity'].apply(humidity_bucket)
    print(' Done.', flush=True)

Wed Jan 25 14:28:41 2017 - Calculating breeding year. Done.
Wed Jan 25 14:28:52 2017 - Calculating average activity periods. Done.
Wed Jan 25 14:29:02 2017 - Calculating temperature buckets. Done.
Wed Jan 25 14:29:08 2017 - Calculating humidity buckets. Done.


In [152]:
# write the new joined-and-augmented sendor dataframe to csv as a shortcut for next time.
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping writing the combined data to file.', flush=True)
else:
    print('\n{0} - Writing the joined dataset to csv.'.format(str(time.ctime())), flush=True)
    df_sensor_data.to_csv(path_or_buf=joined_data_file,
                     sep=',',
                     na_rep='',
                     float_format='%.3f',
                     header=True,
                     index=False,
                     mode='w',
                     encoding='utf-8')
    print('{0} - File written: {1} ({2:.1f}MB)'.format(str(time.ctime()), str(joined_data_file), os.path.getsize(joined_data_file)/1000000), flush=True)

Wed Jan 25 14:29:37 2017 Combined temp and humidity file found. Skipping writing the combined data to file.


### 3.2 Join the NestCharacteristic Static and Seasonal data
`df_nest_seasonal` + `df_nest_static` -> `df_nest_joined`

Seasonal is `left` and Static is `right`, such that the Seasonal data is augmented with the nests static metadata.

In [139]:
print('{0} Joining the Nest Characteristic (Seasonal and Static) data sets .'.format(str(time.ctime())), flush=True)
df_nest_joined = pd.merge(left=df_nest_seasonal,
                            right=df_nest_static,
                            how='left',
                            on=['nest_id'], # both have same keys
                            left_on=None, # same key names: don't need to specify R and L
                            right_on=None, # same key names: don't need to specify R and L
                            left_index=False, # dont' use left df index as key
                            right_index=False, # dont' use right df index as key
                            sort=True, # for efficiency do/not sort the df first
                            suffixes=['_seasonal', '_static']
                            )
if df_nest_joined is not None:
    print('{0} - Join complete. Here are the stats:'.format(str(time.ctime())))
    print('Records in seasonal data:    {0:>20,}'.format(len(df_nest_seasonal)))
    print('Records in static data:      {0:>20,}'.format(len(df_nest_static)))
    print('                              -------------------')
    print('Records in joined data:      {0:>20,}'.format(len(df_nest_joined)))
    print('\nOverview:')
    gb = df_nest_joined.groupby(['nest_id'])
    print('Number of nest_ids:          {0:>20,}'.format(len(gb)))
    write_temp_file(df_nest_joined, 'df_nest_joined.csv', 'df_nest_joined')
    del gb
else:
    print('{0} - JOIN FAILED!!!.'.format(str(time.ctime())), flush=True)

Wed Jan 25 13:28:38 2017 Joining the Nest Characteristic (Seasonal and Static) data sets .
Wed Jan 25 13:28:38 2017 - Join complete. Here are the stats:
Records in seasonal data:                   1,711
Records in static data:                       241
                              -------------------
Records in joined data:                     1,711

Overview:
Number of nest_ids:                           193
Wed Jan 25 13:28:38 2017 - Writing intermediate table df_nest_joined to disk.
Wed Jan 25 13:28:38 2017 - Written df_nest_joined.csv: 0.455 MB


### 3.3 Join the breeding stats together
`df_breeding_gb + df_clutch_count -> df_breeding_annual_stats`

Clutch counts per year and annual clutch survival stats.
Note that the reduced record count in `df_breeding_annual_stats` compared to `df_clutch_count` is due to a number of nest-years having blank/zero clutches. These are excluded.

In [140]:
# join the clutch count on to the annual stats
print('\n{0} - Merging the aggregated breeding stats.'.format(str(time.ctime())), flush=True)
df_breeding_annual_stats = pd.merge(left=df_breeding_gb,
                                    right=df_clutch_count,
                                    how='left',
                                    on=['nest_id', 'year'], # both have same keys
                                    sort=True # for efficiency do/not sort the df first
                                   )

print('{0} - Join complete. Here are the stats:'.format(str(time.ctime())))
print('Records in annual stats data: {0:>20,}'.format(len(df_breeding_gb)))
print('Records in clutch count data: {0:>20,}'.format(len(df_clutch_count)))
print('                              -------------------')
print('Records in joined data:       {0:>20,}'.format(len(df_breeding_annual_stats)))
print('\nOverview:')
gb = df_breeding_annual_stats.groupby(['nest_id'])
print('Number of nest_ids in clutch count:   {0:>12,}'.format(len(df_clutch_count.groupby(['nest_id']))))
print('Number of nest_ids in breeding stats: {0:>12,}'.format(len(df_breeding_gb.groupby(['nest_id']))))
print('Number of nest_ids in joined:         {0:>12,}'.format(len(df_breeding_annual_stats.groupby(['nest_id']))))
write_temp_file(df_breeding_annual_stats, 'df_breeding_annual_stats.csv', 'df_breeding_annual_stats')
del gb


Wed Jan 25 13:28:44 2017 - Merging the aggregated breeding stats.
Wed Jan 25 13:28:44 2017 - Join complete. Here are the stats:
Records in annual stats data:                  267
Records in clutch count data:                  302
                              -------------------
Records in joined data:                        267

Overview:
Number of nest_ids in clutch count:            129
Number of nest_ids in breeding stats:          121
Number of nest_ids in joined:                  121
Wed Jan 25 13:28:44 2017 - Writing intermediate table df_breeding_annual_stats to disk.
Wed Jan 25 13:28:44 2017 - Written df_breeding_annual_stats.csv: 0.016 MB


### 3.4 Join the Nest data (seasonal and static) to the Breeding stats
`df_nest_joined + df_breeding_annual_stats -> df_nest_and_breeding`

In [141]:
# join the annual clutch and breeding stats onto the full seasonal and static nest data
print('\n{0} - Merging the aggregated breeding stats to the static and seasonal nest data.'.format(str(time.ctime())), flush=True)
df_nest_and_breeding = pd.merge(left=df_nest_joined,
                                right=df_breeding_annual_stats,
                                how='left',
                                on=['nest_id','year'], # both have same keys
                                sort=True 
                               )

print('{0} - Join complete. Here are the stats:'.format(str(time.ctime())))
print('Records in nest data:                 {0:>12,}'.format(len(df_nest_joined)))
print('Records in breeding stats data:       {0:>12,}'.format(len(df_breeding_annual_stats)))
print('                                      ------------')
print('Records in joined data:               {0:>12,}'.format(len(df_nest_and_breeding)))
print('\nOverview:')
gb = df_breeding_annual_stats.groupby(['nest_id'])
print('Number of nest_ids in nest data:      {0:>12,}'.format(len(df_nest_joined.groupby(['nest_id']))))
print('Number of nest_ids in breeding stats: {0:>12,}'.format(len(df_breeding_annual_stats.groupby(['nest_id']))))
print('Number of nest_ids in joined:         {0:>12,}'.format(len(df_nest_and_breeding.groupby(['nest_id']))))
write_temp_file(df_nest_and_breeding, 'NestDataWithBreedingStats.csv', 'df_nest_and_breeding')
del gb


Wed Jan 25 13:28:52 2017 - Merging the aggregated breeding stats to the static and seasonal nest data.
Wed Jan 25 13:28:52 2017 - Join complete. Here are the stats:
Records in nest data:                        1,711
Records in breeding stats data:                267
                                      ------------
Records in joined data:                      1,829

Overview:
Number of nest_ids in nest data:               193
Number of nest_ids in breeding stats:          121
Number of nest_ids in joined:                  193
Wed Jan 25 13:28:52 2017 - Writing intermediate table df_nest_and_breeding to disk.
Wed Jan 25 13:28:52 2017 - Written df_nest_and_breeding.csv: 0.528 MB


### 3.5 Aggregate the sensor data into stats per breeding phase
To understand the effect of nest conditions (from sensor data) in the choice of nest and breeding success of the nest, we need to break up the stats into:
* *annual stats* which represent the averages, spikes etc for the entire year. These give an understanding of the nest itself.
* *phase stats* which represent the conditions during specific phases of the breeding cycle. E.g. during nesting, during incubation, during rearing. To get these phase stats, we need to get the phase boundary dates from the breeding observation data.

The nest sensor readings are aggregated to summarise the nest conditions by `nest`, `breeding_year` and `activity_phase`.

### 3.5.1 Get the actual breeding phase dates
Summarise the breeding data to obtain the following:
* list of all nests (regardless of breeding activity)
* the `nesting_date` for each nest in each year
* the `egg_lay_date` for each nest, year and clutch
* the `hatch_date` for each nest, year and clutch
* the `fledge_date` for each nest, year and clutch

Join these all back together to get the phase dates all in one place, then join the combined result on to the sensor data table and calculate the phase in which each sensor reading occurred.
This will take a while.

**Issue: Nesting dates dont work: the second clutch will have first nesting date and the first obs for many nests is after the lay date, so nesting_date > lay_date**

In [142]:
# for each nest, year and clutch, get the following:
# first activity_status date (nesting_date), EggLayDate, hatch_date, fledge_date
# nesting_date, hatch_date, fledge_date are the min observation_date per nest, year, clutch where the value is not NaN

print('{0} - Calculating the breeding phase dates for each nest and year.'.format(str(time.ctime())), flush=True)
# all observed nests
df_all_nests = df_nest_static[['nest_id']].drop_duplicates()
write_temp_file(df_all_nests, 'df_all_nests.csv', 'df_all_nests')

# egg_lay_date
gb_lay_date = df_breeding[['nest_id', 'year', 'clutch', 'EggLayDate']
                         ].groupby(['nest_id', 'year', 'clutch']).min().reset_index()
gb_lay_date.rename(columns={'EggLayDate': 'egg_lay_date'}, inplace=True)
# nesting date: 31 days before egg_lay_date
gb_lay_date['courting_date'] = gb_lay_date['egg_lay_date'] - datetime.timedelta(days=31)
write_temp_file(gb_lay_date, 'gb_lay_date.csv', 'gb_lay_date')

# hatch_date
def hatch_date(row):
    return row['observation_date'] - datetime.timedelta(days=row['ChicksAge'])
# get the observation date (select columns)                                                 where age is not blank (i.e. they're there)
gb_hatch_date = df_breeding[['nest_id', 'year', 'clutch', 'observation_date', 'ChicksAge']].loc[df_breeding['ChicksAge'].notnull()]
gb_hatch_date['hatch_date'] = gb_hatch_date.apply(hatch_date, axis=1)
# get the min hatch_date 
gb_hatch_date = gb_hatch_date[['nest_id', 'year', 'clutch', 'hatch_date']].groupby(['nest_id', 'year', 'clutch']).min().reset_index()
write_temp_file(gb_hatch_date, 'gb_hatch_date.csv', 'gb_hatch_date')

# fledge_date
# is either the date that the chicks were of age and no longer observed in the nest, or were observed dead
# get the observation date (select columns) where there is a fledge flag
gb_fledge_date = df_breeding[['nest_id', 'year', 'clutch', 'observation_date', 'ChicksAlive', 'ChicksDead', 'ChicksFledge']].fillna(0)
gb_fledge_date['dead_or_fledged'] = gb_fledge_date.apply(lambda row: row['ChicksFledge'] > 0 or (row['ChicksDead'] > 0 and row['ChicksAlive'] == 0), axis=1)
gb_fledge_date = gb_fledge_date.query('dead_or_fledged')
# get the min obs date, which is the earliest fledge recording (per clutch)
gb_fledge_date = gb_fledge_date[['nest_id', 'year', 'clutch', 'observation_date']].groupby(['nest_id', 'year', 'clutch']).min().reset_index()
# rename the obs date 
gb_fledge_date.rename(columns={'observation_date': 'dead_or_fledge_date'}, inplace=True)
write_temp_file(gb_fledge_date, 'gb_fledge_date.csv', 'gb_fledge_date')

# join the key date tables together
print('{0} - Merging the phase date tables.'.format(str(time.ctime())), flush=True)
df_phase_dates = pd.merge(left=df_all_nests, right=gb_lay_date, how='left', on=['nest_id'], sort=True)
df_phase_dates = pd.merge(left=df_phase_dates, right=gb_hatch_date, how='left', on=['nest_id', 'year', 'clutch'], sort=True)
df_phase_dates = pd.merge(left=df_phase_dates, right=gb_fledge_date, how='left', on=['nest_id', 'year', 'clutch'], sort=True)
write_temp_file(df_phase_dates, 'df_phase_dates.csv', 'df_phase_dates')

print('{0} - Pivot breeding data to get the clutch dates.'.format(str(time.ctime())), flush=True)
# get the required cols
df_clutch_pivot = gb_lay_date[['nest_id', 'year', 'clutch', 'egg_lay_date']].copy()
# we have to combine the index because pivot() does not like a multi-index
df_clutch_pivot['nestyear'] = df_clutch_pivot['nest_id'] + '-' + df_clutch_pivot['year'].apply(lambda x: str(x))
# drop the old index fields
df_clutch_pivot = df_clutch_pivot[['nestyear', 'clutch', 'egg_lay_date']]
# do the pivot to get the (up to three) clutch dates per nest and year
df_clutch_pivot = df_clutch_pivot.pivot(index='nestyear', columns='clutch')['egg_lay_date'].reset_index()
# rename and restore the indexes
df_clutch_pivot.rename(columns={1.0: 'clutch_1', 2.0: 'clutch_2', 3.0: 'clutch_3'}, inplace=True)
df_clutch_pivot['nest_id'] = df_clutch_pivot['nestyear'].apply(lambda x: x.split('-')[0])
df_clutch_pivot['breeding_year'] = df_clutch_pivot['nestyear'].apply(lambda x: int(x.split('-')[1]))
df_clutch_pivot = df_clutch_pivot[['nest_id', 'breeding_year', 'clutch_1', 'clutch_2', 'clutch_3']]
write_temp_file(df_clutch_pivot, 'df_clutch_pivot.csv', 'df_clutch_pivot')

Wed Jan 25 13:44:04 2017 - Calculating the breeding phase dates for each nest and year.
Wed Jan 25 13:44:04 2017 - Writing intermediate table df_all_nests to disk.
Wed Jan 25 13:44:04 2017 - Written df_all_nests.csv: 0.002 MB
Wed Jan 25 13:44:04 2017 - Writing intermediate table gb_lay_date to disk.
Wed Jan 25 13:44:04 2017 - Written gb_lay_date.csv: 0.010 MB
Wed Jan 25 13:44:04 2017 - Writing intermediate table gb_hatch_date to disk.
Wed Jan 25 13:44:04 2017 - Written gb_hatch_date.csv: 0.006 MB
Wed Jan 25 13:44:04 2017 - Writing intermediate table gb_fledge_date to disk.
Wed Jan 25 13:44:04 2017 - Written gb_fledge_date.csv: 0.003 MB
Wed Jan 25 13:44:04 2017 Merging the phase date tables.
Wed Jan 25 13:44:04 2017 - Writing intermediate table df_phase_dates to disk.
Wed Jan 25 13:44:04 2017 - Written df_phase_dates.csv: 0.016 MB


** Add clutch dates to the sensor data** 

The `sensor_data` are lacking a `clutch` number, which will create duplicates if we attempt to join on the phase dates. Get the clutch dates and join them into the `sensor_data`.

In [153]:
# to avoid making epic dupes, we need to first add the clutch number on to the sensor data table

print('{0} - Join the clutch dates to the sensor data.'.format(str(time.ctime())), flush=True)
# join on to the sensor data
df_sensor_clutch = pd.merge(left=df_sensor_data, 
                            right=df_clutch_pivot, 
                            how='left', 
                            on=['nest_id', 'breeding_year'],
                            sort=True
                           )
print('{0} - Done. Rows: {1:,}'.format(str(time.ctime()), len(df_sensor_clutch)), flush=True)

print('{0} - Assigning a clutch number to each sensor record. Be patient.'.format(str(time.ctime())), flush=True)
# flag each reading with a clutch number
def clutch_number(row):
    if pd.isnull(row['clutch_1']):
        # there are no breeding observations for this nest and year
        return 0
    else:
        # there is at least 1 clutch
        if pd.isnull(row['clutch_2']) or row['datetime'] < row['clutch_2']:
            # there was only a single clutch, or there were >1 but this reading was before the 2nd clutch
            return 1
        elif pd.isnull(row['clutch_3']) or (not pd.isnull(row['clutch_3']) and row['datetime'] < row['clutch_3']):
            # there is a 2nd clutch if we got this far. if there is no 3rd, or the reading is before the 3rd, then this is 2nd
            return 2
        else:
            # there is a 3rd clutch and the sensor reading is after the 3rd
            return 3

df_sensor_clutch['clutch_number'] = df_sensor_clutch.apply(lambda row: clutch_number(row), axis=1)
print('{0} - Done.'.format(str(time.ctime())), flush=True)
write_temp_file(df_sensor_clutch, 'df_sensor_clutch.csv', 'df_sensor_clutch')

Wed Jan 25 14:30:31 2017 - Join the clutch dates to the sensor data.
Wed Jan 25 14:30:31 2017 - Done. Rows: 2,173,738
Wed Jan 25 14:30:31 2017 - Assigning a clutch number to each sensor record. Be patient.
Wed Jan 25 14:34:38 2017 - Done.


### 3.5.2 Get the breeding phase against each sensor reading
1. Join the phase dates on to the sensor data
2. Use the phase date to calculate the breeding_phase for each sensor reading

In [156]:
print('{0} - Join the phase dates on to the sensor data.'.format(str(time.ctime())), flush=True)
df_sensor_phase = pd.merge(left=df_sensor_clutch,
                        right=df_phase_dates,
                        how='left',
                        left_on=['nest_id', 'breeding_year', 'clutch_number'], # same key names: don't need to specify R and L
                        right_on=['nest_id', 'year', 'clutch'], # same key names: don't need to specify R and L
                        sort=True # for efficiency do/not sort the df first
#                             suffixes=['_temp', '_humd']
                        )
print('{0} - Done. Rows: {1:,}'.format(str(time.ctime()), len(df_sensor_phase)), flush=True)

if os.path.isfile(os.path.normpath('df_sensor_phase.pkl')):
    print(str(time.ctime()), '- Combined breeding_phase for sensor readings file found. Skipping this calc and loading that file.', flush=True)
    df_sensor_phase = pd.read_pickle(os.path.normpath('df_sensor_phase.pkl'))
    print('{0} - Done.'.format(str(time.ctime())), flush=True)
else:
    print('{0} - Calculating the breeding_phase for each sensor reading. Be patient.'.format(str(time.ctime())), flush=True)
    # for each sensor reading, determine the breeding_phase:
    # 'courting' iff date between nesting_date and egg_lay_date
    # 'incubating' iff date between egg_lay_date and hatch_date
    # 'rearing' iff date between hatch_date adn fledge_date
    # 'courting' iff clutch < clutch_count and date between fledge_date and egg_lay_date
    # else 'unoccupied' 

    def breeding_phase(row):
        if pd.isnull(row['egg_lay_date']) or row['clutch_number'] == 0: 
            # no activity this year
            return 'unoccupied'

        elif pd.isnull(row['hatch_date']):
            # laid but never hatched
            if row['datetime'] <= row['egg_lay_date'] + datetime.timedelta(days=35):
                # this egg never hatches, but the current sensor period is incubation
                return 'incubating'
            else:
                # this egg never hatches, and the current sensor period is past the 35 day incubation period
                return 'unoccupied'

        elif pd.isnull(row['dead_or_fledge_date']): 
            # hatched but never fledged
            if row['datetime'] <= row['hatch_date'] + datetime.timedelta(days=80):
                # oldest chick at fledge was 77 days, so assume up to 80
                return 'rearing'
            else:
                # the chicks must be missing
                return 'unoccupied' 

        elif row['datetime'] < row['courting_date']:
            # no one has moved in yet
            return 'unoccupied'

        elif row['clutch_number'] == 1 and row['courting_date'] <= row['datetime'] <= row['egg_lay_date']:
            # for the first clutch, courting is 31 days prior to lay
            return 'courting'

        elif row['clutch_number'] > 1 and row['datetime'] <= row['egg_lay_date']:
            # consider it courting again between fledging and second clutch
            return 'courting'

        elif row['egg_lay_date'] <= row['datetime'] <= row['hatch_date']:
            return 'incubating'

        elif row['hatch_date'] <= row['datetime'] <= row['dead_or_fledge_date']:
            return 'rearing'

        elif row['datetime'] > row['dead_or_fledge_date']:
            return 'unoccupied'

        else:
            return 'undefined'

    df_sensor_phase['breeding_phase'] = df_sensor_phase.apply(lambda row: breeding_phase(row), axis=1)
    df_sensor_phase['nest_year'] = df_sensor_phase.apply(lambda row: '{0}-{1:.0f}'.format(row['nest_id'], row['year']), axis=1)
    print('{0} - Done.'.format(str(time.ctime())), flush=True)
    print('{0} - Writing to pickle for later.'.format(str(time.ctime())), flush=True)
    df_sensor_phase.to_pickle(os.path.normpath('df_sensor_phase.pkl'))
    print('{0} - Done.'.format(str(time.ctime())), flush=True)
    write_temp_file(df_sensor_phase, 'SensorDataWithBreedingPhase.csv', 'df_sensor_phase')

Wed Jan 25 14:51:52 2017 - Join the phase dates on to the sensor data.
Wed Jan 25 14:51:53 2017 - Done. Rows: 2,173,738
Wed Jan 25 14:51:53 2017 - Writing intermediate table df_sensor_phase to disk.
Wed Jan 25 14:52:54 2017 - Written df_sensor_phase.csv: 307.077 MB


**Do a clean up of dataframes that we'll no longer need.**

In [172]:
print('{0} - Cleaning up intermediate data tables...'.format(str(time.ctime())), flush=True)
del df_sensor_clutch
del gb_lay_date
del gb_hatch_date
del gb_fledge_date
del df_nest_joined
del df_breeding_annual_stats
del df_clutch_count
del df_breeding_gb
print('{0} - Done.'.format(str(time.ctime())), flush=True)

Wed Jan 25 15:45:50 2017 - Cleaning up intermediate data tables...
Wed Jan 25 15:45:50 2017 - Done.


### ----------------------------------------------------------------
# Dev and Test
### ----------------------------------------------------------------

### Release memory by deleting DFs that are no longer required

In [None]:
# do it here

### Calculate the annual microclimate stats for each nest
This is used to understand the annual nest output absed on it's characteristics in the breeding year

In [124]:
# df_sensor_phase = pd.read_pickle(os.path.normpath('..\\0_data\\df_sensor_phase.pkl'))

In [212]:
# get the YEARLY temp and humidity mean, min, max, stddev for each nest and year
temp_annual = df_sensor_phase[['nest_id', 'year', 'temp_c']].dropna().groupby(['nest_id', 'year']).agg([np.min, np.max, np.mean, np.std]).reset_index()
temp_annual.rename(columns={'temp_c': 'temp_annual_'}, inplace=True)
temp_annual.columns = list(map(''.join, temp_annual.columns.values))

humidity_annual = df_sensor_phase[['nest_id', 'year', 'humidity']].dropna().groupby(['nest_id', 'year']).agg([np.min, np.max, np.mean, np.std]).reset_index()
humidity_annual.rename(columns={'humidity': 'humidity_annual_'}, inplace=True)
humidity_annual.columns = list(map(''.join, humidity_annual.columns.values))

In [163]:
temp_annual.head(2)

Unnamed: 0,nest_id,year,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,101,2013.0,8.59,37.599998,18.406204,4.666923
1,101,2014.0,8.59,43.580002,21.536072,5.532999


In [152]:
humidity_annual.head(3)

Unnamed: 0,nest_id,year,humidity_annual_amin,humidity_annual_amax,humidity_annual_mean,humidity_annual_std
0,101,2013.0,28.790001,107.029999,85.430153,15.742786
1,101,2014.0,26.030001,109.050003,75.252083,16.728451
2,10A,2014.0,26.07,126.050003,101.895477,25.549717


### Calculate the microclimate stats for each nest, year and clutch as well as per-phase 
This is used to understand how the microclimate affects the outcome of each clutch and nest selection during courting

In [213]:
# get the PHASE temp and humidity mean, min, max, stddev for each nest, year, clutch and phase
temp_phase = df_sensor_phase[['nest_id', 'year', 'clutch', 'breeding_phase', 'temp_c']].groupby(['nest_id', 'year', 'clutch', 'breeding_phase']).agg([np.min, np.max, np.mean, np.std]).reset_index()
temp_phase.rename(columns={'temp_c': 'temp_phase_'}, inplace=True)
temp_phase.columns = list(map(''.join, temp_phase.columns.values))

humidity_phase = df_sensor_phase[['nest_id', 'year', 'clutch', 'breeding_phase', 'humidity']].groupby(['nest_id', 'year', 'clutch', 'breeding_phase']).agg([np.min, np.max, np.mean, np.std]).reset_index()
humidity_phase.rename(columns={'humidity': 'humidity_phase_'}, inplace=True)
humidity_phase.columns = list(map(''.join, humidity_phase.columns.values))

# Make a dummy variable that we can sum to get the count of time at each bucket temp/humidity.
# Note that the sensor readings are taken every 15 mins, so a dummy value of 0.25 means the sum of bucket records
#   equals the total hours in that bucket.
temp = df_sensor_phase.copy()
temp['counter'] = 0.25

# get the temp and humidity buckets for each nest, year, clutch, phase
temp_phase_bucket = temp.pivot_table(values='counter', index=['nest_id', 'year', 'clutch', 'breeding_phase'], columns='temp_bucket', aggfunc=np.sum).reset_index()
temp_phase_bucket.rename(columns={'humidity': 'temp_phase_bucket_'}, inplace=True)
temp_phase_bucket.columns = list(map(''.join, temp_phase_bucket.columns.values))

humidity_phase_bucket = temp.pivot_table(values='counter', index=['nest_id', 'year', 'clutch', 'breeding_phase'], columns='humidity_bucket', aggfunc=np.sum).reset_index()
humidity_phase_bucket.rename(columns={'humidity': ' humidity_phase_bucket_'}, inplace=True)
humidity_phase_bucket.columns = list(map(''.join, humidity_phase_bucket.columns.values))

In [170]:
temp_phase.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,temp_phase_amin,temp_phase_amax,temp_phase_mean,temp_phase_std
0,101,2013.0,1.0,incubating,8.59,24.610001,16.033649,2.331077
1,101,2013.0,1.0,unoccupied,10.09,37.599998,20.032824,5.146225
2,101,2014.0,1.0,rearing,8.59,43.580002,21.536072,5.532999


In [171]:
humidity_phase.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,humidity_phase_amin,humidity_phase_amax,humidity_phase_mean,humidity_phase_std
0,101,2013.0,1.0,incubating,57.990002,103.489998,92.353508,6.944231
1,101,2013.0,1.0,unoccupied,28.790001,107.029999,80.68351,18.147106
2,101,2014.0,1.0,rearing,26.030001,109.050003,75.252083,16.728451


In [175]:
temp_phase_bucket.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,temp_0-5,temp_10-15,temp_15-20,temp_20-25,temp_25-30,temp_30-35,temp_35-40,temp_40-45,temp_45-50,temp_5-10,temp_50-55,temp_55-60
0,101,2013.0,1.0,incubating,,317.75,675.25,53.25,,,,,,7.0,,
1,101,2013.0,1.0,unoccupied,,236.25,616.0,429.5,169.0,75.25,10.25,,,,,
2,101,2014.0,1.0,rearing,,180.75,639.25,719.0,320.25,123.25,43.25,2.25,,11.25,,


In [177]:
humidity_phase_bucket.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,RH%_100+,RH%_20-30,RH%_30-50,RH%_50-60,RH%_60-80,RH%_80-100,RH%_<20
0,101,2013.0,1.0,incubating,74.5,,,3.0,65.25,910.5,
1,101,2013.0,1.0,unoccupied,293.75,0.25,91.25,143.25,498.25,509.5,
2,101,2014.0,1.0,rearing,134.25,5.75,160.75,202.0,890.5,646.0,


In [214]:
df_nest_and_breeding.columns

Index(['type', 'nest_id', 'BoxSeasYear', 'date', 'year', 'season',
       'BoxCoverTotal', 'BoxCoverDead', 'BoxWood', 'BoxWoodDead', 'BoxVeg',
       'BoxVegDead', 'QuadCoverTotal', 'QuadCoverDead', 'QuadWood',
       'QuadWoodDead', 'QuadVeg', 'QuadVegDead', 'comments', 'nest_type',
       'distance_to_boardwalk_m', 'distance_to_vegetation_m',
       'distance_to_landfall', 'entrance_bearing', 'box_height_mm',
       'box_length_mm', 'box_width_mm', 'box_wall_width_mm', 'box_lid_depth',
       'internal_height_mm', 'internal_width_mm', 'internal_length_mm',
       'entrance_height', 'entrance_width', 'entrance_length', 'vents',
       'box_vol_L', 'box_area_cm2', 'box_has_tunnel', 'shape', 'elevation',
       'easting', 'northing', 'aspect', 'slope', 'duration_of_insolation',
       'comment', 'clutch', 'egg_count', 'chick_count', 'fledge_count',
       'lay_date', 'age_at_fledging', 'mass_at_fletching_chick1',
       'mass_at_fletching_chick1', 'IDChick1', 'IDChick2', 'clutch_count']

## Create the final aggregate table

Start by getting the required `nest_and_breeding` fields

### Prepare the final master table: df_microclimate_effects
This table becomes the master dataset for microclimate influences on breeding outcomes.
1. Use df_nest_and_breeding as the base (aggregated breeding stats with static and seasonal nest data)
2. Join to this the following
 1. temp_annual (annual temperature stats per nest)
 2. humidity_annual (annual humidity stats per nest)
 3. temp_phase (temp stats per nest, year, clutch and phase)
 4. humidity_phase (humidity stats per nest, year, clutch and phase)
 5. temp_phase_bucket (hours at each bucketed temp range per nest, year, clutch and phase)
 6. humidity_phase_bucket (hours at each bucketed humidity range per nest, year, clutch and phase)

In [217]:

df_microclimate_effects_annual = df_nest_and_breeding[['type', 'nest_id', 'year', 'clutch', 'date', 'season',
                                                            'BoxCoverTotal', 'BoxCoverDead', 'BoxWood', 'BoxWoodDead', 'BoxVeg', 'BoxVegDead', 
                                                            'QuadCoverTotal', 'QuadCoverDead', 'QuadWood', 'QuadWoodDead', 'QuadVeg', 'QuadVegDead', 'comments',  
                                                            'distance_to_boardwalk_m', 'distance_to_vegetation_m',
                                                            'distance_to_landfall', 'entrance_bearing', 'box_height_mm',
                                                            'box_length_mm', 'box_width_mm', 'box_wall_width_mm', 'box_lid_depth',
                                                            'internal_height_mm', 'internal_width_mm', 'internal_length_mm',
                                                            'entrance_height', 'entrance_width', 'entrance_length', 'vents',
                                                            'box_vol_L', 'box_area_cm2', 'box_has_tunnel', 'shape', 'elevation',
                                                            'easting', 'northing', 'aspect', 'slope', 'duration_of_insolation',
                                                            'comment', 'egg_count', 'chick_count',
                                                            'fledge_count', 'lay_date', 'age_at_fledging',
                                                            'mass_at_fletching_chick1', 'mass_at_fletching_chick1', 'IDChick1',
                                                            'IDChick2', 'clutch_count']].copy()
df_microclimate_effects_annual.rename(columns={'comments':'comments_veg', 'comment':'comments_geo'}, inplace=True)

### Combine `nest_and_breeding` with the sensor aggregate tables for the *annual stats*

In [220]:
temp_annual.head(3)

Unnamed: 0,nest_id,year,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,101,2013.0,8.59,37.599998,18.406204,4.666923
1,101,2014.0,8.59,43.580002,21.536072,5.532999
2,10A,2014.0,7.6,42.110001,20.240257,5.32943


In [218]:
# join the sensor stats (annual and per-phase) onto the nest_and_breeding data
df_microclimate_effects_annual = pd.merge(left=df_microclimate_effects_annual,
                                   right=temp_annual,
                                   how='left',
                                   on=['nest_id', 'year'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_temp_annual']
                                  )

In [192]:
df_microclimate_effects_annual = pd.merge(left=df_microclimate_effects_annual,
                                   right=humidity_annual,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_humidity_annual']
                                  )

In [219]:
df_microclimate_effects_annual.head(20)

Unnamed: 0,type,nest_id,year,clutch,date,season,BoxCoverTotal,BoxCoverDead,BoxWood,BoxWoodDead,...,mass_at_fletching_chick1,mass_at_fletching_chick1.1,mass_at_fletching_chick1.2,IDChick1,IDChick2,clutch_count,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,BOX,100,2013,1.0,2013-11-20,SPRING,4.0,,,,...,,,,,,1.0,,,,
1,BOX,100,2014,1.0,2014-10-07,SPRING,5.0,1.0,,,...,50.0,60.0,50.0,,,1.0,,,,
2,BOX,100,2014,1.0,2014-11-30,SPRING,5.0,1.0,,,...,50.0,60.0,50.0,,,1.0,,,,
3,BOX,100,2015,,2015-02-25,SUMMER,,,,,...,,,,,,,,,,
4,BOX,100,2015,,2015-05-13,AUTUMN,2.0,2.0,,,...,,,,,,,,,,
5,BOX,100,2015,,2015-08-25,WINTER,4.0,1.0,,,...,,,,,,,,,,
6,BOX,100,2015,,2015-11-23,SPRING,5.0,3.0,,,...,,,,,,,,,,
7,BOX,100,2016,,2016-02-16,SUMMER,5.0,3.0,,,...,,,,,,,,,,
8,BOX,100,2016,,2016-05-26,AUTUMN,5.0,2.0,,,...,,,,,,,,,,
9,BOX,100,2016,,2016-08-31,WINTER,4.0,2.0,,,...,,,,,,,,,,


### Combine `nest_and_breeding` with the sensor aggregate tables for the *breeding_phase stats*
Note that this will duplicate the annual stats rows for each breeding phase.

In [197]:
df_microclimate_effects_phase = pd.merge(left=df_microclimate_effects_annual,
                                   right=temp_phase,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_temp_phase']
                                  )

In [195]:
df_microclimate_effects_phase = pd.merge(left=df_microclimate_effects_phase,
                                   right=humidity_phase,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch', 'breeding_phase'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_humidity_phase']
                                  )

In [199]:
df_microclimate_effects_phase.query('nest_id in ["101"]').head(10)

Unnamed: 0,type,nest_id,year,clutch,date,season,BoxCoverTotal,BoxCoverDead,BoxWood,BoxWoodDead,...,temp_annual_std,humidity_annual_amin,humidity_annual_amax,humidity_annual_mean,humidity_annual_std,breeding_phase,temp_phase_amin,temp_phase_amax,temp_phase_mean,temp_phase_std
20,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
21,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
22,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
23,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
24,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
25,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
26,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
27,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
28,BOX,101,2013,2.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,,,,,
29,BOX,101,2014,1.0,2014-10-07,SPRING,1,1.0,,,...,5.532999,26.030001,109.050003,75.252083,16.728451,rearing,8.59,43.580002,21.536072,5.532999


# -------------------------

In [None]:
# tempset = df_sensor_phase.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')

In [None]:
# TESTING ONLY

# temp = df_sensor_phase.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')
# temp = tempset.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')
# temp['date'] = temp['datetime'].apply(pd.datetools.normalize_date)
# temp = temp[['nest_id', 'date', 'breeding_year','clutch_number', 
#        'egg_lay_date', 'courting_date', 'hatch_date', 'dead_or_fledge_date',
#        'breeding_phase']]
# temp = temp.drop_duplicates()

# print(len(temp))
# temp.to_csv('sensor_phase_test.csv')
#E13 2014 egg never hatched:        works
#E10 2014 normal single fledge:     works
#W6 2014 chick ded:                 works

Hold off making the dummy columns until we need to do the stats. This keeps the file size down and lets us save the csv with buckets rather than dummies.

In [None]:
print(str(time.ctime()), 'Creating temp and humidity bucket dummy columns.', end='', flush=True)
df_joined = pd.get_dummies(data=df_joined, columns=['temp_bucket', 'humidity_bucket'])
print(' Done.', flush=True)

In [None]:
print(str(time.ctime()), 'Aggregating data by nest and year.', flush=True)

def percent_of_time(row):
    return 
# group the data by nest_id and breeding year to get the temp and humidity stats per year
temp_aggregations = {
    'temp_c': {
        'temp_count': 'count',
        'temp_avg': 'mean',
        'temp_min': 'min',
        'temp_max': 'max',
        'temp_std_dev': 'std'        
    },
    'humidity': {
        'humidity_count': 'count',
        'humidity_avg': 'mean',
        'humidity_min': 'min',
        'humidity_max': 'max',
        'humidity_std_dev': 'std'  
    },
    'temp_<0': {'bucket_total': 'sum'},
    'temp_0-5': {'bucket_total': 'sum'},
    'temp_5-10': {'bucket_total': 'sum'},
    'temp_10-15': {'bucket_total': 'sum'},
    'temp_15-20': {'bucket_total': 'sum'},
    'temp_20-25': {'bucket_total': 'sum'},
    'temp_25-30': {'bucket_total': 'sum'},
    'temp_30-35': {'bucket_total': 'sum'},
    'temp_35-40': {'bucket_total': 'sum'},
    'temp_40-45': {'bucket_total': 'sum'},
    'temp_45-50': {'bucket_total': 'sum'},
    'temp_50-55': {'bucket_total': 'sum'},
    'temp_55-60': {'bucket_total': 'sum'},
    'temp_60+': {'bucket_total': 'sum'}    
}
df_joined_gb = df_joined.groupby(['nest_id', 'breeding_year']).agg(temp_aggregations)
print(str(time.ctime()), 'Done.', flush=True)

### to add: 
* return the nest_ids and number and type of missing records

In [None]:
print(str(time.ctime()), 'Checking for missing data.', flush=True)
# check for missing temp or humidity readinga
def missing_data(row):
    if row['temp_c']['temp_count'] > row['humidity']['humidity_count']:
        return 'missing_humidity_data'
    elif row['temp_c']['temp_count'] < row['humidity']['humidity_count']:
        return 'missing_temp_data'
    else:
        return None
df_joined_gb['missing_data'] = df_joined_gb.apply(missing_data, axis=1)

print(str(time.ctime()), 'Done.', flush=True)

In [None]:
df_joined.head(10)

In [None]:
df_joined_gb.head(5)

In [None]:
# df_joined_gb['temp_25-30']['%time'] = df_joined_gb['temp_25-30']['bucket_total'] / df_joined_gb['temp_c']['temp_count']
df_joined_gb['temp_25-30_total'] = df_joined_gb['temp_25-30']['bucket_total']
df_joined_gb['temp_25-30_hours'] = df_joined_gb['temp_25-30_total'] / 4
df_joined_gb['temp_25-30_%'] = df_joined_gb['temp_25-30_total'] / df_joined_gb['temp_c']['temp_count']



In [None]:
df_joined_gb.head(5)

In [None]:
df_joined_gb.head(5)

The below sends the data to the PostGres DB.

Currently considering not using the DB at all. While the data maniopulation within the DB via SQL is far easier, keeping the whole project (data load, manipulate, graph) to a single platform and language is a priority.

In [None]:
# #sending temperature dataframe to the postgres DB
# print("Transferring temperature dataframe to DB..")
# df_temp.to_sql(con=engine, name='penguins_temperature', if_exists='replace')
# print("Uploaded successfully")

# #sending humidity dataframe to the postgres DB
# print("Transferring humidity dataframe to DB..")
# df_humd.to_sql(con=engine, name='penguins_humidity', if_exists='replace')
# print("Uploaded successfully")

# #sending nests dataframe to the postgres DB
# print("Transferring nests dataframe to DB..")
# nests_raw.to_sql(con=engine, name='penguins_nests', if_exists='replace')
# print("Uploaded successfully")