# 1B Enrich the data
This script will perform the following:
1. Load the relevant tables from pickle
2. Add calculated fields to the data to derive additional information
3. Write the output to csv and pickle for future use

## 0. Set up environment

In [2]:
# import required libraries
import pandas as pd
import os
import numpy as np
import datetime
import time

# all the useful and reuseable functions are defined in helper_functions.py
import helper_functions as h

# instantiate variables

# create helper functions required to be in the loacl module
def write_temp_file(df, filepath, df_name):
    '''
    If write_temps is true, this function will write the specified Pandas dataframe (df) to csv at the specified location (filepath).
    Variables:
        df: a Pandas dataframe to be written to csv.
        filepath: a string in Unix path format (using / not \) for the csv destination.
        df_name: human readable name or description of the dataframe for logging purposes.
    '''
    if write_temps:
        print('{0} - Writing intermediate table {1} to disk.'.format(str(time.ctime()), df_name, filepath), flush=True)
        df.to_csv(os.path.normpath(filepath))
        if os.path.getsize(filepath) > 0:
            print('{0} - Written {1}: {2:.3f} MB'.format(str(time.ctime()), filepath, os.path.getsize(filepath)/1000000), flush=True)

## 1. Load the Sensor Data
Give preference to the pickle (quick and maintains data types), failing that load from csv, else exit.

In [8]:
if os.path.isfile(os.path.normpath('df_sensor_phase.pkl')):
    h.log('Loading sensor data from pickle...')
    df_sensor_phase = pd.read_pickle(os.path.normpath('df_sensor_phase.pkl'))
    h.log('Done.')
    
elif os.path.isfile(os.path.normpath('SensorDataWithBreedingPhase.csv')):
    h.read_file_handler_start('SensorDataWithBreedingPhase.csv', 'sensor data')
    data_types = {'nest_id': str, 
                  'datetime': str, 
                  'temp_c': np.float32, 
                  'humidity': np.float32, 
                  'breeding_year': np.float32,
                  'average_activity_period': str, 
                  'temp_bucket': str, 
                  'humidity_bucket': str, 
                  'clutch_1': str,
                  'clutch_2': str, 
                  'clutch_3': str, 
                  'clutch_number': np.float32, 
                  'year': np.float32, 
                  'clutch': np.float32,
                  'egg_lay_date': str, 
                  'courting_date': str, 
                  'hatch_date': str, 
                  'dead_or_fledge_date': str
                 }
    df_sensor_phase = pd.read_csv('SensorDataWithBreedingPhase.csv',
                                  header=0,
                                  dtype=data_types,
                                  encoding='utf-8',
                                  parse_dates=['datetime', 'clutch_1', 'clutch_2', 'clutch_3', 
                                               'egg_lay_date', 'courting_date', 'hatch_date', 'dead_or_fledge_date'],
                                  dayfirst=True,
                                  infer_datetime_format=True,
                                  error_bad_lines=True,
                                  warn_bad_lines=True
                                 )
    h.read_file_handler_end(df_sensor_phase, 'sensor data', df_sensor_phase, 'df_sensor_phase')
    

Thu Jan 26 14:03:51 2017 - Loading sensor data from pickle...
Thu Jan 26 14:03:52 2017 - Done.


### 2. Get the breeding phase against each sensor reading
Use the phase date to calculate the breeding_phase for each sensor reading

In [9]:
h.log('Calculating the breeding_phase for each sensor reading. Be patient.')
# for each sensor reading, determine the breeding_phase:
# 'courting' iff date between nesting_date and egg_lay_date
# 'incubating' iff date between egg_lay_date and hatch_date
# 'rearing' iff date between hatch_date adn fledge_date
# 'courting' iff clutch < clutch_count and date between fledge_date and egg_lay_date
# else 'unoccupied' 

def breeding_phase(row):
    if pd.isnull(row['egg_lay_date']) or row['clutch_number'] == 0: 
        # no activity this year
        return 'unoccupied'

    elif pd.isnull(row['hatch_date']):
        # laid but never hatched
        if row['datetime'] <= row['egg_lay_date'] + datetime.timedelta(days=35):
            # this egg never hatches, but the current sensor period is incubation
            return 'incubating'
        else:
            # this egg never hatches, and the current sensor period is past the 35 day incubation period
            return 'unoccupied'

    elif pd.isnull(row['dead_or_fledge_date']): 
        # hatched but never fledged
        if row['datetime'] <= row['hatch_date'] + datetime.timedelta(days=80):
            # oldest chick at fledge was 77 days, so assume up to 80
            return 'rearing'
        else:
            # the chicks must be missing
            return 'unoccupied' 

    elif row['datetime'] < row['courting_date']:
        # no one has moved in yet
        return 'unoccupied'

    elif row['clutch_number'] == 1 and row['courting_date'] <= row['datetime'] <= row['egg_lay_date']:
        # for the first clutch, courting is 31 days prior to lay
        return 'courting'

    elif row['clutch_number'] > 1 and row['datetime'] <= row['egg_lay_date']:
        # consider it courting again between fledging and second clutch
        return 'courting'

    elif row['egg_lay_date'] <= row['datetime'] <= row['hatch_date']:
        return 'incubating'

    elif row['hatch_date'] <= row['datetime'] <= row['dead_or_fledge_date']:
        return 'rearing'

    elif row['datetime'] > row['dead_or_fledge_date']:
        return 'unoccupied'

    else:
        return 'undefined'

df_sensor_phase['breeding_phase'] = df_sensor_phase.apply(lambda row: breeding_phase(row), axis=1)
df_sensor_phase['nest_year'] = df_sensor_phase.apply(lambda row: '{0}-{1:.0f}'.format(row['nest_id'], row['year']), axis=1)
h.log('Done.')

Thu Jan 26 14:06:16 2017 - Calculating the breeding_phase for each sensor reading. Be patient.
Thu Jan 26 14:16:09 2017 - Done.


### Calculate the annual microclimate stats for each nest
This is used to understand the annual nest output absed on it's characteristics in the breeding year

In [124]:
# df_sensor_phase = pd.read_pickle(os.path.normpath('..\\0_data\\df_sensor_phase.pkl'))

In [212]:
# get the YEARLY temp and humidity mean, min, max, stddev for each nest and year
temp_annual = df_sensor_phase[['nest_id', 'year', 'temp_c']].dropna().groupby(['nest_id', 'year']).agg([np.min, np.max, np.mean, np.std]).reset_index()
temp_annual.rename(columns={'temp_c': 'temp_annual_'}, inplace=True)
temp_annual.columns = list(map(''.join, temp_annual.columns.values))

humidity_annual = df_sensor_phase[['nest_id', 'year', 'humidity']].dropna().groupby(['nest_id', 'year']).agg([np.min, np.max, np.mean, np.std]).reset_index()
humidity_annual.rename(columns={'humidity': 'humidity_annual_'}, inplace=True)
humidity_annual.columns = list(map(''.join, humidity_annual.columns.values))

In [163]:
temp_annual.head(2)

Unnamed: 0,nest_id,year,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,101,2013.0,8.59,37.599998,18.406204,4.666923
1,101,2014.0,8.59,43.580002,21.536072,5.532999


In [152]:
humidity_annual.head(3)

Unnamed: 0,nest_id,year,humidity_annual_amin,humidity_annual_amax,humidity_annual_mean,humidity_annual_std
0,101,2013.0,28.790001,107.029999,85.430153,15.742786
1,101,2014.0,26.030001,109.050003,75.252083,16.728451
2,10A,2014.0,26.07,126.050003,101.895477,25.549717


### Calculate the microclimate stats for each nest, year and clutch as well as per-phase 
This is used to understand how the microclimate affects the outcome of each clutch and nest selection during courting

In [213]:
# get the PHASE temp and humidity mean, min, max, stddev for each nest, year, clutch and phase
temp_phase = df_sensor_phase[['nest_id', 'year', 'clutch', 'breeding_phase', 'temp_c']].groupby(['nest_id', 'year', 'clutch', 'breeding_phase']).agg([np.min, np.max, np.mean, np.std]).reset_index()
temp_phase.rename(columns={'temp_c': 'temp_phase_'}, inplace=True)
temp_phase.columns = list(map(''.join, temp_phase.columns.values))

humidity_phase = df_sensor_phase[['nest_id', 'year', 'clutch', 'breeding_phase', 'humidity']].groupby(['nest_id', 'year', 'clutch', 'breeding_phase']).agg([np.min, np.max, np.mean, np.std]).reset_index()
humidity_phase.rename(columns={'humidity': 'humidity_phase_'}, inplace=True)
humidity_phase.columns = list(map(''.join, humidity_phase.columns.values))

# Make a dummy variable that we can sum to get the count of time at each bucket temp/humidity.
# Note that the sensor readings are taken every 15 mins, so a dummy value of 0.25 means the sum of bucket records
#   equals the total hours in that bucket.
temp = df_sensor_phase.copy()
temp['counter'] = 0.25

# get the temp and humidity buckets for each nest, year, clutch, phase
temp_phase_bucket = temp.pivot_table(values='counter', index=['nest_id', 'year', 'clutch', 'breeding_phase'], columns='temp_bucket', aggfunc=np.sum).reset_index()
temp_phase_bucket.rename(columns={'humidity': 'temp_phase_bucket_'}, inplace=True)
temp_phase_bucket.columns = list(map(''.join, temp_phase_bucket.columns.values))

humidity_phase_bucket = temp.pivot_table(values='counter', index=['nest_id', 'year', 'clutch', 'breeding_phase'], columns='humidity_bucket', aggfunc=np.sum).reset_index()
humidity_phase_bucket.rename(columns={'humidity': ' humidity_phase_bucket_'}, inplace=True)
humidity_phase_bucket.columns = list(map(''.join, humidity_phase_bucket.columns.values))

In [170]:
temp_phase.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,temp_phase_amin,temp_phase_amax,temp_phase_mean,temp_phase_std
0,101,2013.0,1.0,incubating,8.59,24.610001,16.033649,2.331077
1,101,2013.0,1.0,unoccupied,10.09,37.599998,20.032824,5.146225
2,101,2014.0,1.0,rearing,8.59,43.580002,21.536072,5.532999


In [171]:
humidity_phase.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,humidity_phase_amin,humidity_phase_amax,humidity_phase_mean,humidity_phase_std
0,101,2013.0,1.0,incubating,57.990002,103.489998,92.353508,6.944231
1,101,2013.0,1.0,unoccupied,28.790001,107.029999,80.68351,18.147106
2,101,2014.0,1.0,rearing,26.030001,109.050003,75.252083,16.728451


In [175]:
temp_phase_bucket.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,temp_0-5,temp_10-15,temp_15-20,temp_20-25,temp_25-30,temp_30-35,temp_35-40,temp_40-45,temp_45-50,temp_5-10,temp_50-55,temp_55-60
0,101,2013.0,1.0,incubating,,317.75,675.25,53.25,,,,,,7.0,,
1,101,2013.0,1.0,unoccupied,,236.25,616.0,429.5,169.0,75.25,10.25,,,,,
2,101,2014.0,1.0,rearing,,180.75,639.25,719.0,320.25,123.25,43.25,2.25,,11.25,,


In [177]:
humidity_phase_bucket.head(3)

Unnamed: 0,nest_id,year,clutch,breeding_phase,RH%_100+,RH%_20-30,RH%_30-50,RH%_50-60,RH%_60-80,RH%_80-100,RH%_<20
0,101,2013.0,1.0,incubating,74.5,,,3.0,65.25,910.5,
1,101,2013.0,1.0,unoccupied,293.75,0.25,91.25,143.25,498.25,509.5,
2,101,2014.0,1.0,rearing,134.25,5.75,160.75,202.0,890.5,646.0,


In [214]:
df_nest_and_breeding.columns

Index(['type', 'nest_id', 'BoxSeasYear', 'date', 'year', 'season',
       'BoxCoverTotal', 'BoxCoverDead', 'BoxWood', 'BoxWoodDead', 'BoxVeg',
       'BoxVegDead', 'QuadCoverTotal', 'QuadCoverDead', 'QuadWood',
       'QuadWoodDead', 'QuadVeg', 'QuadVegDead', 'comments', 'nest_type',
       'distance_to_boardwalk_m', 'distance_to_vegetation_m',
       'distance_to_landfall', 'entrance_bearing', 'box_height_mm',
       'box_length_mm', 'box_width_mm', 'box_wall_width_mm', 'box_lid_depth',
       'internal_height_mm', 'internal_width_mm', 'internal_length_mm',
       'entrance_height', 'entrance_width', 'entrance_length', 'vents',
       'box_vol_L', 'box_area_cm2', 'box_has_tunnel', 'shape', 'elevation',
       'easting', 'northing', 'aspect', 'slope', 'duration_of_insolation',
       'comment', 'clutch', 'egg_count', 'chick_count', 'fledge_count',
       'lay_date', 'age_at_fledging', 'mass_at_fletching_chick1',
       'mass_at_fletching_chick1', 'IDChick1', 'IDChick2', 'clutch_count']

## Create the final aggregate table

Start by getting the required `nest_and_breeding` fields

### Prepare the final master table: df_microclimate_effects
This table becomes the master dataset for microclimate influences on breeding outcomes.
1. Use df_nest_and_breeding as the base (aggregated breeding stats with static and seasonal nest data)
2. Join to this the following
 1. temp_annual (annual temperature stats per nest)
 2. humidity_annual (annual humidity stats per nest)
 3. temp_phase (temp stats per nest, year, clutch and phase)
 4. humidity_phase (humidity stats per nest, year, clutch and phase)
 5. temp_phase_bucket (hours at each bucketed temp range per nest, year, clutch and phase)
 6. humidity_phase_bucket (hours at each bucketed humidity range per nest, year, clutch and phase)

In [217]:

df_microclimate_effects_annual = df_nest_and_breeding[['type', 'nest_id', 'year', 'clutch', 'date', 'season',
                                                            'BoxCoverTotal', 'BoxCoverDead', 'BoxWood', 'BoxWoodDead', 'BoxVeg', 'BoxVegDead', 
                                                            'QuadCoverTotal', 'QuadCoverDead', 'QuadWood', 'QuadWoodDead', 'QuadVeg', 'QuadVegDead', 'comments',  
                                                            'distance_to_boardwalk_m', 'distance_to_vegetation_m',
                                                            'distance_to_landfall', 'entrance_bearing', 'box_height_mm',
                                                            'box_length_mm', 'box_width_mm', 'box_wall_width_mm', 'box_lid_depth',
                                                            'internal_height_mm', 'internal_width_mm', 'internal_length_mm',
                                                            'entrance_height', 'entrance_width', 'entrance_length', 'vents',
                                                            'box_vol_L', 'box_area_cm2', 'box_has_tunnel', 'shape', 'elevation',
                                                            'easting', 'northing', 'aspect', 'slope', 'duration_of_insolation',
                                                            'comment', 'egg_count', 'chick_count',
                                                            'fledge_count', 'lay_date', 'age_at_fledging',
                                                            'mass_at_fletching_chick1', 'mass_at_fletching_chick1', 'IDChick1',
                                                            'IDChick2', 'clutch_count']].copy()
df_microclimate_effects_annual.rename(columns={'comments':'comments_veg', 'comment':'comments_geo'}, inplace=True)

### Combine `nest_and_breeding` with the sensor aggregate tables for the *annual stats*

In [220]:
temp_annual.head(3)

Unnamed: 0,nest_id,year,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,101,2013.0,8.59,37.599998,18.406204,4.666923
1,101,2014.0,8.59,43.580002,21.536072,5.532999
2,10A,2014.0,7.6,42.110001,20.240257,5.32943


In [218]:
# join the sensor stats (annual and per-phase) onto the nest_and_breeding data
df_microclimate_effects_annual = pd.merge(left=df_microclimate_effects_annual,
                                   right=temp_annual,
                                   how='left',
                                   on=['nest_id', 'year'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_temp_annual']
                                  )

In [192]:
df_microclimate_effects_annual = pd.merge(left=df_microclimate_effects_annual,
                                   right=humidity_annual,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_humidity_annual']
                                  )

In [219]:
df_microclimate_effects_annual.head(20)

Unnamed: 0,type,nest_id,year,clutch,date,season,BoxCoverTotal,BoxCoverDead,BoxWood,BoxWoodDead,...,mass_at_fletching_chick1,mass_at_fletching_chick1.1,mass_at_fletching_chick1.2,IDChick1,IDChick2,clutch_count,temp_annual_amin,temp_annual_amax,temp_annual_mean,temp_annual_std
0,BOX,100,2013,1.0,2013-11-20,SPRING,4.0,,,,...,,,,,,1.0,,,,
1,BOX,100,2014,1.0,2014-10-07,SPRING,5.0,1.0,,,...,50.0,60.0,50.0,,,1.0,,,,
2,BOX,100,2014,1.0,2014-11-30,SPRING,5.0,1.0,,,...,50.0,60.0,50.0,,,1.0,,,,
3,BOX,100,2015,,2015-02-25,SUMMER,,,,,...,,,,,,,,,,
4,BOX,100,2015,,2015-05-13,AUTUMN,2.0,2.0,,,...,,,,,,,,,,
5,BOX,100,2015,,2015-08-25,WINTER,4.0,1.0,,,...,,,,,,,,,,
6,BOX,100,2015,,2015-11-23,SPRING,5.0,3.0,,,...,,,,,,,,,,
7,BOX,100,2016,,2016-02-16,SUMMER,5.0,3.0,,,...,,,,,,,,,,
8,BOX,100,2016,,2016-05-26,AUTUMN,5.0,2.0,,,...,,,,,,,,,,
9,BOX,100,2016,,2016-08-31,WINTER,4.0,2.0,,,...,,,,,,,,,,


### Combine `nest_and_breeding` with the sensor aggregate tables for the *breeding_phase stats*
Note that this will duplicate the annual stats rows for each breeding phase.

In [197]:
df_microclimate_effects_phase = pd.merge(left=df_microclimate_effects_annual,
                                   right=temp_phase,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_temp_phase']
                                  )

In [195]:
df_microclimate_effects_phase = pd.merge(left=df_microclimate_effects_phase,
                                   right=humidity_phase,
                                   how='left',
                                   on=['nest_id', 'year', 'clutch', 'breeding_phase'], # both have same keys
                                   left_on=None, # same key names: don't need to specify R and L
                                   right_on=None, # same key names: don't need to specify R and L
                                   left_index=False, # dont' use left df index as key
                                   right_index=False, # dont' use right df index as key
                                   sort=True, # for efficiency do/not sort the df first
                                   suffixes=['', '_humidity_phase']
                                  )

In [199]:
df_microclimate_effects_phase.query('nest_id in ["101"]').head(10)

Unnamed: 0,type,nest_id,year,clutch,date,season,BoxCoverTotal,BoxCoverDead,BoxWood,BoxWoodDead,...,temp_annual_std,humidity_annual_amin,humidity_annual_amax,humidity_annual_mean,humidity_annual_std,breeding_phase,temp_phase_amin,temp_phase_amax,temp_phase_mean,temp_phase_std
20,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
21,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
22,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
23,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
24,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
25,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
26,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,incubating,8.59,24.610001,16.033649,2.331077
27,BOX,101,2013,1.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,unoccupied,10.09,37.599998,20.032824,5.146225
28,BOX,101,2013,2.0,2013-11-20,SPRING,4,,,,...,4.666923,28.790001,107.029999,85.430153,15.742786,,,,,
29,BOX,101,2014,1.0,2014-10-07,SPRING,1,1.0,,,...,5.532999,26.030001,109.050003,75.252083,16.728451,rearing,8.59,43.580002,21.536072,5.532999


# -------------------------

In [None]:
# tempset = df_sensor_phase.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')

In [None]:
# TESTING ONLY

# temp = df_sensor_phase.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')
# temp = tempset.query('nest_id in ["E10", "E13", "E14", "E4", "T5", "W2", "W6"]')
# temp['date'] = temp['datetime'].apply(pd.datetools.normalize_date)
# temp = temp[['nest_id', 'date', 'breeding_year','clutch_number', 
#        'egg_lay_date', 'courting_date', 'hatch_date', 'dead_or_fledge_date',
#        'breeding_phase']]
# temp = temp.drop_duplicates()

# print(len(temp))
# temp.to_csv('sensor_phase_test.csv')
#E13 2014 egg never hatched:        works
#E10 2014 normal single fledge:     works
#W6 2014 chick ded:                 works

Hold off making the dummy columns until we need to do the stats. This keeps the file size down and lets us save the csv with buckets rather than dummies.

In [None]:
print(str(time.ctime()), 'Creating temp and humidity bucket dummy columns.', end='', flush=True)
df_joined = pd.get_dummies(data=df_joined, columns=['temp_bucket', 'humidity_bucket'])
print(' Done.', flush=True)

In [None]:
print(str(time.ctime()), 'Aggregating data by nest and year.', flush=True)

def percent_of_time(row):
    return 
# group the data by nest_id and breeding year to get the temp and humidity stats per year
temp_aggregations = {
    'temp_c': {
        'temp_count': 'count',
        'temp_avg': 'mean',
        'temp_min': 'min',
        'temp_max': 'max',
        'temp_std_dev': 'std'        
    },
    'humidity': {
        'humidity_count': 'count',
        'humidity_avg': 'mean',
        'humidity_min': 'min',
        'humidity_max': 'max',
        'humidity_std_dev': 'std'  
    },
    'temp_<0': {'bucket_total': 'sum'},
    'temp_0-5': {'bucket_total': 'sum'},
    'temp_5-10': {'bucket_total': 'sum'},
    'temp_10-15': {'bucket_total': 'sum'},
    'temp_15-20': {'bucket_total': 'sum'},
    'temp_20-25': {'bucket_total': 'sum'},
    'temp_25-30': {'bucket_total': 'sum'},
    'temp_30-35': {'bucket_total': 'sum'},
    'temp_35-40': {'bucket_total': 'sum'},
    'temp_40-45': {'bucket_total': 'sum'},
    'temp_45-50': {'bucket_total': 'sum'},
    'temp_50-55': {'bucket_total': 'sum'},
    'temp_55-60': {'bucket_total': 'sum'},
    'temp_60+': {'bucket_total': 'sum'}    
}
df_joined_gb = df_joined.groupby(['nest_id', 'breeding_year']).agg(temp_aggregations)
print(str(time.ctime()), 'Done.', flush=True)

### to add: 
* return the nest_ids and number and type of missing records

In [None]:
print(str(time.ctime()), 'Checking for missing data.', flush=True)
# check for missing temp or humidity readinga
def missing_data(row):
    if row['temp_c']['temp_count'] > row['humidity']['humidity_count']:
        return 'missing_humidity_data'
    elif row['temp_c']['temp_count'] < row['humidity']['humidity_count']:
        return 'missing_temp_data'
    else:
        return None
df_joined_gb['missing_data'] = df_joined_gb.apply(missing_data, axis=1)

print(str(time.ctime()), 'Done.', flush=True)

In [None]:
df_joined.head(10)

In [None]:
df_joined_gb.head(5)

In [None]:
# df_joined_gb['temp_25-30']['%time'] = df_joined_gb['temp_25-30']['bucket_total'] / df_joined_gb['temp_c']['temp_count']
df_joined_gb['temp_25-30_total'] = df_joined_gb['temp_25-30']['bucket_total']
df_joined_gb['temp_25-30_hours'] = df_joined_gb['temp_25-30_total'] / 4
df_joined_gb['temp_25-30_%'] = df_joined_gb['temp_25-30_total'] / df_joined_gb['temp_c']['temp_count']



In [None]:
df_joined_gb.head(5)

In [None]:
df_joined_gb.head(5)

The below sends the data to the PostGres DB.

Currently considering not using the DB at all. While the data maniopulation within the DB via SQL is far easier, keeping the whole project (data load, manipulate, graph) to a single platform and language is a priority.

In [None]:
# #sending temperature dataframe to the postgres DB
# print("Transferring temperature dataframe to DB..")
# df_temp.to_sql(con=engine, name='penguins_temperature', if_exists='replace')
# print("Uploaded successfully")

# #sending humidity dataframe to the postgres DB
# print("Transferring humidity dataframe to DB..")
# df_humd.to_sql(con=engine, name='penguins_humidity', if_exists='replace')
# print("Uploaded successfully")

# #sending nests dataframe to the postgres DB
# print("Transferring nests dataframe to DB..")
# nests_raw.to_sql(con=engine, name='penguins_nests', if_exists='replace')
# print("Uploaded successfully")