# Data Preparation
The following steps need to be done:
* Load the relevant data sets from file
* Join them into a single data set
* Add additional computed features to the data
* Write the prepared data to file

### Set up the environment
We need a certain set of common libraries for the tasks to be performed. These are imported below. If an import statement errors, you will need to install the library in your environment using the command line command `pip install <library>`.

In [13]:
print('Setting up environment and variables.', flush=True)
import pandas as pd
import os
import numpy as np
from datetime import date
import time
import yaml
from sqlalchemy import create_engine

Setting up environment and variables.


### Set up the variables
Change the values of the variables below to suit the files (names and directory location) to be loaded.

In [14]:
## File locations used throughout the notebook.
## The literals are written in unix format (the docker containers run on
## debian); os.path.normpath rewrites them for the platform running the code.
(config_file,
 temperature_file,
 humidity_file,
 joined_data_file) = map(os.path.normpath, (
    '../config.yml',
    '../0_data/TempData_2_10_2016.txt',
    '../0_data/HumidData_2_10_2016.txt',
    '../0_data/TempHumdCombined.csv',
))

In [15]:
# initialise the config.yml file
# try:
#     with open(config_file, 'r') as ymlfile:
#         config = yaml.load(ymlfile)
# except IOError:
#     print('Config file can\'t be found.')

In [16]:
# set the database connection parameters based on the config.yml file
# if ymlfile is not None:
#     host = config['PostgreSQL']['host']
#     port = config['PostgreSQL']['port']
#     dbname = config['PostgreSQL']['dbname']
#     user  = config['PostgreSQL']['user']
#     password = config['PostgreSQL']['password']
    
#     # establish connection to the postgres database using the generated connection string
#     engine = create_engine(r"postgresql://"+user+":"+password+"@"+host+"/"+dbname)

## Load the temperature data
Read the temperature data file into memory and report on success/failure.

In [17]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the temp data load.', flush=True)
else:
    # Load the raw temperature readings into df_temp. The load is skipped when
    # the combined file already exists, because that file is read back instead.
    column_names = ['recnum', 'datetime', 'temp_c', 'nest_id']
    data_types = {'recnum': np.int32,
                  'datetime': str,
                  'temp_c': np.float32,
                  'nest_id': str}
    file_size = os.path.getsize(temperature_file)
    print('\n{0} Temperature file is {1:.1f} MB.'.format(str(time.ctime()),
                                                         (file_size/1000000)), flush=True)

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True)

    # read_csv never returns None: it either returns a DataFrame or raises.
    # The failure message is therefore printed from the except branch, and the
    # exception is re-raised so the cell still stops with the real error.
    try:
        df_temp = pd.read_csv(temperature_file,
                              names=column_names,
                              usecols=[0,1,2,3],
                              dtype=data_types,
                              # nrows=10000,               # for testing only
                              parse_dates=['datetime'],
                              dayfirst=True,
                              encoding='utf-8',
                              # NOTE: error_bad_lines / warn_bad_lines were replaced by
                              # on_bad_lines='warn' in pandas 1.3 and removed in 2.0;
                              # switch when the environment's pandas is upgraded.
                              error_bad_lines=False,
                              warn_bad_lines=True
                              )
    except Exception:
        print(str(time.ctime()), '### FAILED! ###')
        raise

    print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_temp)))

    # make sure the nest IDs are all caps; the .str accessor leaves a missing
    # nest_id as NaN instead of raising AttributeError on a float
    df_temp['nest_id'] = df_temp['nest_id'].str.upper()


Tue Oct 11 11:58:03 2016 Temperature is 88.2 MB.
Tue Oct 11 11:58:03 2016 Loading into memory. Please be patient. 
Tue Oct 11 12:12:04 2016 Success: loaded 2,169,903 records.


In [18]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the humidity data load.', flush=True)
else:
    # Load the raw humidity readings into df_humd. The load is skipped when
    # the combined file already exists, because that file is read back instead.
    column_names = ['recnum', 'datetime', 'humidity', 'nest_id']
    data_types = {'recnum': np.int32,
                  'datetime': str,
                  'humidity': np.float32,
                  'nest_id': str}
    file_size = os.path.getsize(humidity_file)
    print('\n{0} Humidity file is {1:.1f} MB.'.format(str(time.ctime()),
                                                      (file_size/1000000)), flush=True)

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True, end='')

    # read_csv never returns None: it either returns a DataFrame or raises.
    # The failure message is therefore printed from the except branch, and the
    # exception is re-raised so the cell still stops with the real error.
    try:
        df_humd = pd.read_csv(humidity_file,
                              names=column_names,
                              usecols=[0,1,2,3],
                              dtype=data_types,
                              # nrows=10000,               # for testing only
                              parse_dates=['datetime'],
                              dayfirst=True,
                              encoding='utf-8',
                              # NOTE: error_bad_lines / warn_bad_lines were replaced by
                              # on_bad_lines='warn' in pandas 1.3 and removed in 2.0;
                              # switch when the environment's pandas is upgraded.
                              error_bad_lines=False,
                              warn_bad_lines=True
                              )
    except Exception:
        print(str(time.ctime()), '### FAILED! ###')
        raise

    print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_humd)))

    # make sure the nest IDs are all caps; the .str accessor leaves a missing
    # nest_id as NaN instead of raising AttributeError on a float
    df_humd['nest_id'] = df_humd['nest_id'].str.upper()


Tue Oct 11 12:12:06 2016 Humidity file is 93.2 MB.
Tue Oct 11 12:12:06 2016 Loading into memory. Please be patient. 
Tue Oct 11 12:26:27 2016 Success: loaded 2,173,732 records.


In [19]:
if not os.path.isfile(joined_data_file):
    print('\n{0} Joining the temperature and humidity data sets.'.format(str(time.ctime())), flush=True)

    # Both frames share the same key column names, so a single `on` list is enough.
    join_keys = ['nest_id', 'datetime']

    # Outer join: a reading present in only one of the two inputs is kept.
    # The suffixes only affect the overlapping non-key column (recnum), which
    # is dropped again by the column selection at the end.
    df_joined = df_temp.merge(df_humd,
                              how='outer',
                              on=join_keys,
                              sort=True,  # order the result by the join keys
                              suffixes=['_temp', '_humd'],
                              )[join_keys + ['temp_c', 'humidity']]

    temp_records = len(df_temp)
    humd_records = len(df_humd)
    joined_records = len(df_joined)

    print('{0} Join complete. Here are the stats:'.format(str(time.ctime())))
    print('Records in temperature data: {0:>20,}'.format(temp_records))
    print('Records in humidity data:    {0:>20,}'.format(humd_records))
    print('                              -------------------')
    print('Records in joined data:      {0:>20,}'.format(joined_records))
    print('\nOverview:')
    # One group per distinct nest_id, so the group count is the nest count.
    gb = df_joined.groupby(['nest_id'])
    print('Number of nest_ids:          {0:>20,}'.format(len(gb)))
else:
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the temp and humidity data join.', flush=True)


Tue Oct 11 12:26:29 2016 Joining the temperature and humidity data sets.
Tue Oct 11 12:26:36 2016 Join complete. Here are the stats:
Records in temperature data:            2,169,903
Records in humidity data:               2,173,732
                              -------------------
Records in joined data:                 2,173,738

Overview:
Number of nest_ids:                           138


In [22]:
if not os.path.isfile(joined_data_file):
    print('\n{0} Writing the joined dataset to csv.'.format(str(time.ctime())), flush=True)
    # Comma-separated, missing values written as empty fields, floats rounded
    # to three decimals, no index column, any existing file overwritten.
    csv_options = {
        'sep': ',',
        'na_rep': '',
        'float_format': '%.3f',
        'index': False,
        'mode': 'w',
        'encoding': 'utf-8',
    }
    df_joined.to_csv(joined_data_file, **csv_options)
    print('{0} File written: {1}'.format(str(time.ctime()), str(joined_data_file)), flush=True)
else:
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping writing the combined data to file.', flush=True)


Tue Oct 11 12:35:12 2016 Writing the joined dataset to csv.
Tue Oct 11 12:35:57 2016 File written: ..\0_data\TempHumdCombined.csv


## Load the joined temp and humidity csv

In [23]:
# df_joined is only bound if the join cell ran in this session. When the
# combined file was already on disk that cell is skipped, so the name would be
# undefined here and testing it directly would raise NameError.
try:
    df_joined
except NameError:
    df_joined = None

if df_joined is None and os.path.isfile(joined_data_file):
    data_types = {'nest_id': str,
    #               'recnum': np.int32,
                  'datetime': str,
                  'temp_c': np.float32,
                  'humidity': np.float32,
                  # NOTE(review): the combined file written by this notebook has no
                  # breeding_year column; presumably kept for older files - confirm.
                  'breeding_year': np.int32}
    file_size = os.path.getsize(joined_data_file)
    print('\n{0} Combined temp and humidity file is {1:.1f} MB.'.format(str(time.ctime()), (file_size/1000000)))

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True, end='')

    # read_csv never returns None: it either returns a DataFrame or raises.
    # The failure message is therefore printed from the except branch, and the
    # exception is re-raised so the cell still stops with the real error.
    try:
        df_joined = pd.read_csv(joined_data_file,
                                # names=column_names,
                                # usecols=[0,1,2,3],
                                dtype=data_types,
                                # nrows=2048,               # for testing only
                                # The combined file was written by pandas with dates in
                                # year-month-day order, so dayfirst must not be requested
                                # here (it is only needed for the raw sensor files).
                                parse_dates=['datetime'],
                                encoding='utf-8',
                                # NOTE: error_bad_lines / warn_bad_lines were replaced by
                                # on_bad_lines='warn' in pandas 1.3 and removed in 2.0;
                                # switch when the environment's pandas is upgraded.
                                error_bad_lines=False,
                                warn_bad_lines=True
                                )
    except Exception:
        print(str(time.ctime()), '### FAILED! ###')
        raise

    print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_joined)))

In [24]:
# (exclusive upper bound in degrees C, label) for each band below 60C,
# listed in ascending order so the first match is the right band.
_TEMP_BUCKET_BOUNDS = (
    (0, 'temp_<0'),
    (5, 'temp_0-5'),
    (10, 'temp_5-10'),
    (15, 'temp_10-15'),
    (20, 'temp_15-20'),
    (25, 'temp_20-25'),
    (30, 'temp_25-30'),
    (35, 'temp_30-35'),
    (40, 'temp_35-40'),
    (45, 'temp_40-45'),
    (50, 'temp_45-50'),
    (55, 'temp_50-55'),
    (60, 'temp_55-60'),
)


def temp_bucket(temp_c):
    """Return the label of the 5-degree temperature band containing temp_c.

    Bands are closed at the lower bound and open at the upper bound, e.g.
    5 <= temp_c < 10 gives 'temp_5-10'. Values below 0 give 'temp_<0' and
    values of 60 or more give 'temp_60+'. A NaN reading matches no band and
    gives None.
    """
    for upper_bound, label in _TEMP_BUCKET_BOUNDS:
        if temp_c < upper_bound:
            return label
    if temp_c >= 60:
        return 'temp_60+'
    # Only reached when every comparison was False, i.e. temp_c is NaN.
    return None

# (exclusive upper bound in % relative humidity, label) for each band below
# 100%, listed in ascending order so the first match is the right band.
_HUMIDITY_BUCKET_BOUNDS = (
    (20, 'RH%_<20'),      # lung & eye irritation in humans
    (30, 'RH%_20-30'),    # lung irritation in humans
    (50, 'RH%_30-50'),    # low but not dangerous to humans
    (60, 'RH%_50-60'),    # human ideal comfort zone
    (80, 'RH%_60-80'),    # humid
    (100, 'RH%_80-100'),  # v humid
)


def humidity_bucket(humidity):
    """Return the label of the relative-humidity band containing humidity.

    Bands are closed at the lower bound and open at the upper bound, e.g.
    50 <= humidity < 60 gives 'RH%_50-60'. Values below 20 give 'RH%_<20'
    and values of 100 or more give 'RH%_100+'. A NaN reading matches no
    band and gives None.
    """
    for upper_bound, label in _HUMIDITY_BUCKET_BOUNDS:
        if humidity < upper_bound:
            return label
    if humidity >= 100:  # dripping
        return 'RH%_100+'
    # Only reached when every comparison was False, i.e. humidity is NaN.
    return None

# def breeding_phase(datetime):
#     '''
#     Given a datetime, returns the breeding phase occurring at that time:
#     1 Jan - 31 Mar: Moulting
#     1 Apr - 31 May: Nest Building
#     1 Jun - 
#     '''

## Calculations per-sensor reading
The following calculations are added per sensor reading:
* The `breeding_year`: same as the calendar year
* `temp_bucket` is a category for each 5C temperature range: <0, 0-5, .., 60+
* `humidity_bucket`: is a category for roughly 20% humidity ranges, based on human comfort zones

### To be added:
* `activity_phase`: Is the current phase of breeding based on per-nest observations. Phases are generally:
 * 1 Jan - 31 Mar: Moulting
 * 1 Apr - 31 May: Nest Building
 * 1 Jun - 30 Jun: Laying
 * 1 Jul - 7 Aug: Incubating
 * 8 Aug - 30 Sep: Rearing
 * 1 Oct - 30 Oct: Fledging
 * There can be two lays per season. The second lay is not considered in the above average timeframes but is recorded in the per-nest calculation

In [25]:
# add the breeding_year (the calendar year of the reading)
print(str(time.ctime()), 'Calculating breeding year and activity periods.', flush=True)
# .dt.year extracts the year for the whole column at once instead of calling
# a Python lambda once per row.
df_joined['breeding_year'] = df_joined['datetime'].dt.year
# Static breeding phases were considered, but the phase will be derived per
# nest instead. That needs the per-nest observations to be combined in first,
# which is not done yet.
# df_joined['breeding_phase'] = df_joined['datetime'].apply(breeding_phase)

# Add flags for various temperature ranges.
# These are summed to give the amount of time in the temp band
print(str(time.ctime()), 'Calculating temperature buckets.', flush=True)
df_joined['temp_bucket'] = df_joined['temp_c'].apply(temp_bucket)

# Add flags for various humidity ranges.
# These are summed to give the amount of time in the humidity band
print(str(time.ctime()), 'Calculating humidity buckets.', flush=True)
df_joined['humidity_bucket'] = df_joined['humidity'].apply(humidity_bucket)

# Replace each bucket column with one 0/1 indicator column per bucket label.
# get_dummies names them '<column>_<label>', e.g. 'temp_bucket_temp_15-20'.
print(str(time.ctime()), 'Creating temp and humidity bucket dummy columns', flush=True)
df_joined = pd.get_dummies(data=df_joined, columns=['temp_bucket', 'humidity_bucket'])

print(str(time.ctime()), 'Done', flush=True)

Tue Oct 11 12:39:19 2016 Calculating breeding year and activity periods.
Tue Oct 11 12:41:01 2016 Calculating temperature buckets.
Tue Oct 11 12:43:25 2016 Calculating humidity buckets.
Tue Oct 11 12:46:02 2016 Creating temp and humidity bucket dummy columns
Tue Oct 11 12:46:07 2016 Done


In [26]:
df_joined.head(10)

Unnamed: 0,nest_id,datetime,temp_c,humidity,breeding_year,temp_bucket_temp_0-5,temp_bucket_temp_10-15,temp_bucket_temp_15-20,temp_bucket_temp_20-25,temp_bucket_temp_25-30,...,temp_bucket_temp_50-55,temp_bucket_temp_55-60,temp_bucket_temp_60+,humidity_bucket_RH%_100+,humidity_bucket_RH%_20-30,humidity_bucket_RH%_30-50,humidity_bucket_RH%_50-60,humidity_bucket_RH%_60-80,humidity_bucket_RH%_80-100,humidity_bucket_RH%_<20
0,101,2013-07-11 21:49:00,15.1,91.949997,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,101,2013-07-11 22:04:00,15.1,91.949997,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,101,2013-07-11 22:19:00,15.61,91.519997,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,101,2013-07-11 22:34:00,15.61,91.519997,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,101,2013-07-11 22:49:00,15.61,91.519997,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
5,101,2013-07-11 23:04:00,15.61,91.080002,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
6,101,2013-07-11 23:19:00,15.61,91.080002,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
7,101,2013-07-11 23:34:00,15.61,91.080002,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
8,101,2013-07-11 23:49:00,15.61,91.080002,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
9,101,2013-07-12 00:04:00,15.1,91.080002,2013,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


## Aggregation
The nest sensor readings are aggregated to summarise the nest conditions by `nest`, `breeding_year` and `activity_phase`.

### To be added
* Bring in `activity_phase` by joining in the breeding data prior to aggregation
* Aggregate separately and join back on to avoid ending up with a multi-dimensional table that is difficult to reference

In [None]:
print(str(time.ctime()), 'Aggregating data by nest and year.', flush=True)

def percent_of_time(row):
    """Placeholder that is not implemented yet and not called in this cell; returns None."""
    return

# Group the data by nest_id and breeding year to get the temp and humidity
# stats per year. The result has two-level columns, (source, statistic), e.g.
# ('temp_c', 'temp_avg') or ('temp_25-30', 'bucket_total').

# pandas aggregation name -> suffix used in the output statistic name
stat_suffixes = [('count', 'count'),
                 ('mean', 'avg'),
                 ('min', 'min'),
                 ('max', 'max'),
                 ('std', 'std_dev')]
stat_functions = [function for function, _ in stat_suffixes]
suffix_for = dict(stat_suffixes)

temp_bucket_labels = ['temp_<0', 'temp_0-5', 'temp_5-10', 'temp_10-15',
                      'temp_15-20', 'temp_20-25', 'temp_25-30', 'temp_30-35',
                      'temp_35-40', 'temp_40-45', 'temp_45-50', 'temp_50-55',
                      'temp_55-60', 'temp_60+']
# get_dummies prefixed every indicator column with the name of the column it
# was derived from, so the 'temp_25-30' band lives in 'temp_bucket_temp_25-30'.
dummy_prefix = 'temp_bucket_'

temp_aggregations = {'temp_c': stat_functions,
                     'humidity': stat_functions}
# A band with no readings at all has no indicator column, so only request the
# ones that exist; asking for a missing column would raise KeyError.
for label in temp_bucket_labels:
    if dummy_prefix + label in df_joined.columns:
        temp_aggregations[dummy_prefix + label] = ['sum']

df_joined_gb = df_joined.groupby(['nest_id', 'breeding_year']).agg(temp_aggregations)

def _output_name(column, function):
    """Map an aggregated (column, function) pair to its (source, statistic) name."""
    if column.startswith(dummy_prefix):
        return (column[len(dummy_prefix):], 'bucket_total')
    stat_prefix = 'temp' if column == 'temp_c' else column
    return (column, '{0}_{1}'.format(stat_prefix, suffix_for[function]))

# Rename after aggregating instead of using a nested renaming dict inside
# agg(), which current pandas versions reject.
df_joined_gb.columns = pd.MultiIndex.from_tuples(
    [_output_name(column, function) for column, function in df_joined_gb.columns])

# Bands that never occurred still get a column (total 0) so that every band
# can be referenced in the result.
for label in temp_bucket_labels:
    if (label, 'bucket_total') not in df_joined_gb.columns:
        df_joined_gb[(label, 'bucket_total')] = 0

print(str(time.ctime()), 'Done.', flush=True)

In [None]:
print(str(time.ctime()), 'Checking for missing data.', flush=True)
# check for missing temp or humidity readings
def missing_data(row):
    """Say which measurement, if any, has fewer readings in an aggregated row.

    Compares row['temp_c']['temp_count'] with
    row['humidity']['humidity_count'] and returns 'missing_humidity_data'
    when there are more temperature readings, 'missing_temp_data' when there
    are more humidity readings, and None when the counts are equal.
    """
    temp_readings = row['temp_c']['temp_count']
    humidity_readings = row['humidity']['humidity_count']
    if temp_readings > humidity_readings:
        return 'missing_humidity_data'
    if temp_readings < humidity_readings:
        return 'missing_temp_data'
    return None
df_joined_gb['missing_data'] = df_joined_gb.apply(missing_data, axis=1)

print(str(time.ctime()), 'Done.', flush=True)

### ----------------------------------------------------------------
# Dev and Test
### ----------------------------------------------------------------

In [None]:
df_joined.head(10)

In [None]:
df_joined_gb.head(5)

In [None]:
# Scratch calculation: derive summary columns for the 25-30C band only.
# df_joined_gb['temp_25-30']['%time'] = df_joined_gb['temp_25-30']['bucket_total'] / df_joined_gb['temp_c']['temp_count']
# Summed indicator flags for the band, copied to a flat column name.
df_joined_gb['temp_25-30_total'] = df_joined_gb['temp_25-30']['bucket_total']
# NOTE(review): dividing by 4 presumably converts a reading count to hours,
# assuming one reading every 15 minutes as in the sample rows - confirm.
df_joined_gb['temp_25-30_hours'] = df_joined_gb['temp_25-30_total'] / 4
# Share of the group's temperature readings that fell in the band. This is a
# plain ratio; it is not multiplied by 100 despite the '%' in the name.
df_joined_gb['temp_25-30_%'] = df_joined_gb['temp_25-30_total'] / df_joined_gb['temp_c']['temp_count']



In [None]:
df_joined_gb.head(5)

In [None]:
df_joined_gb.head(5)

The cell below sends the data to the PostgreSQL database.

Currently considering not using the database at all. While data manipulation within the database via SQL is far easier, keeping the whole project (data load, manipulation, graphing) on a single platform and in a single language is a priority.

In [None]:
# Upload the data frames to the database, one table per frame.
# if_exists='replace' makes each call replace a table that already exists.
# NOTE(review): `engine` is only created in commented-out code earlier in
# this notebook, so it has to be defined before this cell can run - confirm.
# NOTE(review): df_temp and df_humd are only created when the combined CSV
# did not already exist when the load cells ran.

# send the temperature dataframe to the postgres DB
print("Transferring temperature dataframe to DB..")
df_temp.to_sql(con=engine, name='penguins_temperature', if_exists='replace')
print("Uploaded successfully")

# send the humidity dataframe to the postgres DB
print("Transferring humidity dataframe to DB..")
df_humd.to_sql(con=engine, name='penguins_humidity', if_exists='replace')
print("Uploaded successfully")

# send the nests dataframe to the postgres DB
# NOTE(review): `nests_raw` is not defined in the visible part of this
# notebook - confirm where it is loaded.
print("Transferring nests dataframe to DB..")
nests_raw.to_sql(con=engine, name='penguins_nests', if_exists='replace')
print("Uploaded successfully")