# Data Preparation
The following need to be done:
* Load the relevant data sets from file
* Join them into a single data set
* Add additional computed features to the data
* Write the prepared data to file

### Set up the environment
We need a certain set of common libraries for the tasks to be performed. These are imported below. If an import statement errors, you will need to install the library in your environment using the command line command `pip install <library>`.

In [60]:
print(str(time.ctime()), 'Setting up enviuronment and variables.')
import pandas as pd
import os
import numpy as np
from datetime import date


Mon Oct 10 13:43:54 2016 Setting up enviuronment and variables.


### Set up the variables
Change the values of the variables below to suit the files (names and directory location) to be loaded.

In [2]:
## Currently in unix format as docker containers run on debian
config_file = os.path.normpath('./config.yml')
temperature_file = os.path.normpath('./data/TempData_2_10_2016.txt')
humidity_file = os.path.normpath('./data/HumidData_2_10_2016.txt')

In [None]:
#initialise the config.yml file
with open(config_file, 'r') as ymlfile:
    config = yaml.load(ymlfile)

In [None]:
#set the database connection parameters based on the config.ini file
host = config['PostgreSQL']['host']
port = config['PostgreSQL']['port']
dbname = config['PostgreSQL']['dbname']
user  = config['PostgreSQL']['user']
password = config['PostgreSQL']['password']

In [None]:
#establish connection to the postgres database using the generated connection string
engine = create_engine(r"postgresql://"+user+":"+password+"@"+host+"/"+dbname)

## Load the temperature data
Read the temperature data file into memory and report on success/failure.

In [3]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the temp data load.')
else:
    column_names = ['recnum', 'datetime', 'temp_c', 'nest_id']
    data_types = {'recnum': np.int32, 
                  'datetime': str, 
                  'temp_c': np.float32, 
                  'nest_id': str}
    file_size = os.path.getsize(temperature_file)
    print('\n{0} Loading temperature file into memory.\nFile is {1:.1f} MB.'.format(str(time.ctime()), 
                                                                                     (file_size/1000000)))

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True)

    df_temp = pd.read_csv(temperature_file,
                         names=column_names,
                         usecols=[0,1,2,3],
                         dtype=data_types,
    #                      nrows=2048,               # for testing only
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          error_bad_lines=False,
                          warn_bad_lines=True
                         )

    if df_temp is not None:
        print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_temp)))
    else:
        print(str(time.ctime()), '### FAILED! ###')


Loading temperature file into memory.
File is 88.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,169,903 records.


In [4]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the humidity data load.')
else:
    column_names = ['recnum', 'datetime', 'humidity', 'nest_id']
    data_types = {'recnum': np.int32, 
                  'datetime': str, 
                  'humidity': np.float32, 
                  'nest_id': str}
    file_size = os.path.getsize(humidity_file)
    print('\n{0} Loading humidity file into memory.\nFile is {1:.1f} MB.'.format(str(time.ctime()), 
                                                                                (file_size/1000000)))

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True, end='')

    df_humd = pd.read_csv(humidity_file,
                         names=column_names,
                         usecols=[0,1,2,3],
                         dtype=data_types,
    #                      nrows=2048,               # for testing only
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          error_bad_lines=False,
                          warn_bad_lines=True
                         )

    if df_humd is not None:
        print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_humd)))
    else:
        print(str(time.ctime()), '### FAILED! ###')


Loading humidity file into memory.
File is 93.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,173,732 records.


In [5]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping the temp and humidity data join.')
else:
    print('\n{0} Joining the temperature and humidity data sets.'.format(str(time.ctime())))
    df_joined = pd.merge(left=df_temp,
                            right=df_humd,
                            how='outer',
                            on=['nest_id', 'datetime'], # both have same keys
                            left_on=None, # same key names: don't need to specify R and L
                            right_on=None, # same key names: don't need to specify R and L
                            left_index=False, # dont' use left df index as key
                            right_index=False, # dont' use right df index as key
                            sort=True, # for efficiency do/not sort the df first
                            suffixes=['_temp', '_humd']
                            )[['nest_id', 'datetime', 'temp_c', 'humidity']] # take only these cols

    print('{0} Join complete. Here are the stats:'.format(str(time.ctime())))
    print('Records in temperature data: {0:>20,}'.format(len(df_temp)))
    print('Records in humidity data:    {0:>20,}'.format(len(df_humd)))
    print('                              -------------------')
    print('Records in joined data:      {0:>20,}'.format(len(df_joined)))
    print('\nOverview:')
    gb = df_joined.groupby(['nest_id'])
    print('Number of nest_ids:          {0:>20,}'.format(len(gb)))

Records in temperature data:            2,169,903
Records in humidity data:               2,173,732
                              -------------------
Records in joined data:                 2,173,738

Overview:
Number of nest_ids:                           140


In [27]:
if os.path.isfile(joined_data_file):
    print(str(time.ctime()), 'Combined temp and humidity file found. Skipping writing the combined data to file.')
else:
    print('\n{0} Writing the joined dataset to csv.'.format(str(time.ctime())))
    df_joined.to_csv(path_or_buf=joined_data_file,
                 sep=',',
                 na_rep='',
                 float_format='%.3f',
                 index=False,
                 mode='w',
                 encoding='utf-8')
    print('{0} File written: {1}'.format(str(time.ctime())), str(joined_data_file))

## Load the joined temp and humidity csv

In [5]:
if df_joined is None and os.path.isfile(joined_data_file):
    data_types = {'nest_id': str,
    #               'recnum': np.int32, 
                  'datetime': str, 
                  'temp_c': np.float32,
                  'humidity': np.float32, 
                  'breeding_year': np.int32}
    file_size = os.path.getsize(joined_data_file)
    print('\n{0} Loading joined data file into memory.\nFile is {1:.1f} MB.'.format(str(time.ctime()), (file_size/1000000)))

    if file_size > 5000000: # over 5mb
        print(str(time.ctime()), 'Loading into memory. Please be patient. ', flush=True)
    else:
        print(str(time.ctime()), 'Loading into memory. ', flush=True, end='')

    df_joined = pd.read_csv(joined_data_file,
    #                      names=column_names,
    #                      usecols=[0,1,2,3],
                         dtype=data_types,
    #                      nrows=2048,               # for testing only
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          error_bad_lines=False,
                          warn_bad_lines=True
                         )

    if df_joined is not None:
        print(str(time.ctime()), 'Success: loaded {0:,} records.'.format(len(df_joined)))
    else:
        print(str(time.ctime()), '### FAILED! ###')


Loading joined data file into memory.
File is 95.3 MB.
Mon Oct 10 08:22:25 2016 Loading into memory. Please be patient. 
Mon Oct 10 08:22:36 2016 Success: loaded 2,173,738 records.


In [19]:
def temp_bucket(temp_c):
    result = None
    if temp_c < 0:
        result = 'temp_<0'
    elif temp_c >= 0 and temp_c < 5:
        result = 'temp_0-5'
    elif temp_c >= 5 and temp_c < 10:
        result = 'temp_5-10'
    elif temp_c >= 10 and temp_c < 15:
        result = 'temp_10-15'
    elif temp_c >= 15 and temp_c < 20:
        result = 'temp_15-20'
    elif temp_c >= 20 and temp_c < 25:
        result = 'temp_20-25'
    elif temp_c >= 25 and temp_c < 30:
        result = 'temp_25-30'
    elif temp_c >= 30 and temp_c < 35:
        result = 'temp_30-35'
    elif temp_c >= 35 and temp_c < 40:
        result = 'temp_35-40'
    elif temp_c >= 40 and temp_c < 45:
        result = 'temp_40-45'
    elif temp_c >= 45 and temp_c < 50:
        result = 'temp_45-50'
    elif temp_c >= 50 and temp_c < 55:
        result = 'temp_50-55'
    elif temp_c >= 55 and temp_c < 60:
        result = 'temp_55-60'
    elif temp_c >= 60:
        result = 'temp_60+'
    return result

def humidity_bucket(humidity):
    result = None
    if humidity < 20: # lung & eye irritation in humans
        result = 'RH%_<20'
    elif humidity >= 20 and humidity < 30: # lung irritation in humans
        result = 'RH%_20-30'
    elif humidity >= 30 and humidity < 50: # low but not dangerous to humans
        result = 'RH%_30-50'
    elif humidity >= 50 and humidity < 60: # human ideal comfort zone 
        result = 'RH%_50-60'
    elif humidity >= 60 and humidity < 80: # humid
        result = 'RH%_60-80'
    elif humidity >= 80 and humidity < 100: # v humid
        result = 'RH%_80-100'
    elif humidity >= 100: # dripping 
        result = 'RH%_100+'
    return result

In [21]:
# add the breeding_year (same as financial year): 
print(str(time.ctime()), 'Calculating breeding year.', flush=True)
df_joined['breeding_year'] = df_joined['datetime'].apply(lambda x: x.year if x.month < 7 else x.year + 1)

# Add flags for various temperature ranges. 
# These are summed to give the amount of time in the temp band
print(str(time.ctime()), 'Calculating temperature buckets.', flush=True)
df_joined['temp_bucket'] = df_joined['temp_c'].apply(temp_bucket)

# Add flags for various humidity ranges. 
# These are summed to give the amount of time in the humidity band
print(str(time.ctime()), 'Calculating humidity buckets.', flush=True)
df_joined['humidity_bucket'] = df_joined['humidity'].apply(humidity_bucket)

print(str(time.ctime()), 'Creating temp and humidity bucket dummy columns', flush=True)
df_joined = pd.get_dummies(data=df_joined, columns=['temp_bucket', 'humidity_bucket'])

print(str(time.ctime()), 'Done', flush=True)

Mon Oct 10 11:57:56 2016 Calculating breeding year.
Mon Oct 10 11:59:29 2016 Calculating temperature buckets.
Mon Oct 10 12:01:54 2016 Calculating humidity buckets.
Mon Oct 10 12:04:23 2016 Creating temp and humidity bucket dummy columns
Mon Oct 10 12:04:30 2016 Done


### ----------------------------------------------------------------
# Dev and Test
### ----------------------------------------------------------------

In [22]:
df_joined.head(10)

Unnamed: 0,nest_id,datetime,temp_c,humidity,breeding_year,temp_<0,temp_0-5,temp_5-10,temp_10-15,temp_15-20,...,temp_bucket_temp_50-55,temp_bucket_temp_55-60,temp_bucket_temp_60+,humidity_bucket_RH%_100+,humidity_bucket_RH%_20-30,humidity_bucket_RH%_30-50,humidity_bucket_RH%_50-60,humidity_bucket_RH%_60-80,humidity_bucket_RH%_80-100,humidity_bucket_RH%_<20
0,101,2013-07-11 21:49:00,15.1,91.949997,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,101,2013-07-11 22:04:00,15.1,91.949997,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,101,2013-07-11 22:19:00,15.61,91.519997,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,101,2013-07-11 22:34:00,15.61,91.519997,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,101,2013-07-11 22:49:00,15.61,91.519997,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5,101,2013-07-11 23:04:00,15.61,91.080002,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
6,101,2013-07-11 23:19:00,15.61,91.080002,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
7,101,2013-07-11 23:34:00,15.61,91.080002,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
8,101,2013-07-11 23:49:00,15.61,91.080002,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
9,101,2013-07-12 00:04:00,15.1,91.080002,2014,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [53]:
print(str(time.ctime()), 'Aggregating data by nest and year.', flush=True)

def percent_of_time(row):
    return 
# group the data by nest_id and breeding year to get the temp and humidity stats per year
temp_aggregations = {
    'temp_c': {
        'temp_count': 'count',
        'temp_avg': 'mean',
        'temp_min': 'min',
        'temp_max': 'max',
        'temp_std_dev': 'std'        
    },
    'humidity': {
        'humidity_count': 'count',
        'humidity_avg': 'mean',
        'humidity_min': 'min',
        'humidity_max': 'max',
        'humidity_std_dev': 'std'  
    },
    'temp_<0': {'bucket_total': 'sum'},
    'temp_0-5': {'bucket_total': 'sum'},
    'temp_5-10': {'bucket_total': 'sum'},
    'temp_10-15': {'bucket_total': 'sum'},
    'temp_15-20': {'bucket_total': 'sum'},
    'temp_20-25': {'bucket_total': 'sum'},
    'temp_25-30': {'bucket_total': 'sum'},
    'temp_30-35': {'bucket_total': 'sum'},
    'temp_35-40': {'bucket_total': 'sum'},
    'temp_40-45': {'bucket_total': 'sum'},
    'temp_45-50': {'bucket_total': 'sum'},
    'temp_50-55': {'bucket_total': 'sum'},
    'temp_55-60': {'bucket_total': 'sum'},
    'temp_60+': {'bucket_total': 'sum'}    
}
df_joined_gb = df_joined.groupby(['nest_id', 'breeding_year']).agg(temp_aggregations)
print(str(time.ctime()), 'Done.', flush=True)

Mon Oct 10 13:31:45 2016 Aggregating data by nest and year.
Mon Oct 10 13:31:49 2016 Done.


In [54]:
df_joined_gb.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,temp_0-5,temp_25-30,temp_<0,temp_55-60,temp_50-55,temp_40-45,humidity,humidity,humidity,humidity,...,temp_c,temp_c,temp_c,temp_15-20,temp_30-35,temp_10-15,temp_45-50,temp_5-10,temp_60+,temp_20-25
Unnamed: 0_level_1,Unnamed: 1_level_1,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,humidity_avg,humidity_std_dev,humidity_max,humidity_count,...,temp_count,temp_std_dev,temp_min,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total
nest_id,breeding_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
101,2014,0,1957,0,0,0,9,80.867279,16.991467,109.050003,18349,...,18349,5.294471,8.59,7653,794,2865,0,66,0,4791
101,2015,0,0,0,0,0,0,89.656807,8.522756,107.029999,166,...,166,3.505429,9.09,69,0,74,0,7,0,16
102,2014,0,1652,0,0,5,243,74.126831,18.788078,101.620003,18349,...,18349,6.762604,6.56,6439,1022,3543,38,273,0,4590
102,2015,0,0,0,0,0,0,86.626266,8.220601,101.620003,166,...,166,3.70545,7.56,90,0,40,0,31,0,5
103,2015,0,416,0,0,0,0,80.052597,18.175327,109.910004,5415,...,5415,5.192024,7.08,2186,165,1212,0,71,0,1323


In [43]:
print(str(time.ctime()), 'Checking for missing data.', flush=True)
# check for missing temp or humidity readinga
def missing_data(row):
    if row['temp_c']['temp_count'] > row['humidity']['humidity_count']:
        return 'missing_humidity_data'
    elif row['temp_c']['temp_count'] < row['humidity']['humidity_count']:
        return 'missing_temp_data'
    else:
        return None
df_joined_gb['missing_data'] = df_joined_gb.apply(missing_data, axis=1)

print(str(time.ctime()), 'Done.', flush=True)

Mon Oct 10 12:57:26 2016 Checking for missing data.
Mon Oct 10 12:57:27 2016 Done.


In [58]:
# df_joined_gb['temp_25-30']['%time'] = df_joined_gb['temp_25-30']['bucket_total'] / df_joined_gb['temp_c']['temp_count']
df_joined_gb['temp_25-30_total'] = df_joined_gb['temp_25-30']['bucket_total']
df_joined_gb['temp_25-30_hours'] = df_joined_gb['temp_25-30_total'] / 4
df_joined_gb['temp_25-30_%'] = df_joined_gb['temp_25-30_total'] / df_joined_gb['temp_c']['temp_count']



In [59]:
df_joined_gb.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,temp_0-5,temp_25-30,temp_<0,temp_55-60,temp_50-55,temp_40-45,humidity,humidity,humidity,humidity,...,temp_15-20,temp_30-35,temp_10-15,temp_45-50,temp_5-10,temp_60+,temp_20-25,temp_25-30_total,temp_25-30_hours,temp_25-30_%
Unnamed: 0_level_1,Unnamed: 1_level_1,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,humidity_avg,humidity_std_dev,humidity_max,humidity_count,...,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,bucket_total,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
nest_id,breeding_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
101,2014,0,1957,0,0,0,9,80.867279,16.991467,109.050003,18349,...,7653,794,2865,0,66,0,4791,1957,489.25,0.106654
101,2015,0,0,0,0,0,0,89.656807,8.522756,107.029999,166,...,69,0,74,0,7,0,16,0,0.0,0.0
102,2014,0,1652,0,0,5,243,74.126831,18.788078,101.620003,18349,...,6439,1022,3543,38,273,0,4590,1652,413.0,0.090032
102,2015,0,0,0,0,0,0,86.626266,8.220601,101.620003,166,...,90,0,40,0,31,0,5,0,0.0,0.0
103,2015,0,416,0,0,0,0,80.052597,18.175327,109.910004,5415,...,2186,165,1212,0,71,0,1323,416,104.0,0.076824


In [44]:
df_joined_gb.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,temp_c,temp_c,temp_c,temp_c,temp_c,humidity,humidity,humidity,humidity,humidity,missing_temp_data,missing_data
Unnamed: 0_level_1,Unnamed: 1_level_1,temp_avg,temp_max,temp_count,temp_std_dev,temp_min,humidity_avg,humidity_std_dev,humidity_max,humidity_count,humidity_min,Unnamed: 12_level_1,Unnamed: 13_level_1
nest_id,breeding_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
101,2014,19.827162,43.580002,18349,5.294471,8.59,80.867279,16.991467,109.050003,18349,26.030001,,
101,2015,15.136085,20.610001,166,3.505429,9.09,89.656807,8.522756,107.029999,166,69.730003,,
102,2014,20.213329,52.040001,18349,6.762604,6.56,74.126831,18.788078,101.620003,18349,11.45,,
102,2015,14.433976,21.08,166,3.70545,7.56,86.626266,8.220601,101.620003,166,70.199997,,
103,2015,18.632824,39.09,5415,5.192024,7.08,80.052597,18.175327,109.910004,5415,27.77,,


The below sends the data to the PostGres DB.

Currently considering not using the DB at all. While the data maniopulation within the DB via SQL is far easier, keeping the whole project (data load, manipulate, graph) to a single platform and language is a priority.

In [None]:
#sending temperature dataframe to the postgres DB
print("Transferring temperature dataframe to DB..")
df_temp.to_sql(con=engine, name='penguins_temperature', if_exists='replace')
print("Uploaded successfully")

#sending humidity dataframe to the postgres DB
print("Transferring humidity dataframe to DB..")
df_humd.to_sql(con=engine, name='penguins_humidity', if_exists='replace')
print("Uploaded successfully")

#sending nests dataframe to the postgres DB
print("Transferring nests dataframe to DB..")
nests_raw.to_sql(con=engine, name='penguins_nests', if_exists='replace')
print("Uploaded successfully")