# Data Preparation
The following steps need to be done:
* Load the relevant data sets from file
* Join them into a single data set
* Add additional computed features to the data
* Write the prepared data to file

### Set up the environment
The tasks below rely on a set of common libraries, which are imported in the next cell. If an import statement raises an error, install the missing library in your environment by running `pip install <library>` on the command line.

In [10]:
import pandas as pd
import os
import numpy as np
from datetime import date
import yaml
from sqlalchemy import create_engine

### Set up the variables
Change the values of the variables below to suit the files (names and directory location) to be loaded.

In [11]:
## Paths are written in unix format because the docker containers run on debian.
## Each one is passed through os.path.normpath before use.
config_file, temperature_file, humidity_file = (
    os.path.normpath(raw_path)
    for raw_path in (
        './config.yml',
        './data/TempData_2_10_2016.txt',
        './data/HumidData_2_10_2016.txt',
    )
)

In [12]:
# Parse the config.yml file into `config`.
# yaml.safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader raises a TypeError on PyYAML 6+, and on older versions it can
# construct arbitrary Python objects from the file. safe_load only builds plain
# YAML types (dicts, lists, strings, numbers), which is all a config file needs.
with open(config_file, 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

In [13]:
# Pull the database connection parameters out of the 'PostgreSQL' section of
# the parsed config.yml file.
host, port, dbname, user, password = (
    config['PostgreSQL'][setting]
    for setting in ('host', 'port', 'dbname', 'user', 'password')
)

In [14]:
# Establish the connection to the postgres database.
# The connection URL has the form postgresql://user:password@host[:port]/dbname.
# The configured port is now included in the URL; previously it was read from
# the config but never used, so a non-default port was silently ignored. If the
# config leaves the port empty, it is omitted and the driver default applies.
# str.format is used so that numeric YAML values (e.g. an integer port) do not
# break the string building.
# NOTE(review): user and password are inserted verbatim. Credentials containing
# characters such as '@', ':' or '/' must be URL-escaped - confirm against the
# real config values.
db_location = '{0}:{1}'.format(host, port) if port else '{0}'.format(host)
engine = create_engine(
    'postgresql://{user}:{password}@{location}/{dbname}'.format(
        user=user,
        password=password,
        location=db_location,
        dbname=dbname,
    )
)

## Load the temperature data
Read the temperature data file into memory and report on success/failure.

In [15]:
# Column names to assign to the temperature file and the dtype to use for each.
# Because names= is passed, the first row of the file is treated as data.
# 'datetime' is read as text and converted to timestamps via parse_dates below.
column_names = ['recnum', 'datetime', 'temp_c', 'nest_id']
data_types = {'recnum': np.int32,
              'datetime': str,
              'temp_c': np.float32,
              'nest_id': str}

# Report the file size up front so the user knows whether to expect a wait.
file_size = os.path.getsize(temperature_file)
print('\nLoading temperature file into memory.\nFile is {0:.1f} MB.'.format((file_size/1000000)))

if file_size > 5000000: # over 5mb
    print('Loading into memory. Please be patient. ', flush=True)
else:
    print('Loading into memory. ', flush=True, end='')

# read_csv raises on failure rather than returning None, so the failure message
# is printed from an except block (and the error re-raised) instead of from an
# `is not None` check that could never take its else branch.
try:
    df_temp = pd.read_csv(temperature_file,
                          names=column_names,
                          usecols=[0, 1, 2, 3],
                          dtype=data_types,
#                           nrows=2048,               # for testing only
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          # Warn about malformed lines and skip them. This
                          # replaces error_bad_lines=False / warn_bad_lines=True,
                          # which were removed in pandas 2.0 (needs pandas >= 1.3).
                          on_bad_lines='warn'
                          )
except Exception:
    print('### FAILED! ###')
    raise

print('Success: loaded {0:,} records.'.format(len(df_temp)))


Loading temperature file into memory.
File is 88.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,169,903 records.


In [16]:
# Column names to assign to the humidity file and the dtype to use for each.
# Because names= is passed, the first row of the file is treated as data.
# 'datetime' is read as text and converted to timestamps via parse_dates below.
column_names = ['recnum', 'datetime', 'humidity', 'nest_id']
data_types = {'recnum': np.int32,
              'datetime': str,
              'humidity': np.float32,
              'nest_id': str}

# Report the file size up front so the user knows whether to expect a wait.
file_size = os.path.getsize(humidity_file)
print('\nLoading humidity file into memory.\nFile is {0:.1f} MB.'.format((file_size/1000000)))

if file_size > 5000000: # over 5mb
    print('Loading into memory. Please be patient. ', flush=True)
else:
    print('Loading into memory. ', flush=True, end='')

# read_csv raises on failure rather than returning None, so the failure message
# is printed from an except block (and the error re-raised) instead of from an
# `is not None` check that could never take its else branch.
try:
    df_humd = pd.read_csv(humidity_file,
                          names=column_names,
                          usecols=[0, 1, 2, 3],
                          dtype=data_types,
#                           nrows=2048,               # for testing only
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          # Warn about malformed lines and skip them. This
                          # replaces error_bad_lines=False / warn_bad_lines=True,
                          # which were removed in pandas 2.0 (needs pandas >= 1.3).
                          on_bad_lines='warn'
                          )
except Exception:
    print('### FAILED! ###')
    raise

print('Success: loaded {0:,} records.'.format(len(df_humd)))


Loading humidity file into memory.
File is 93.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,173,732 records.


### ----------------------------------------------------------------
# Dev and Test
### ----------------------------------------------------------------

In [33]:
# Outer-join the temperature and humidity readings on nest and timestamp, so a
# reading present in only one of the two frames is kept (with NaN for the value
# missing from the other side). Only the four data columns are selected from
# the merge result; the suffixed recnum columns are dropped.
joined = pd.merge(left=df_temp,
                  right=df_humd,
                  how='outer',
                  on=['nest_id', 'datetime'],    # both frames use the same key names
                  sort=True,                     # order the result by the join keys
                  suffixes=('_temp', '_humd')    # applied to the clashing recnum columns
                  )[['nest_id', 'datetime', 'temp_c', 'humidity']]

print('Records in temperature data: {0:>20,}'.format(len(df_temp)))
print('Records in humidity data:    {0:>20,}'.format(len(df_humd)))
print('                              -------------------')
print('Records in joined data:      {0:>20,}'.format(len(joined)))
print('\nOverview:')

# groupby() on its own returns a lazy GroupBy object, not a DataFrame; an
# aggregation such as count() has to be applied to get a DataFrame that can be
# uploaded. Every column is used as a grouping key here, so the result has one
# row per distinct (nest_id, datetime, temp_c, humidity) combination, with
# those values held in the index.
# NOTE(review): groupby excludes rows with NaN in a key by default, so readings
# matched on only one side of the outer join are presumably absent from
# nests_raw - confirm this is intended.
nests_raw = joined.groupby(['nest_id', 'datetime', 'temp_c', 'humidity']).count()

# Count the distinct nests directly from the joined data. The previous code
# used len(gb), but `gb` is not defined anywhere in the notebook and fails on a
# fresh kernel.
print('Number of nest_ids:          {0:>20,}'.format(joined['nest_id'].nunique()))
nests_raw.head(10)

Records in temperature data:            2,169,903
Records in humidity data:               2,173,732
                              -------------------
Records in joined data:                 2,173,738

Overview:
Number of nest_ids:                           140


nest_id,datetime,temp_c,humidity
101,2013-07-11 21:49:00,15.1,91.949997
101,2013-07-11 22:04:00,15.1,91.949997
101,2013-07-11 22:19:00,15.61,91.519997
101,2013-07-11 22:34:00,15.61,91.519997
101,2013-07-11 22:49:00,15.61,91.519997
101,2013-07-11 23:04:00,15.61,91.080002
101,2013-07-11 23:19:00,15.61,91.080002
101,2013-07-11 23:34:00,15.61,91.080002
101,2013-07-11 23:49:00,15.61,91.080002
101,2013-07-12 00:04:00,15.1,91.080002


In [None]:
# Send each dataframe to its own table in the postgres DB, replacing the table
# if it already exists. Entries are (label for the log message, dataframe,
# destination table name) and are uploaded in the order listed.
uploads = (
    ('temperature', df_temp, 'penguins_temperature'),
    ('humidity', df_humd, 'penguins_humidity'),
    ('nests', nests_raw, 'penguins_nests'),
)

for label, frame, table_name in uploads:
    print("Transferring {0} dataframe to DB..".format(label))
    frame.to_sql(con=engine, name=table_name, if_exists='replace')
    print("Uploaded successfully")

Transferring nests dataframe to DB..
