# Data Preparation
The following steps need to be done:
* Load the relevant data sets from file
* Join them into a single data set
* Add additional computed features to the data
* Write the prepared data to file

### Set up the environment
We need a set of common libraries for the tasks to be performed. These are imported below. If an import statement raises an error, install the missing library in your environment by running `pip install <library>` from the command line.

In [66]:
import pandas as pd
import os
import numpy as np
from datetime import date

### Set up the variables
Change the values of the variables below to suit the files (names and directory location) to be loaded.

In [67]:
# Input files, relative to the notebook's working directory.
# The paths are assembled from their components so the correct separator is
# used on every OS; a literal backslash path only resolves on Windows and
# relies on invalid string escapes such as '\d'.
temperature_file = os.path.normpath(os.path.join('.', 'data', 'TempData_2_10_2016.txt'))
humidity_file = os.path.normpath(os.path.join('.', 'data', 'HumidData_2_10_2016.txt'))

## Load the temperature data
Read the temperature data file into memory and report on success/failure.

In [68]:
# Load the temperature readings into `df_temp`.
# Column names are supplied explicitly through `names=`; only the first four
# columns of the file are read.
column_names = ['recnum', 'datetime', 'temp_c', 'nest_id']
data_types = {'recnum': np.int32,
              'datetime': str,        # read as text, converted by parse_dates below
              'temp_c': np.float32,
              'nest_id': str}

# Report the file size up front so the reader knows whether to expect a wait.
file_size = os.path.getsize(temperature_file)
print('\nLoading temperature file into memory.\nFile is {0:.1f} MB.'.format((file_size/1000000)))

if file_size > 5000000: # over 5mb
    print('Loading into memory. Please be patient. ', flush=True)
else:
    print('Loading into memory. ', flush=True, end='')

# read_csv never returns None -- it raises when loading fails -- so failure is
# reported from the except branch and the error is re-raised rather than hidden.
try:
    df_temp = pd.read_csv(temperature_file,
                          names=column_names,
                          usecols=[0, 1, 2, 3],
                          dtype=data_types,
                          # nrows=2048,           # uncomment for a quick test load
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          # Skip malformed lines but emit a warning for each one.
                          # Replaces error_bad_lines/warn_bad_lines, which were
                          # removed in pandas 2.0 (requires pandas >= 1.3).
                          on_bad_lines='warn',
                          )
except Exception:
    print('### FAILED! ###')
    raise
else:
    print('Success: loaded {0:,} records.'.format(len(df_temp)))


Loading temperature file into memory.
File is 88.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,169,903 records.


In [69]:
# Load the humidity readings into `df_humd`.
# Column names are supplied explicitly through `names=`; only the first four
# columns of the file are read.
column_names = ['recnum', 'datetime', 'humidity', 'nest_id']
data_types = {'recnum': np.int32,
              'datetime': str,        # read as text, converted by parse_dates below
              'humidity': np.float32,
              'nest_id': str}

# Report the file size up front so the reader knows whether to expect a wait.
file_size = os.path.getsize(humidity_file)
print('\nLoading humidity file into memory.\nFile is {0:.1f} MB.'.format((file_size/1000000)))

if file_size > 5000000: # over 5mb
    print('Loading into memory. Please be patient. ', flush=True)
else:
    print('Loading into memory. ', flush=True, end='')

# read_csv never returns None -- it raises when loading fails -- so failure is
# reported from the except branch and the error is re-raised rather than hidden.
try:
    df_humd = pd.read_csv(humidity_file,
                          names=column_names,
                          usecols=[0, 1, 2, 3],
                          dtype=data_types,
                          # nrows=2048,           # uncomment for a quick test load
                          parse_dates=['datetime'],
                          dayfirst=True,
                          encoding='utf-8',
                          # Skip malformed lines but emit a warning for each one.
                          # Replaces error_bad_lines/warn_bad_lines, which were
                          # removed in pandas 2.0 (requires pandas >= 1.3).
                          on_bad_lines='warn',
                          )
except Exception:
    print('### FAILED! ###')
    raise
else:
    print('Success: loaded {0:,} records.'.format(len(df_humd)))


Loading humidity file into memory.
File is 93.2 MB.
Loading into memory. Please be patient. 
Success: loaded 2,173,732 records.


### ----------------------------------------------------------------
# Dev and Test
### ----------------------------------------------------------------

In [70]:
# Combine the temperature and humidity readings into one frame.
# An outer join on (nest_id, datetime) keeps readings that appear in only one
# of the two inputs; sort=True orders the result by the join keys. The
# suffixes only affect the overlapping 'recnum' column, which is dropped by
# the column selection at the end.
joined = df_temp.merge(
    df_humd,
    how='outer',
    on=['nest_id', 'datetime'],
    sort=True,
    suffixes=['_temp', '_humd'],
)[['nest_id', 'datetime', 'temp_c', 'humidity']]

# Record counts before and after the join.
print(
    'Records in temperature data: {0:>20,}'.format(len(df_temp)),
    'Records in humidity data:    {0:>20,}'.format(len(df_humd)),
    '                              -------------------',
    'Records in joined data:      {0:>20,}'.format(len(joined)),
    '\nOverview:',
    sep='\n',
)

# Per-nest overview: number of distinct nest ids, then non-null counts per column.
# NOTE(review): nest ids are compared case-sensitively, so e.g. '10A' and '10a'
# are counted as separate nests -- confirm whether they should be merged.
gb = joined.groupby(['nest_id'])
print('Number of nest_ids:          {0:>20,}'.format(len(gb)))
gb.agg(['count'])

Records in temperature data:            2,169,903
Records in humidity data:               2,173,732
                              -------------------
Records in joined data:                 2,173,738

Overview:
Number of nest_ids:                           140


Unnamed: 0_level_0,datetime,temp_c,humidity
Unnamed: 0_level_1,count,count,count
nest_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
101,18515,18515,18515
102,18515,18515,18515
103,17542,17542,17542
108,38444,38444,38444
10A,47659,47659,47659
10a,3073,3073,3073
110,17395,17395,17395
111,17494,17494,17494
117,20023,20023,20023
121,18514,18514,18514
