# Cleans previously massaged data into separate csv files
This code uses the original public USGS and GreenStream data, which was previously combined and pitch matched in terms of timings using Apache Beam.  During that processing the original data was also averaged from hourly reports into daily averages.

The data was averaged so that the model is generalized and not overly sensitive to outliers caused by possible sensor inaccuracy. This is acceptable because we are trying to virtualize water levels, and due to terrain and flows there is no need to be overly precise.  It also yields a smaller dataset, which is more suitable for faster model training.

In [40]:
# leave until I can determine if tensorboard runs in ibm cloud
# %load_ext tensorboard.notebook

In [106]:
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import os.path

In [107]:
# Path of the input csv file.  This can be varied to point to different files.
# However, different files have different columns.

# sample data for easy x+y=z
# where every row has z=10
CSV_FILE_NAME = "../data/xyz10.csv"

# sample data for easy x+y=z
#CSV_FILE_NAME = "../data/xyz.csv"


# Export the name so the %%bash cells can read it as ${CSV_FILE_NAME}.
os.environ['CSV_FILE_NAME'] = CSV_FILE_NAME

In [108]:
# The current directory will be where this src file is located.
# Which is in the src dir of the project
# os.getcwd() returns the current working directory of the running process.
# This notebook assumes that is the src dir of the project; the paths
# derived from it depend on that.
dirpath = os.getcwd()
print("current directory is : " + dirpath)

current directory is : /Users/davis/progs/github/Keras-Philosophy/src


In [109]:
root_path = os.path.dirname(dirpath)
print("parent directory is : " + root_path)

parent directory is : /Users/davis/progs/github/Keras-Philosophy


In [110]:
# Build the data directory path with os.path.join (the same way logs_path
# is built) instead of hard-coding the "/" separator.
data_path = os.path.join(root_path, "data")
print("data direcotry is: " + data_path)

data direcotry is: /Users/davis/progs/github/Keras-Philosophy/data


In [111]:
logs_path = os.path.join(root_path,"logs")
print("logs directory is : " + logs_path)

logs directory is : /Users/davis/progs/github/Keras-Philosophy/logs


In [112]:
LOG_DIR_NAME = logs_path
os.environ['LOG_DIR_NAME'] = LOG_DIR_NAME

In [113]:
%%bash
echo ${LOG_DIR_NAME}

/Users/davis/progs/github/Keras-Philosophy/logs


In [114]:
%%bash
head ${CSV_FILE_NAME}

z,x,y
10.0,10.0,0.0
10.0,9.0,1.0
10.0,8.0,2.0
10.0,7.0,3.0
10.0,6.0,4.0
10.0,5.0,5.0
10.0,4.0,6.0
10.0,3.0,7.0
10.0,2.0,8.0


In [115]:
%%bash
# Count the lines in the selected csv file (header row plus data records).
# NOTE(review): an earlier comment here quoted 9875 lines; the count depends
# on which file CSV_FILE_NAME points to.
wc -l ${CSV_FILE_NAME}

      14 ../data/xyz10.csv


# Use generic python to read data into pandas dataframe

## In case of nans, 
In order to add default values for NA data, use the converters helper functions

## In case of missing data, 
To add default values for missing data, use the average value of the column/feature.  The averages were calculated separately.

In [116]:
# Load the comma-separated file; header=0 takes the column names from the
# first row.
raw_dataset = pd.read_csv(CSV_FILE_NAME, 
                          header=0,
                          sep=",")

# Work on a copy so that raw_dataset keeps the data exactly as loaded.
dataset = raw_dataset.copy()

In [117]:
dataset.shape
#dataset.head()

(11, 3)

In [118]:
# Randomly sample 70% of the rows for training; the fixed random_state
# makes the split reproducible.
train_ds = dataset.sample(frac=0.7,random_state=3367)
# what's left over is split between test and validation:
# half is sampled for test, the remainder becomes validation
leftover_ds = dataset.drop(train_ds.index) 
                       
test_ds = leftover_ds.sample(frac=0.5, random_state=3367)
valid_ds = leftover_ds.drop(test_ds.index)

In [119]:
train_ds

Unnamed: 0,z,x,y
3,10.0,7.0,3.0
8,10.0,2.0,8.0
7,10.0,3.0,7.0
4,10.0,6.0,4.0
0,10.0,10.0,0.0
10,10.0,0.0,10.0
1,10.0,9.0,1.0
5,10.0,5.0,5.0


In [120]:
test_ds

Unnamed: 0,z,x,y
2,10.0,8.0,2.0
6,10.0,4.0,6.0


In [121]:
valid_ds

Unnamed: 0,z,x,y
9,10.0,1.0,9.0


In [122]:
leftover_ds

Unnamed: 0,z,x,y
2,10.0,8.0,2.0
6,10.0,4.0,6.0
9,10.0,1.0,9.0


# Normalize the data

In [123]:
# use only the training data to get mean and std 
# which we can use to norm all datasets
# Summary statistics are taken from the training split only; its mean and
# std are then reused to normalize every split.
train_stats_df = train_ds.copy()
# One row per column, with count/mean/std/min/quartiles/max as the columns.
train_stats = train_stats_df.describe().T


In [124]:
# Normalizing can introduce nans.  In the case where all z values are
# the same, the STD will be zero and 0/0 yields nan.  In this case, the nan
# is later replaced with 1, corresponding to the max value, i.e. 10 becomes 1.

def norm(a_df, stats=None):
    """Return the absolute standardized distance of each value from the mean.

    Each column of ``a_df`` is shifted by the matching ``mean`` entry, the
    absolute value is taken, and the result is divided by the matching
    ``std`` entry.  Because of the ``abs`` the sign of the deviation is
    discarded, so the output is never negative.

    Args:
        a_df: DataFrame whose column labels match the index of ``stats``.
        stats: DataFrame indexed by column label that provides ``'mean'``
            and ``'std'`` columns (for example ``df.describe().transpose()``).
            When omitted, the module-level ``train_stats`` is used, which is
            the original behaviour.

    Returns:
        A new DataFrame with the same shape as ``a_df``.  A column whose
        ``std`` is zero produces NaN where the value equals the mean (0/0);
        callers are expected to handle those afterwards.
    """
    if stats is None:
        # Fall back to the statistics computed from the training split.
        stats = train_stats
    # The Series from stats aligns on a_df's column labels.
    normed_df = abs(a_df - stats['mean']) / stats['std']
    return normed_df

# Normalize the three splits, in train/test/valid order, using the
# training-set statistics that norm() reads.
normed_train_ds, normed_test_ds, normed_valid_ds = (
    norm(split) for split in (train_ds, test_ds, valid_ds)
)

In [125]:
# Helper to look for nans in a dataframe
# call via nans(dataset)
def nans(df):
    """Return the rows of ``df`` that contain at least one null value.

    Args:
        df: DataFrame to inspect.

    Returns:
        The subset of ``df`` (original index preserved) whose rows have a
        null in any column; empty when there are none.
    """
    # any(axis=1) collapses each row to True when any of its cells is null.
    return df[df.isnull().any(axis=1)]

In [126]:
#
# Either of these mechanisms can be used to get the nan_rows.
#


# use the nans helper, then keep only the 'z' column of the rows it returns
nan_rows = nans(normed_train_ds)['z']

# or filter directly, without limiting the output to a particular column
#nan_rows = normed_train_ds[normed_train_ds.isnull().T.any()]

In [127]:
nan_rows

3    NaN
8    NaN
7    NaN
4    NaN
0    NaN
10   NaN
1    NaN
5    NaN
Name: z, dtype: float64

In [128]:
# Replace the nans left by normalization, in place, in every split.
# The first idea was to use a per-column value taken from train_stats:
#the_max = train_stats['mean']['z']
#the_max
# Instead, every nan is simply replaced with 1.0.
normed_train_ds.fillna(1.0, inplace=True)
normed_test_ds.fillna(1.0, inplace=True)
normed_valid_ds.fillna(1.0, inplace=True)

#normed_train_ds
#normed_test_ds
#normed_valid_ds

In [130]:
#normed_train_ds
#normed_valid_ds.head()
normed_test_ds.head()

Unnamed: 0,z,x,y
2,1.0,0.79623,0.79623
6,1.0,0.361923,0.361923


# write the normalized datasets to disk

In [131]:
# Write each normalized split to the data directory as utf-8 csv, without
# the index column.  Paths are built with os.path.join rather than manual
# "/" concatenation.
normed_train_ds.to_csv(os.path.join(data_path, "train.csv"), encoding="utf-8", index=False)
normed_test_ds.to_csv(os.path.join(data_path, "test.csv"), encoding="utf-8", index=False)
normed_valid_ds.to_csv(os.path.join(data_path, "valid.csv"), encoding="utf-8", index=False)


In [132]:
print(data_path + "/" + "valid.csv")

/Users/davis/progs/github/Keras-Philosophy/data/valid.csv


In [133]:
train_ds.describe()

Unnamed: 0,z,x,y
count,8.0,8.0,8.0
mean,10.0,5.25,4.75
std,0.0,3.453776,3.453776
min,10.0,0.0,0.0
25%,10.0,2.75,2.5
50%,10.0,5.5,4.5
75%,10.0,7.5,7.25
max,10.0,10.0,10.0


In [134]:
normed_train_ds.describe()

Unnamed: 0,z,x,y
count,8.0,8.0,8.0
mean,1.0,0.79623,0.79623
std,0.0,0.524832,0.524832
min,1.0,0.072385,0.072385
25%,1.0,0.434307,0.434307
50%,1.0,0.79623,0.79623
75%,1.0,1.158153,1.158153
max,1.0,1.520075,1.520075


In [135]:
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
z,8.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0
x,8.0,5.25,3.453776,0.0,2.75,5.5,7.5,10.0
y,8.0,4.75,3.453776,0.0,2.5,4.5,7.25,10.0
