# Create train, test and validation dataframes

## No normalization

## Augmentation added


In [3]:
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import os.path
import numpy as np

# Input File Selection and Directory Setup

In [4]:
# Name of the input CSV; change it to point the notebook at a different file.
# The rest of the notebook expects these columns:
#   z   -> target
#   x   -> feature 1 (X_1)
#   y   -> feature 2 (X_2)
#   ... -> any further features
#
# The demo data has exactly three columns, related by z = x + y.
#
# Available sample files:
#   "xyz10.csv" - x + y = z, restricted to z = 10
#   "xyz.csv"   - x + y = z
CSV_FILE_NAME = "xyz10.csv"

# Export the name so the %%bash cells can read it as ${CSV_FILE_NAME}.
os.environ["CSV_FILE_NAME"] = CSV_FILE_NAME

In [5]:
# The working directory is expected to be the project's src directory,
# i.e. the directory that holds this notebook.
dirpath = os.getcwd()
print("current directory is : ", dirpath, sep="")

current directory is : /Users/davis/progs/github/Keras-Philosophy/src


In [6]:
# The project root is the parent of the current (src) directory.
root_path = os.path.split(dirpath)[0]
print("parent directory is : ", root_path, sep="")

parent directory is : /Users/davis/progs/github/Keras-Philosophy


In [7]:
# Input data lives in <project root>/data/ ; the trailing slash is kept
# because file names are appended to this string directly.
data_path = f"{root_path}/data/"
print("data direcotry is: ", data_path, sep="")

data direcotry is: /Users/davis/progs/github/Keras-Philosophy/data/


In [18]:
# Directory for log output: <project root>/logs/
log_path = f"{root_path}/logs/"
print("log direcotry is: ", log_path, sep="")

# Export it as well: the shell (%%bash) cells read ${LOG_DIR_NAME}.
os.environ["LOG_DIR_NAME"] = log_path

log direcotry is: /Users/davis/progs/github/Keras-Philosophy/logs/


In [24]:
# Fully qualified name of the input CSV: data directory + file name.
fqfn = f"{data_path}{CSV_FILE_NAME}"
# Export it for debugging and so the shell (%%bash) cells can read it
# as ${CSV_FQFN}.
os.environ["CSV_FQFN"] = fqfn
print("fully qualified csv file name: ", fqfn)


fully qualified csv file name:  /Users/davis/progs/github/Keras-Philosophy/data/xyz10.csv


In [25]:
%%bash
# Confirm that the values exported from Python are visible to the shell.
printf 'CSV_FILE_NAME: %s\n' "${CSV_FILE_NAME}"
printf 'CSV_FQFN: %s\n' "${CSV_FQFN}"
printf 'LOG_DIR_NAME: %s\n' "${LOG_DIR_NAME}"

CSV_FILE_NAME: xyz10.csv
CSV_FQFN: /Users/davis/progs/github/Keras-Philosophy/data/xyz10.csv
LOG_DIR_NAME: /Users/davis/progs/github/Keras-Philosophy/logs/


In [27]:
%%bash
# Preview the file: its first five lines (the header row is one of them).
head -n 5 "${CSV_FQFN}"
# Count the lines in the file; the header row is part of the count.
wc -l "${CSV_FQFN}"

z,x,y
10.0,10.0,0.0
10.0,9.0,1.0
10.0,8.0,2.0
10.0,7.0,3.0


      14 /Users/davis/progs/github/Keras-Philosophy/data/xyz10.csv


# Use pandas to read csv into dataframe

In [28]:
# Load the CSV as-is: comma separated, first line is the header row.
raw_dataframe = pd.read_csv(fqfn, sep=",", header=0)

# Work on a copy so the raw data stays available unchanged.
dataframe = raw_dataframe.copy()

In [29]:
# Only the last expression of a cell is displayed; a bare `dataframe.shape`
# above it would be evaluated and thrown away, so print it explicitly.
print("dataframe shape: ", dataframe.shape)
dataframe.head()

Unnamed: 0,z,x,y
0,10.0,10.0,0.0
1,10.0,9.0,1.0
2,10.0,8.0,2.0
3,10.0,7.0,3.0
4,10.0,6.0,4.0


# Augment the data

Increase the size of the existing dataset by creating modified copies of it.  In each copy, add a small random amount to the x column and subtract the same amount from the y column, so that the sum x + y remains constant.  Repeat this process 100 times and concatenate the copies.

In [30]:
# create a routine to augment an existing df
def create_aug_df(a_df, rng=None):
    """Return an augmented copy of ``a_df`` whose ``x + y`` sums are unchanged.

    Rows in which ``x`` or ``y`` is zero are dropped.  A single shift is then
    drawn and applied to every remaining row: it is added to ``x`` and
    subtracted from ``y``, so ``x + y`` stays the same for each row.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Input frame; must contain the columns ``x`` and ``y``.  It is not
        modified.
    rng : numpy.random.Generator, optional
        Source of the random shift.  Pass a seeded generator for reproducible
        augmentation.  When omitted, NumPy's global random state is used
        (``np.random.rand()``), exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows with their original index
        labels.  It is empty when no row survives the zero filter.
    """
    # .loc[...].copy() yields an independent frame, so the column assignments
    # below can neither touch a_df nor trigger SettingWithCopyWarning.
    keep_rows = (a_df['x'] != 0) & (a_df['y'] != 0)
    copy_df = a_df.loc[keep_rows].copy()

    # Nothing to shift (and no minimum to compute) when every row was dropped.
    if copy_df.empty:
        return copy_df

    # The shift is drawn from [0, 1) and capped at the smallest value in the
    # frame, so that (for non-negative data) subtracting it from y cannot
    # produce a negative number.
    # .min().min() always reduces to a scalar.  DataFrame.min(axis=None) only
    # does so from pandas 2.0 on; before that it returns one value per column,
    # which misaligns with the row index in the arithmetic below and turns
    # x and y into NaN.
    min_val = copy_df.min().min()
    rnd_val = np.random.rand() if rng is None else rng.random()
    mod_val = np.minimum(min_val, rnd_val)

    # adjust columns x and y by the same amount in opposite directions
    copy_df['x'] = copy_df['x'] + mod_val
    copy_df['y'] = copy_df['y'] - mod_val

    return copy_df


In [31]:
# Build the augmented dataset from 100 independently shifted copies of the
# input frame.
# create_aug_df draws its shift from NumPy's global random state, so seed it
# first: without a seed every run produces a different aug_df, and the fixed
# random_state used for the train/test/validation split would not be enough
# to make the splits reproducible.
np.random.seed(3367)
df_list = [create_aug_df(dataframe) for _ in range(100)]

aug_df = pd.concat(df_list)

# Create Train, Test and Validation DataFrames

In [32]:
# pd.concat kept the row labels of every copy, so labels repeat.
# Build a fresh 0..n-1 index and install it on aug_df in place.
num_rows = len(aug_df)
index = pd.RangeIndex(num_rows)
aug_df.index = index


In [33]:



# The split below works on row labels: drop() removes *every* row carrying a
# sampled label, so duplicated labels (e.g. the index was not rebuilt after
# pd.concat) would silently discard extra rows.  Fail loudly instead.
assert aug_df.index.is_unique, "aug_df needs a unique index before splitting"

# sample 70% of the augmented data into train_df
train_df = aug_df.sample(frac=0.7, random_state=3367)
# whatever is left over is shared evenly between test and validation
leftover_df = aug_df.drop(train_df.index)
test_df = leftover_df.sample(frac=0.5, random_state=3367)
valid_df = leftover_df.drop(test_df.index)

# every row must land in exactly one of the three frames
assert len(train_df) + len(test_df) + len(valid_df) == len(aug_df), \
    "train/test/valid do not partition aug_df"

# print sizes as check
print("train shape: ", train_df.shape)
print("test shape: ", test_df.shape)
print("valid shape: ", valid_df.shape)



train shape:  (630, 3)
test shape:  (135, 3)
valid shape:  (135, 3)


# write the dataframes to disk

In [34]:
# data_path already ends with "/", so data_path + "/" + name produced paths
# with a doubled separator (".../data//train.csv").  os.path.join inserts a
# separator only when one is needed.
# index=False: the row labels are not written to the files.
train_df.to_csv(os.path.join(data_path, "train.csv"), encoding="utf-8", index=False)
test_df.to_csv(os.path.join(data_path, "test.csv"), encoding="utf-8", index=False)
valid_df.to_csv(os.path.join(data_path, "valid.csv"), encoding="utf-8", index=False)


In [35]:
# Summary statistics come from the training split only, which is the usual
# choice when they are later used to normalize the data.
train_stats_df = train_df
# One row per column (z, x, y), one column per statistic.
train_stats = train_stats_df.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
z,630.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0
x,630.0,5.512188,2.614739,1.021156,3.19832,5.527674,7.839908,9.990275
y,630.0,4.487812,2.614739,0.009725,2.160092,4.472326,6.80168,8.978844
