# Create train, test and validation dataframes

## No normalization

## Augmentation added


In [1]:
import datetime
#from pathlib import Path
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import os.path
import numpy as np

# Input File Selection and Directory Setup

In [2]:
# Choose the input CSV. Any file can be used as long as it has the
# column layout this model expects:
#  Z (Target)
#  X (Feature 1 or X_1)
#  Y (Feature 2 or X_2)
#  ... (Feature N)
#
# The demo data has three columns related by z = x + y.

# Alternative sample file that only contains rows with z = 10:
#CSV_FILE_NAME = "xyz10.csv"

# Default sample file for the simple x + y = z demo.
CSV_FILE_NAME = "xyz.csv"

# Export the name to the process environment as well.
os.environ["CSV_FILE_NAME"] = CSV_FILE_NAME

In [3]:
# Record the kernel's working directory. The paths built later in this
# notebook are derived from this value, so it matters where the notebook
# is launched from (the captured run below started in the project's
# notebooks directory).
dirpath = os.getcwd()
print("current directory is : ", dirpath, sep="")

current directory is : /workspaces/Keras-Philosophy/notebooks


In [4]:
# Treat the parent of the working directory as the project root, then
# derive the data and logs directories from it. These are pure paths:
# nothing is checked against, or created on, the filesystem here.
root_path = pathlib.PurePath(dirpath).parents[0]
data_path = root_path.joinpath('data')
logs_path = root_path.joinpath('logs')

print("root directory is: ", root_path)
print("data directory is: ",  data_path)
print("logs directory is: ", logs_path)

root directory is:  /workspaces/Keras-Philosophy
data directory is:  /workspaces/Keras-Philosophy/data
logs directory is:  /workspaces/Keras-Philosophy/logs


In [5]:
# Keep string (POSIX-style) versions of the two directories and publish
# them as environment variables; the %%bash cell below reads
# DATA_DIR_NAME from the environment.
LOGS_DIR_NAME, DATA_DIR_NAME = logs_path.as_posix(), data_path.as_posix()

# Logs
os.environ['LOGS_DIR_NAME'] = LOGS_DIR_NAME
print("LOGS_DIR_NAME: ", LOGS_DIR_NAME)

# Data
os.environ['DATA_DIR_NAME'] = DATA_DIR_NAME
print("DATA_DIR_NAME: ", DATA_DIR_NAME)

LOGS_DIR_NAME:  /workspaces/Keras-Philosophy/logs
DATA_DIR_NAME:  /workspaces/Keras-Philosophy/data


In [6]:
%%bash
# Preview the first five lines of the input CSV. The option is placed
# before the file operand: only GNU head accepts options after the file
# name, other implementations treat "-n" and "5" as extra files.
head -n 5 "${DATA_DIR_NAME}/${CSV_FILE_NAME}"

z,x,y
10.0,10.0,0.0
10.0,9.0,1.0
10.0,8.0,2.0
10.0,7.0,3.0


# Use pandas to read csv into dataframe

In [7]:
# Load the comma-separated input file, taking column names from its first
# line. raw_dataframe stays untouched; later cells work on the copy.
csv_path = data_path / CSV_FILE_NAME
raw_dataframe = pd.read_csv(csv_path, sep=",", header=0)

dataframe = raw_dataframe.copy()

In [8]:
# A cell only displays its final expression, so a bare `dataframe.shape`
# on an earlier line is silently discarded. Print the shape explicitly and
# leave head() as the last expression so it gets the rich table display.
print("dataframe shape: ", dataframe.shape)
dataframe.head()

Unnamed: 0,z,x,y
0,10.0,10.0,0.0
1,10.0,9.0,1.0
2,10.0,8.0,2.0
3,10.0,7.0,3.0
4,10.0,6.0,4.0


# Augment the data

Increase the size of the existing dataset by creating modified copies of it.  In each copy, add a tiny amount to the x column and subtract the same amount from the y column so that the sum remains constant.  Repeat this process 100 times.

In [9]:
# create a routine to augment an existing df
def create_aug_df(a_df, rng=None):
    """Return a jittered copy of ``a_df`` in which every ``x + y`` sum is unchanged.

    Rows where ``x`` or ``y`` equals zero are dropped. One random offset is
    then drawn for the whole frame, added to ``x`` and subtracted from ``y``.
    The offset is capped at the smallest remaining ``y`` value so the
    subtraction cannot push a non-negative ``y`` below zero.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Frame with numeric ``x`` and ``y`` columns. It is not modified.
    rng : numpy.random.Generator, optional
        Source of the random offset. When omitted, NumPy's global random
        state (``np.random.rand``) is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, with their original index
        labels and with ``x`` and ``y`` shifted by the same offset.
    """
    # Select the rows first and copy once, so the column assignments below
    # act on an independent frame rather than on a slice of the input.
    nonzero_rows = (a_df['x'] != 0) & (a_df['y'] != 0)
    copy_df = a_df.loc[nonzero_rows].copy()

    # Nothing survived the filter, so there is nothing to shift.
    if copy_df.empty:
        return copy_df

    # Only y is decremented, so its minimum is the floor for the offset.
    # Reading the column directly also avoids DataFrame.min(axis=None),
    # which only reduces to a scalar on pandas >= 2.0.
    min_val = copy_df['y'].min()
    rnd_val = np.random.rand() if rng is None else rng.random()
    mod_val = np.minimum(min_val, rnd_val)

    # adj columns x and y by mod amount
    copy_df['x'] = copy_df['x'] + mod_val
    copy_df['y'] = copy_df['y'] - mod_val

    return copy_df


In [10]:
# Build the augmented dataset by stacking independently jittered copies
# of the source dataframe.
NUM_AUG_COPIES = 100     # number of augmented copies to stack
AUG_RANDOM_SEED = 3367   # fixed so the augmentation is repeatable

# create_aug_df draws its offset from NumPy's global random state, so seed
# that state here; otherwise every run produces different augmented data.
np.random.seed(AUG_RANDOM_SEED)

df_list = [create_aug_df(dataframe) for _ in range(NUM_AUG_COPIES)]

# concat keeps each copy's own row labels, so labels repeat across copies.
aug_df = pd.concat(df_list)

# Create Train, Test and Validation DataFrames

In [11]:
# The concatenated frame carries repeated row labels from its pieces.
# Replace them with a fresh 0..num_rows-1 index so that the label-based
# drop() calls used for splitting remove exactly the sampled rows.
num_rows = len(aug_df)
index = pd.RangeIndex(start=0, stop=num_rows)
aug_df = aug_df.set_index(index)


In [12]:



# Draw 70% of the augmented rows for training (fixed seed).
# NOTE(review): aug_df is built from many jittered copies of the same
# source rows, so variants of one source row can land in different
# splits - confirm this is acceptable for the evaluation.
train_df = aug_df.sample(frac=0.7, random_state=3367)

# Everything not picked for training is shared between test and validation.
leftover_df = aug_df.drop(index=train_df.index)

# Half of the remainder becomes the test set, the other half validation.
test_df = leftover_df.sample(frac=0.5, random_state=3367)
valid_df = leftover_df.drop(index=test_df.index)

# Show the split sizes as a sanity check.
print("train shape: ", train_df.shape)
print("test shape: ", test_df.shape)
print("valid shape: ", valid_df.shape)



train shape:  (2520, 3)
test shape:  (540, 3)
valid shape:  (540, 3)


# write the dataframes to disk

In [13]:
# Save each split as a UTF-8 CSV in the data directory, without the
# index column.
for file_name, split_df in (
    ("train.csv", train_df),
    ("valid.csv", valid_df),
    ("test.csv", test_df),
):
    split_df.to_csv(data_path / file_name, encoding='utf-8', index=False)


In [14]:
# Summary statistics are taken from the training split only, since
# normalization is typically fitted on training data alone.
# train_stats_df is another name for train_df, not a copy.
train_stats_df = train_df
train_stats = train_stats_df.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
z,2520.0,9.802778,2.965662,2.0,10.0,11.0,12.0,12.0
x,2520.0,5.39274,3.067324,1.001507,2.618937,5.022817,7.970359,11.970359
y,2520.0,4.410038,3.074886,0.013241,1.601348,3.998493,7.066525,10.998493


In [15]:
# Show the training split as the cell output; the rows appear in sampled
# order and, per the captured output, long frames are abbreviated with "...".
train_df

Unnamed: 0,z,x,y
753,4.0,3.871479,0.128521
1902,2.0,1.052270,0.947730
335,11.0,8.067550,2.932450
1097,11.0,2.294038,8.705962
1055,11.0,8.354096,2.645904
...,...,...,...
2761,12.0,5.458001,6.541999
1531,12.0,11.575478,0.424522
2788,11.0,3.503967,7.496033
1025,11.0,2.913985,8.086015
