# Create train, test and validation dataframes

## No normalization


In [63]:
import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import os.path

In [64]:
# Name of the input CSV file inside the project's data directory.
# Change it to point at a different file, keeping in mind that
# different files have different columns.
#
# Sample data for the easy x+y=z problem:
#   "xyz10.csv" - JUST z=10
#   "xyz.csv"   - the file selected below
CSV_FILE_NAME = "xyz.csv"

# Export the name so the %%bash cells can read it as ${CSV_FILE_NAME}.
os.environ["CSV_FILE_NAME"] = CSV_FILE_NAME

In [65]:
# The working directory is the folder holding this source file,
# i.e. the src dir of the project.
dirpath = os.getcwd()
print(f"current directory is : {dirpath}")

current directory is : /Users/davis/progs/Keras-Philosophy/src


In [66]:
# The project root is the directory one level above the working directory.
root_path = os.path.dirname(dirpath)
print(f"parent directory is : {root_path}")

parent directory is : /Users/davis/progs/Keras-Philosophy


In [67]:
# Directory holding the CSV data files, located under the project root.
# The empty last component makes os.path.join end the result with a path
# separator; other cells build file names by appending directly to data_path.
data_path = os.path.join(root_path, "data", "")
print("data directory is: " + data_path)

data direcotry is: /Users/davis/progs/Keras-Philosophy/data/


In [68]:
# Full path of the input CSV; data_path already ends with a separator.
fqfn = f"{data_path}{CSV_FILE_NAME}"
# Two arguments on purpose: print joins them with its default single space.
print("fully qualified csv file name: ", fqfn)

fully qualified csv file name:  /Users/davis/progs/Keras-Philosophy/data/xyz.csv


In [49]:
# Directory named "logs" directly under the project root.
logs_path = os.path.join(root_path, "logs")
print(f"logs directory is : {logs_path}")

logs directory is : /Users/davis/progs/Keras-Philosophy/logs


In [50]:
# Keep the logs directory under a constant-style name and export it so the
# %%bash cells can read it as ${LOG_DIR_NAME}.
LOG_DIR_NAME = os.environ["LOG_DIR_NAME"] = logs_path

In [51]:
%%bash
# Check that the variable exported from Python is visible to the shell.
# Quoted so a path containing spaces or glob characters prints unchanged.
echo "${LOG_DIR_NAME}"

/Users/davis/progs/Keras-Philosophy/logs


In [52]:
%%bash
# Preview the header row and the first records of the csv file.
# The relative path works because the working directory is the src dir.
head -n 10 "../data/${CSV_FILE_NAME}"

z,x,y
10.0,10.0,0.0
10.0,9.0,1.0
10.0,8.0,2.0
10.0,7.0,3.0
10.0,6.0,4.0
10.0,5.0,5.0
10.0,4.0,6.0
10.0,3.0,7.0
10.0,2.0,8.0


In [53]:
%%bash
# count the lines in the csv file (the data records plus one header row)
wc -l "../data/${CSV_FILE_NAME}"

      53 ../data/xyz.csv


# Use generic python to read data into pandas dataframe

In [54]:
# Load the CSV through its fully qualified path. (An earlier version of this
# cell passed the bare CSV_FILE_NAME, i.e. a relative file name, instead.)
# The first line of the file supplies the column names.
raw_dataset = pd.read_csv(fqfn, sep=",", header=0)

# Work on a copy so raw_dataset stays exactly as loaded.
dataset = raw_dataset.copy()

In [55]:
# (number of rows, number of columns) of the loaded data
dataset.shape
#dataset.head()

(50, 3)

In [56]:
# Sample 70% of the rows for training; the fixed seed makes the split
# reproducible.
train_ds = dataset.sample(frac=0.7, random_state=3367)

# Whatever is left over is divided evenly between the test and
# validation sets.
leftover_ds = dataset.drop(index=train_ds.index)
test_ds = leftover_ds.sample(frac=0.5, random_state=3367)
valid_ds = leftover_ds.drop(index=test_ds.index)

In [57]:
# Show the training split (the 70% sample of dataset).
train_ds

Unnamed: 0,z,x,y
14,11.0,8.0,3.0
23,12.0,12.0,0.0
46,4.0,3.0,1.0
6,10.0,4.0,6.0
34,12.0,1.0,11.0
27,12.0,8.0,4.0
36,1.0,1.0,0.0
47,4.0,2.0,2.0
8,10.0,2.0,8.0
2,10.0,8.0,2.0


In [58]:
# Show the test split (the sampled half of the leftover rows).
test_ds

Unnamed: 0,z,x,y
38,2.0,2.0,0.0
40,2.0,0.0,2.0
48,4.0,1.0,3.0
5,10.0,5.0,5.0
21,11.0,1.0,10.0
15,11.0,7.0,4.0
9,10.0,1.0,9.0
17,11.0,5.0,6.0


In [59]:
# Show the validation split (the leftover rows not sampled for test).
valid_ds

Unnamed: 0,z,x,y
12,11.0,10.0,1.0
18,11.0,4.0,7.0
20,11.0,2.0,9.0
25,12.0,10.0,2.0
31,12.0,4.0,8.0
41,3.0,3.0,0.0
43,3.0,1.0,2.0


In [60]:
# Show the rows left after removing the training sample
# (test and validation rows together).
leftover_ds

Unnamed: 0,z,x,y
5,10.0,5.0,5.0
9,10.0,1.0,9.0
12,11.0,10.0,1.0
15,11.0,7.0,4.0
17,11.0,5.0,6.0
18,11.0,4.0,7.0
20,11.0,2.0,9.0
21,11.0,1.0,10.0
25,12.0,10.0,2.0
31,12.0,4.0,8.0


# write the train, test and validation datasets to disk (no normalization applied)

In [61]:
# Write each split to its own CSV file in the data directory, without the
# row index. os.path.join is used because data_path already ends with a
# separator, so data_path + "/" + name produced a doubled "//" in the path.
train_ds.to_csv(os.path.join(data_path, "train.csv"), encoding="utf-8", index=False)
test_ds.to_csv(os.path.join(data_path, "test.csv"), encoding="utf-8", index=False)
valid_ds.to_csv(os.path.join(data_path, "valid.csv"), encoding="utf-8", index=False)


In [62]:
# Summary statistics are taken from the training split only, which is the
# typical choice when they are used to normalize the data.
train_stats_df = train_ds

# Transposed so each dataset column becomes a row and each statistic a column.
train_stats = train_stats_df.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
z,35.0,8.971429,3.761146,1.0,7.0,10.0,12.0,12.0
x,35.0,4.657143,3.756898,0.0,1.5,4.0,8.0,12.0
y,35.0,4.314286,3.700329,0.0,1.0,3.0,7.0,12.0
