In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Location of the input CSV, resolved relative to the parent directory.
data_filename = 'sample_data.csv'
path_template = '../{filename}'
csv_path = path_template.format(filename=data_filename)

# Base names (without extension) of the exported training / prediction files.
train_file_name = "train_out"
predict_file_name = "predict_out"

In [3]:
# Read the input CSV (its first column becomes the index) and discard every
# row that contains a missing value, then preview the first rows.
df = pd.read_csv(csv_path, index_col=0).dropna()
display(df.head())

Unnamed: 0,a,b,class
0,2.747052,1.462052,1.0
1,1.588197,2.962291,0.0
2,3.120492,1.254817,1.0
3,2.201141,4.368643,0.0
4,-0.066859,3.446709,0.0


In [4]:
# Input columns that are copied into the x_0, x_1, ... feature columns below.
feature_columns = ["a", "b"]

In [5]:
# Index cutoff between the two splits: rows below it go to training, rows at
# or above it go to prediction (an 80/20 split).
# Multiply before the floor division so the result is floor(0.8 * n_rows).
# Dividing first rounded the row count down to a multiple of ten, which
# shrank the training split (e.g. 1999 rows -> 1592 instead of 1599) and
# produced a cutoff of 0 for frames with fewer than 10 rows.
n_rows = len(df.index)
training_data_cuffoff = (n_rows * 8) // 10
print(training_data_cuffoff)

1600


In [6]:
# Training split: an independent copy of the rows whose index label lies
# below the cutoff.
is_train_row = df.index < training_data_cuffoff
train_df = df.loc[is_train_row].copy()
# Show the largest, then the smallest index label that ended up in the split.
display(train_df.index.max(), train_df.index.min())

1599

0

In [7]:
# One-hot encode the class column (cast to int) into y_<class> columns.
train_class_ids = train_df['class'].astype(int)
train_labels_df = pd.get_dummies(train_class_ids).add_prefix("y_")

# Keep the label column names; they are reused when selecting output columns.
y_columns = train_labels_df.columns.values

# Attach the label columns to the training frame and preview both ends.
train_df = pd.concat([train_df, train_labels_df], axis=1)
display(train_df.head(), train_df.tail())

Unnamed: 0,a,b,class,y_0,y_1
0,2.747052,1.462052,1.0,0,1
1,1.588197,2.962291,0.0,1,0
2,3.120492,1.254817,1.0,0,1
3,2.201141,4.368643,0.0,1,0
4,-0.066859,3.446709,0.0,1,0


Unnamed: 0,a,b,class,y_0,y_1
1595,2.874107,1.03816,1.0,0,1
1596,-0.208393,1.272684,1.0,0,1
1597,0.725376,0.542169,1.0,0,1
1598,3.881996,-0.558501,1.0,0,1
1599,1.32805,4.683023,0.0,1,0


In [8]:
# Prediction split: an independent copy of the rows whose index label is at
# or above the cutoff.
is_predict_row = df.index >= training_data_cuffoff
predict_df = df.loc[is_predict_row].copy()
# Show the largest, then the smallest index label that ended up in the split.
display(predict_df.index.max(), predict_df.index.min())

1999

1600

In [9]:
# One-hot encode the class column of the prediction split into y_<class>
# columns.
predict_labels_df = pd.get_dummies(predict_df['class'].astype(int)).add_prefix("y_")

# Align the label columns with the ones derived from the training split
# (y_columns). get_dummies only emits columns for classes present in *this*
# split; without the alignment a class missing from either split would leave
# NaN label cells after the frames are concatenated, and the later
# all_df.dropna() would silently discard those rows.
# Classes absent here become all-zero columns; a class that never occurred in
# the training split has no output column and is encoded as all zeros.
predict_labels_df = predict_labels_df.reindex(columns=y_columns, fill_value=0)

# Attach the label columns to the prediction frame and preview both ends.
predict_df = pd.concat([predict_df, predict_labels_df], axis=1)
display(predict_df.head())
display(predict_df.tail())

Unnamed: 0,a,b,class,y_0,y_1
1600,2.052303,-0.87938,1.0,0,1
1601,1.645049,2.279846,1.0,0,1
1602,0.626488,6.327259,0.0,1,0
1603,1.667809,4.998536,0.0,1,0
1604,1.85566,4.939212,0.0,1,0


Unnamed: 0,a,b,class,y_0,y_1
1995,0.158777,2.90244,0.0,1,0
1996,1.268235,0.312893,1.0,0,1
1997,0.813015,4.217332,0.0,1,0
1998,0.687265,2.305148,0.0,1,0
1999,-0.662154,2.527399,0.0,1,0


In [10]:
# Stack the labelled training and prediction frames into one frame
# (training rows first) and preview both ends.
all_df = pd.concat([train_df, predict_df], axis=0)
display(all_df.head(), all_df.tail())

Unnamed: 0,a,b,class,y_0,y_1
0,2.747052,1.462052,1.0,0,1
1,1.588197,2.962291,0.0,1,0
2,3.120492,1.254817,1.0,0,1
3,2.201141,4.368643,0.0,1,0
4,-0.066859,3.446709,0.0,1,0


Unnamed: 0,a,b,class,y_0,y_1
1995,0.158777,2.90244,0.0,1,0
1996,1.268235,0.312893,1.0,0,1
1997,0.813015,4.217332,0.0,1,0
1998,0.687265,2.305148,0.0,1,0
1999,-0.662154,2.527399,0.0,1,0


In [11]:
# Build the feature (x_n) columns and the list of columns to export.

# Drop rows with missing values and move the old index into an 'index' column.
all_df = all_df.dropna().reset_index()

# One x_<position> column per configured feature, in feature_columns order.
x_columns = ['x_' + str(position) for position in range(len(feature_columns))]
for x_name, feature_name in zip(x_columns, feature_columns):
    all_df[x_name] = all_df[feature_name]

# Exported column order: all feature columns, followed by all label columns.
output_columns = np.concatenate([x_columns, y_columns])

In [12]:
# Rebuild the training frame from the combined frame: rows whose index is
# below the cutoff, renumbered from zero.
train_mask = all_df.index < training_data_cuffoff
train_df = all_df.loc[train_mask].copy().reset_index(drop=True)
display(train_df.head(), train_df.tail())

# Reduced view holding only the exported feature and label columns.
train_df_out = train_df[output_columns]
display(train_df_out.head(), train_df_out.tail())

Unnamed: 0,index,a,b,class,y_0,y_1,x_0,x_1
0,0,2.747052,1.462052,1.0,0,1,2.747052,1.462052
1,1,1.588197,2.962291,0.0,1,0,1.588197,2.962291
2,2,3.120492,1.254817,1.0,0,1,3.120492,1.254817
3,3,2.201141,4.368643,0.0,1,0,2.201141,4.368643
4,4,-0.066859,3.446709,0.0,1,0,-0.066859,3.446709


Unnamed: 0,index,a,b,class,y_0,y_1,x_0,x_1
1595,1595,2.874107,1.03816,1.0,0,1,2.874107,1.03816
1596,1596,-0.208393,1.272684,1.0,0,1,-0.208393,1.272684
1597,1597,0.725376,0.542169,1.0,0,1,0.725376,0.542169
1598,1598,3.881996,-0.558501,1.0,0,1,3.881996,-0.558501
1599,1599,1.32805,4.683023,0.0,1,0,1.32805,4.683023


Unnamed: 0,x_0,x_1,y_0,y_1
0,2.747052,1.462052,0,1
1,1.588197,2.962291,1,0
2,3.120492,1.254817,0,1
3,2.201141,4.368643,1,0
4,-0.066859,3.446709,1,0


Unnamed: 0,x_0,x_1,y_0,y_1
1595,2.874107,1.03816,0,1
1596,-0.208393,1.272684,0,1
1597,0.725376,0.542169,0,1
1598,3.881996,-0.558501,0,1
1599,1.32805,4.683023,1,0


In [13]:
# Write the training split to disk: first the reduced feature/label frame,
# then the full frame with all columns.
train_out_path = train_file_name + ".csv"
train_all_path = train_file_name + "_all.csv"
train_df_out.to_csv(train_out_path)
train_df.to_csv(train_all_path)

In [14]:
# Rebuild the prediction frame from the combined frame: rows whose index is
# at or above the cutoff, renumbered from zero.
predict_mask = all_df.index >= training_data_cuffoff
predict_df = all_df.loc[predict_mask].copy().reset_index(drop=True)
display(predict_df.head(), predict_df.tail())

# Reduced view holding only the exported feature and label columns.
predict_df_out = predict_df[output_columns]
display(predict_df_out.head(), predict_df_out.tail())

Unnamed: 0,index,a,b,class,y_0,y_1,x_0,x_1
0,1600,2.052303,-0.87938,1.0,0,1,2.052303,-0.87938
1,1601,1.645049,2.279846,1.0,0,1,1.645049,2.279846
2,1602,0.626488,6.327259,0.0,1,0,0.626488,6.327259
3,1603,1.667809,4.998536,0.0,1,0,1.667809,4.998536
4,1604,1.85566,4.939212,0.0,1,0,1.85566,4.939212


Unnamed: 0,index,a,b,class,y_0,y_1,x_0,x_1
395,1995,0.158777,2.90244,0.0,1,0,0.158777,2.90244
396,1996,1.268235,0.312893,1.0,0,1,1.268235,0.312893
397,1997,0.813015,4.217332,0.0,1,0,0.813015,4.217332
398,1998,0.687265,2.305148,0.0,1,0,0.687265,2.305148
399,1999,-0.662154,2.527399,0.0,1,0,-0.662154,2.527399


Unnamed: 0,x_0,x_1,y_0,y_1
0,2.052303,-0.87938,0,1
1,1.645049,2.279846,0,1
2,0.626488,6.327259,1,0
3,1.667809,4.998536,1,0
4,1.85566,4.939212,1,0


Unnamed: 0,x_0,x_1,y_0,y_1
395,0.158777,2.90244,1,0
396,1.268235,0.312893,0,1
397,0.813015,4.217332,1,0
398,0.687265,2.305148,1,0
399,-0.662154,2.527399,1,0


In [15]:
# Write the prediction split to disk: first the full frame with all columns,
# then the reduced feature/label frame.
predict_all_path = predict_file_name + "_all.csv"
predict_out_path = predict_file_name + ".csv"
predict_df.to_csv(predict_all_path)
predict_df_out.to_csv(predict_out_path)