# Train and Test Split

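Splits a labeled data file into a training file (a random sample of the nominal points, with the label column removed) and a validation file (the remaining nominal points plus sampled anomalies, with labels kept), according to the parameters below.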

## Parameters

1. `data_file`: The input data file, a headerless CSV whose rows have the form `x1,x2,x3...y`, where `y` is `1` if the row is a nominal point and `-1` if the row is an anomaly.

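2. `percent_anomaly`: Controls how many anomalies are added to the validation file. The number of anomaly rows drawn (sampled with replacement) is `percent_anomaly` × (number of nominal validation rows + total number of anomaly rows in `data_file`), rounded down.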

3. `train_file`: The output training file in CSV with format `x1,x2,x3...xn`

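4. `validation_file`: The output validation file in CSV with format `x1,x2,x3...y` where `y` is `1` for nominal points and `-1` for outlier points.

5. `percent_training`: The fraction of nominal points randomly sampled into the training file; the remaining nominal points go to the validation file.

6. `chunk_size`: The number of rows read from `data_file` at a time while loading.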

In [1]:
import pandas as pd

In [2]:
# --- Input ---
# Labeled source CSV; read without a header row, in pieces of `chunk_size` rows.
data_file = "/home/ralampay/workspace/pyno/data/creditcardfraud.csv"
chunk_size = 1000

# --- Split settings ---
# Fraction of nominal rows sampled into the training set.
percent_training = 0.7
# Multiplier used to size the anomaly sample added to the validation set.
percent_anomaly = 0.05

# --- Outputs ---
train_file = "~/Desktop/creditcardfraud_train.csv"
validation_file = "~/Desktop/creditcardfraud_validation.csv"

In [3]:
# Read the headerless CSV in pieces of `chunk_size` rows, then combine the
# pieces with a single pd.concat. Growing a frame with DataFrame.append inside
# the loop re-copies the accumulated rows on every iteration, and
# DataFrame.append no longer exists in pandas 2.0+.
chunks = list(pd.read_csv(data_file, header=None, chunksize=chunk_size))
data = pd.concat(chunks)

# With header=None the columns are labeled 0..n-1, so the last column (the
# label y) has the integer label `input_dim` and the features are the
# first `input_dim` columns.
input_dim = len(data.columns) - 1

# Partition rows by label: 1 = nominal, -1 = anomaly.
nominal_data = data[data[input_dim] == 1]
anomaly_data = data[data[input_dim] == -1]

# Training set: a random fraction of the nominal rows, features only (the
# label column is sliced off).
# NOTE: sample() is not seeded here, so the split differs from run to run.
training_data = nominal_data.sample(frac=percent_training).iloc[:, :input_dim]

# Validation set starts as the nominal rows that were not picked for
# training; the label column is kept.
validation_data = nominal_data.drop(training_data.index)

In [4]:
# Use only the nominal rows as the base of the validation set. On the first
# run `validation_data` contains nominal rows only, so this changes nothing;
# if the cell is run again it discards the anomalies appended by the earlier
# run, so anomalies are not counted twice or stacked on top of each other.
validation_nominal = validation_data[validation_data[input_dim] == 1]

# Size of the anomaly sample: percent_anomaly times (nominal validation rows
# plus all anomaly rows in the source data), truncated to an integer.
number_of_anomalies = int(percent_anomaly * (len(validation_nominal) + len(anomaly_data)))

print("Percent Anomaly: {}".format(percent_anomaly))
print("Number of anomalies: {}".format(number_of_anomalies))

# Sampling with replacement allows drawing more rows than `anomaly_data`
# holds, so the same anomaly row can appear more than once.
sampled_anomalies = anomaly_data.sample(n=number_of_anomalies, replace=True)

validation_data = pd.concat([validation_nominal, sampled_anomalies])

# Summary counts, recomputed from the assembled frames as a sanity check.
num_training_data = len(training_data)
num_validation_data = len(validation_data)
num_validation_anomalies = len(validation_data[validation_data[input_dim] == -1])
num_validation_nominal = len(validation_data[validation_data[input_dim] == 1])

print("Num Training Data: {}".format(num_training_data))
print("Num Validation Data: {}".format(num_validation_data))
print("Num Validation Anomalies: {}".format(num_validation_anomalies))
print("Num Validation Nominal: {}".format(num_validation_nominal))

# Both files are written without a header row and without the index column.
print("Saving training data to {}".format(train_file))
training_data.to_csv(train_file, header=False, index=False)

print("Saving validation data to {}".format(validation_file))
validation_data.to_csv(validation_file, header=False, index=False)

Percent Anomaly: 0.05
Number of anomalies: 4289
Num Training Data: 199020
Num Validation Data: 89584
Num Validation Anomalies: 4289
Num Validation Nominal: 85295
Saving training data to ~/Desktop/creditcardfraud_train.csv
Saving validation data to ~/Desktop/creditcardfraud_validation.csv
