# Data Cleaning and Normalization

Routines for cleaning and normalizing data for processing.

Objectives:

1. Reading a CSV file and representing it as a pandas data frame
2. Normalizing the original data frame and storing the result in a separate data frame
3. Transforming the data frame into a format readable by the autoencoder

In [None]:
# Import necessary libraries and path relative to project
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## Importing Data

In [None]:
# Load the raw CSV in fixed-size chunks and assemble it into one DataFrame.

# Chunk size (number of rows per read) used while streaming the CSV
chunksize = 10000

# Path to the raw dataset, relative to this notebook. Change this to point at
# a different CSV file if needed.
dataset_file = '../data/creditcardfraud_raw.csv'

print("Loading dataset '{}'...".format(dataset_file))

# Collect the chunks and concatenate once at the end. DataFrame.append was
# removed in pandas 2.0, and calling it per chunk re-copied the accumulated
# frame on every iteration.
chunks = []
for i, chunk in enumerate(pd.read_csv(dataset_file, chunksize=chunksize)):
    print("Reading chunk %d" % (i + 1))
    chunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader yielded no chunks
data = pd.concat(chunks) if chunks else pd.DataFrame()

# Release the per-chunk frames so only the combined frame stays in memory
del chunks

print("Done loading dataset...")

# Check for proper value of input dimensionality to be used by model
# NOTE(review): this count is taken before the Time column is dropped below;
# presumably the -1 excludes a label column -- confirm the value the model
# actually expects.
input_dim = len(data.columns) - 1
print("Input Dimensionality: %d" % (input_dim))
print(data)
print("Dropping Time column")
data = data.drop(['Time'], axis=1)
print(data)

## Normalizing the data with `MinMaxScaler`

In [None]:
# Build the min-max scaler used to rescale the loaded frame
scaler = MinMaxScaler()

# Fit the scaler on the data and rescale it in one pass, then wrap the
# returned values in a new DataFrame that reuses the original column labels
normalized_values = scaler.fit_transform(data)
df_norm = pd.DataFrame(normalized_values, columns=data.columns)

print(df_norm)