# About this file

Data normalization takes a CSV file and outputs a set of single-column public CSV files, each paired with a private ordering file.

Each public file is associated with a corresponding private file.

The public file consists of a shuffled column. The first line is the column name, and the rest of the file consists of every entry in a shuffled order.

The private file is a JSON file that maps each row entry to its correct position. The JSON should also contain other metadata about the public CSV file.

There is a corresponding file that reverses this process and writes the changes back to the raw CSV file.

In [44]:
from functools import partial
import numpy as np
import pandas as pd
import re
import random
import json

import constants

# Configure any settings
# Passing None for display.max_columns lifts pandas' limit on how many
# columns are shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [45]:
# Declare any constants
# CSV file that is loaded as the input data.
raw_file = '../data/raw.csv'
# Directory prefixes for the output files. Keep the trailing slash so that
# appending a filename directly yields a path inside the directory.
rows_output_directory = '../private/rows/'
indices_output_directory = '../private/indices/'

In [46]:
# Load the raw CSV and replace its header with the names in constants.columns.
df = pd.read_csv(raw_file)
df.columns = constants.columns
# Write a copy of the renamed frame to temp.csv in the working directory.
df.to_csv('temp.csv')

In [47]:
# Builds a randomly ordered list that holds every integer from 0 to length-1 exactly once
def generate_shuffled_indices(length):
    """Return the integers 0 .. length-1 as a list in random order."""
    permutation = list(range(length))
    # random.shuffle reorders the list in place, so the same object is returned.
    random.shuffle(permutation)
    return permutation

In [48]:
# Reorders the row values: the value at position i ends up at position indices[i]
def shuffle_rows(row_values, indices):
    """Return a new list in which row_values[i] is stored at indices[i].

    The result has one slot per entry in `indices`; any slot that no index
    points at stays None.
    """
    slot_count = len(indices)
    shuffled = [None] * slot_count
    for source_pos, target_pos in enumerate(indices):
        shuffled[target_pos] = row_values[source_pos]
    return shuffled

In [49]:
# Takes the shuffled rows, and writes them to a public folder
def write_public_column(col_name, rows, filename):
    """Write one shuffled column to a CSV file under `rows_output_directory`.

    Args:
        col_name: Header used for the single data column.
        rows: The column's entries, in the order they should be written.
        filename: Name of the CSV file; appended to `rows_output_directory`.
    """
    # DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
    # The dict constructor builds the same one-column frame on every version.
    column_frame = pd.DataFrame({col_name: rows})
    # to_csv is called with its defaults, as before, so the row index is
    # still written next to the data column.
    column_frame.to_csv(rows_output_directory + filename)

In [50]:
# Writes the shuffled indices to a file
def write_private_indices(col_name, shuffled_indices, filename):
    """Write the ordering metadata for one column as JSON.

    The JSON object has three keys: 'column' (the column name),
    'public_file' (the column name plus '.csv') and 'order' (the index list
    exactly as passed in).

    Args:
        col_name: Name of the column the ordering belongs to.
        shuffled_indices: List of integer positions; must be JSON-serializable.
        filename: Name of the JSON file, created inside
            `indices_output_directory`.
    """
    import os  # local import: the file's import cell does not bring in os

    output = {
        'column': col_name,
        'public_file': col_name + '.csv',
        'order': shuffled_indices
    }
    # os.path.join inserts the separator when the directory has no trailing
    # slash; plain concatenation produced '<dir><filename>' beside the
    # directory instead of a file inside it.
    target_path = os.path.join(indices_output_directory, filename)
    # The with-block closes the handle, so the JSON is flushed to disk even
    # if a later step fails.
    with open(target_path, 'w') as f:
        f.write(json.dumps(output))

In [51]:
# Number of rows in the frame; every column gets a permutation of this length.
row_count = df.shape[0]

# This is where the main work gets done: each column listed in
# constants.columns_to_normalize is shuffled with its own set of indices.
for col in constants.columns_to_normalize:
    # Extract the column as a plain Python list
    row_values = df[col].tolist()
    
    # Create shuffled indices (generated anew for every column)
    shuffled_indices = generate_shuffled_indices(row_count)
    
    # Shuffle the column: entry i is placed at position shuffled_indices[i]
    shuffled_rows = shuffle_rows(row_values, shuffled_indices)
    
    # Write the column to a public csv file named '<col>.csv'
    write_public_column(col, shuffled_rows, col + '.csv')
    
    # Write the shuffled indices to a private file named '<col>.json'
    write_private_indices(col, shuffled_indices, col + '.json')