# About this file

This file takes all of the files that have been normalized, and writes them into a new `results-{date}.csv` file. Since the results file is not saved in version control, a different results file is created each day. Again, this is only because we don't want to commit private data into version control.

To ensure that data never gets lost, we can also maintain this git project inside a Dropbox/Drive folder. This prevents any loss of work in case the files get accidentally deleted.

In [36]:
from functools import partial
import numpy as np
import pandas as pd
import re
import random
import json
import datetime
import os

import constants

# Notebook display settings: lift the cap on how many dataframe columns pandas shows
pd.options.display.max_columns = None

In [37]:
# Declare any constants
# Raw data set that is loaded into the dataframe with pd.read_csv.
# REPLACE THIS with the file path of the existing raw data set to use
# (presumably the most recent one -- confirm).
raw_file = '../data/raw.csv'
# Path where we'll create the data set with the normalized columns
results_directory = '../private/'
# Path with the normalized rows (one '<column>.csv' file per normalized column)
normalized_rows_directory = '../private/normalized_rows/'
# Path with the '<column>.json' metadata files whose 'order' entry holds the shuffled indices
indices_directory = '../private/indices/'

In [38]:
# Load the raw data set, then overwrite its header with the column names
# declared in constants.columns (names are applied positionally)
df = pd.read_csv(raw_file)
df.columns = constants.columns

In [39]:
# Loads the JSON metadata file `filename` from `indices_directory` and
# returns the value stored under its 'order' key (the shuffled indices)
def read_indices(filename):
    metadata_path = indices_directory + filename
    with open(metadata_path, 'r') as handle:
        return json.load(handle)['order']

In [40]:
# Reads the normalized values for one column from `normalized_rows_directory`.
# Returns None when `filename` does not exist there; otherwise returns the
# values of the CSV's second column as a list
# (presumably the first column is a saved row index -- confirm)
def read_normalized_rows(filename):
    path = normalized_rows_directory + filename
    if os.path.isfile(path):
        normalized = pd.read_csv(path)
        value_column = normalized.columns[1]
        return normalized[value_column].tolist()
    return None

In [41]:
# Reorders `rows` to follow `indices`: entry i of the result is rows[indices[i]],
# so the result always has len(indices) entries. Used to put the normalized
# rows back into their original locations.
# NOTE(review): this restores the original order only if the saved 'order'
# maps each original position to its shuffled position -- confirm against
# the code that writes the indices files.
def unshuffle_rows(rows, indices):
    return [rows[position] for position in indices]

In [42]:
# Number of rows in the raw data; every restored column must match it
row_count = df.shape[0]

# Replace each column that has been normalized with its normalized values,
# restored to the row order of the raw data
for col in constants.columns_to_normalize:
    # Read the normalized rows
    rows_filename = col + '.csv'
    normalized_rows = read_normalized_rows(rows_filename)

    # File doesn't exist: this column keeps its raw values
    if normalized_rows is None:
        continue

    # Read the private indices
    indices_filename = col + '.json'
    indices = read_indices(indices_filename)

    # Unshuffle the rows
    column_data = unshuffle_rows(normalized_rows, indices)

    # Fail early with a message that names the column. Without this check the
    # assignment below raises a generic pandas ValueError that does not say
    # which column's files are out of sync with the raw data.
    if len(column_data) != row_count:
        raise ValueError(
            'Column "{}": {} normalized rows restored, but the raw data has {} rows'
            .format(col, len(column_data), row_count)
        )

    # Save it inside the dataframe
    df[col] = column_data

In [43]:
# Write the merged dataframe to a results file named after today's month and
# day (local time), without the dataframe index
now = datetime.datetime.now()
formatted_date = now.strftime("%m-%d")
results_path = results_directory + 'results-{}.csv'.format(formatted_date)
df.to_csv(results_path, index=False)