In [94]:
# Imports
import arrow
import csv
from pprint import pprint
import re
import sys
import unittest

[x] The entire CSV is in the UTF-8 character set.
[x] The Timestamp column should be formatted in ISO-8601 format.
[x] The Timestamp column should be assumed to be in US/Pacific time; please convert it to US/Eastern.
[x] All ZIP codes should be formatted as 5 digits. If there are less than 5 digits, assume 0 as the prefix.
[x] The FullName column should be converted to uppercase. There will be non-English names.
[x] The Address column should be passed through as is, except for Unicode validation. Please note there are commas in the Address field; your CSV parsing will need to take that into account. Commas will only be present inside a quoted string.
[x] The FooDuration and BarDuration columns are in HH:MM:SS.MS format (where MS is milliseconds); please convert them to the total number of seconds expressed in floating point format. You should not round the result.
[x] The TotalDuration column is filled with garbage data. For each row, please replace the value of TotalDuration with the sum of FooDuration and BarDuration.
[x] The Notes column is free form text input by end-users; please do not perform any transformations on this column. If there are invalid UTF-8 characters, please replace them with the Unicode Replacement Character.
[x] You can assume that the input document is in UTF-8 and that any times that are missing timezone information are in US/Pacific. If a character is invalid, please replace it with the Unicode Replacement Character. If that replacement makes data invalid (for example, because it turns a date field into something unparseable), print a warning to stderr and drop the row from your output.

In [4]:
def to_seconds(time):
    units = [float(v) for v in time.split(':')]
    return sum((coeff * unit for coeff, unit in zip([3600, 60, 1], units)))

In [80]:
def to_iso8601(datetime, timezone):
    match = re.match(r'(\d\d?)/(\d\d?)/(\d\d?) (\d\d?)(.*)', datetime)
    time = arrow.get('20{2:0>2}-{0:0>2}-{1:0>2}T{3:0>2}{4} {5}'.format(*match.groups(), 'US/Pacific'), 'YYYY-MM-DDTHH:ss:mm A ZZZ')
    return time.to(timezone).format('YYYY-MM-DDTHH:mm:ssZZ')

In [90]:
def normalize(in_filepath, out_filepath):
    csv.register_dialect('truss', delimiter=',', escapechar=None, quoting=csv.QUOTE_MINIMAL)
    with open(in_filepath, mode='rt', encoding='utf-8', errors='replace') as raw, \
         open(out_filepath, mode='w+', encoding='utf-8', errors='ignore') as out:
        reader = csv.DictReader(raw)
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            # Timestamp : to ISO-8601 format in US/Eastern timezone
            try:
                row['Timestamp'] = to_iso8601(row['Timestamp'], 'US/Eastern')
            except AttributeError as error:
                print('WARNING: row {0}\ndropped while normalizing timestamp due to error:\n{1}'.format(list(row.values()), error))
                continue
            # Address : no changes
            # ZIP : limit to 5 digits, prefix with 0
            row['ZIP'] = '{0:0>5.5}'.format(row['ZIP'])
            # FullName : to uppercase
            row['FullName'] = row['FullName'].upper()
            # FooDuration, BarDuration : HH:MM:SS.MS format to seconds (float)
            row['FooDuration'] = to_seconds(row['FooDuration'])
            row['BarDuration'] = to_seconds(row['BarDuration'])
            # TotalDuration : replace with sum of FooDuration and BarDuration
            row['TotalDuration'] = row['FooDuration'] + row['BarDuration']
            # Notes : no changes
            writer.writerow(row)

In [103]:
class TestNormalizer(unittest.TestCase):

    def to_seconds(self):
        self.assertEqual(to_seconds('0:0:0'), 0)
        self.assertEqual(to_seconds('0:0:1.0'), 1)
        self.assertEqual(to_seconds('0:1:0'), 60)
        self.assertEqual(to_seconds('1:0:0'), 3600)
        self.assertEqual(to_seconds('0:0:32.123'), 32.123)
        self.assertEqual(to_seconds('0:1:32.123'), 92.123)
        self.assertEqual(to_seconds('1:0:32.123'), 3632.123)

    def to_iso8601(self):
        self.assertEqual(to_iso8601('1/1/00 1:01:01 AM'), '2000-01-01T04:01:01-4:00')

#if __name__ == '__main__':
#    unittest.main()

In [104]:
unittest.main()

E
ERROR: C:\Users\Rachel\AppData\Roaming\jupyter\runtime\kernel-35fe3d62-96a4-443a-ad2c-da0a7f9404ed (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute 'C:\Users\Rachel\AppData\Roaming\jupyter\runtime\kernel-35fe3d62-96a4-443a-ad2c-da0a7f9404ed'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [92]:
#normalize(*sys.argv[1:3])
normalize('sample-with-broken-utf8.csv', 'output.csv')

dropped while normalizing timestamp due to error:
'NoneType' object has no attribute 'groups'
