In [114]:
# Imports
import arrow
import csv
import re
from sys import stdin, stdout
import unittest

In [4]:
def to_seconds(time):
    units = [float(v) for v in time.split(':')]
    return sum((coeff * unit for coeff, unit in zip([3600, 60, 1], units)))

In [80]:
def to_iso8601(datetime, timezone):
    match = re.match(r'(\d\d?)/(\d\d?)/(\d\d?) (\d\d?)(.*)', datetime)
    time = arrow.get('20{2:0>2}-{0:0>2}-{1:0>2}T{3:0>2}{4} {5}'.format(*match.groups(), 'US/Pacific'), 'YYYY-MM-DDTHH:ss:mm A ZZZ')
    return time.to(timezone).format('YYYY-MM-DDTHH:mm:ssZZ')

In [106]:
to_zip = lambda x: '{0:0>5.5}'.format(x)

In [107]:
def normalize(in_filepath, out_filepath):
    csv.register_dialect('truss', delimiter=',', escapechar=None, quoting=csv.QUOTE_MINIMAL)
    with open(in_filepath, mode='rt', encoding='utf-8', errors='replace') as raw, \
         open(out_filepath, mode='w+', encoding='utf-8', errors='ignore') as out:
        reader = csv.DictReader(raw)
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            # Timestamp : to ISO-8601 format in US/Eastern timezone
            try:
                row['Timestamp'] = to_iso8601(row['Timestamp'], 'US/Eastern')
            except AttributeError as error:
                print('WARNING: row {0}\ndropped while normalizing timestamp due to error:\n{1}'.format(list(row.values()), error))
                continue
            # Address : no changes
            # ZIP : limit to 5 digits, prefix with 0
            row['ZIP'] = to_zip(row['ZIP'])
            # FullName : to uppercase
            row['FullName'] = row['FullName'].upper()
            # FooDuration, BarDuration : HH:MM:SS.MS format to seconds (float)
            row['FooDuration'] = to_seconds(row['FooDuration'])
            row['BarDuration'] = to_seconds(row['BarDuration'])
            # TotalDuration : replace with sum of FooDuration and BarDuration
            row['TotalDuration'] = row['FooDuration'] + row['BarDuration']
            # Notes : no changes
            writer.writerow(row)

In [112]:
class TestNormalizer(unittest.TestCase):

    def to_iso8601(self):
        self.assertEqual(to_iso8601('1/1/00 1:01:01 AM'), '2000-01-01T04:01:01-4:00')
        self.assertEqual(to_iso8601('10/11/12 01:23:45 PM'), '2012-10-11T16:23:45-4:00')
        self.assertEqual(to_iso8601('10/11/12 11:23:45 PM'), '2012-10-12T02:23:45-4:00')

    def to_seconds(self):
        self.assertEqual(to_seconds('0:0:0'), 0)
        self.assertEqual(to_seconds('0:0:1.0'), 1)
        self.assertEqual(to_seconds('0:1:0'), 60)
        self.assertEqual(to_seconds('1:0:0'), 3600)
        self.assertEqual(to_seconds('0:0:32.123'), 32.123)
        self.assertEqual(to_seconds('0:1:32.123'), 92.123)
        self.assertEqual(to_seconds('1:0:32.123'), 3632.123)
        
    def to_zip(self):
        self.assertEqual(to_zip(''), '00000')
        self.assertEqual(to_zip('123'), '00123')
        self.assertEqual(to_zip('12345'), '12345')
        self.assertEqual(to_zip('1234567'), '12345')

#if __name__ == '__main__':
#    unittest.main()

In [113]:
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


In [115]:
normalize('sample-with-broken-utf8.csv', 'output.csv')

dropped while normalizing timestamp due to error:
'NoneType' object has no attribute 'groups'
