In [1]:
# import the necessary libraries
import csv
import operator
from itertools import islice
from collections import namedtuple
from datetime import datetime

In [2]:
personalinfo_data_types = ['STRING', 'STRING', 'STRING', 'STRING','STRING']
employment_data_types = ['STRING', 'STRING', 'STRING', 'STRING']
updstatus_data_types = ['STRING', 'DATETIME', 'DATETIME']
vehicles_data_types = ['STRING', 'STRING', 'STRING', 'DATE']

In [3]:
def cast(data_type, value):
    """
    To convert the data into appropriate datatypes
    """
    if data_type == 'DOUBLE':
        return float(value)
    elif data_type == 'INT':
        return int(value)
    elif data_type =='DATETIME':
        return datetime.strptime(value,'%Y-%m-%dT%H:%M:%S%z').date()
    elif data_type == 'DATE':
        return datetime.strptime(value,'%Y').date().year
    else:
        return str(value)

In [4]:
def cast_row(data_types, data_row):
    return [cast(data_type, value) 
            for data_type, value in zip(data_types, data_row)]

In [5]:
def read_file(filename, data_types, header = False):
    with open(filename) as f:
        file_iter = iter(f)
        headers = next(file_iter).strip('\n').split(',')
        tuplename = filename.split('.')[0]
        named_tuple = namedtuple(tuplename, headers, defaults=(None,) * len(headers))
        if header:
            yield headers
        for line in file_iter:
            data = line.strip('\n').split(',')
            data = cast_row(data_types, data)
            iterdata = named_tuple(*data)
            yield iterdata

## **Goal 1**

Your first task is to create iterators for each of the four files that contained cleaned up data, of the correct type (e.g. string, int, date, etc), and represented by a named tuple

In [6]:
personal_info = read_file('personal_info.csv', personalinfo_data_types)
for row in islice(personal_info,10):
    print(row)

personal_info(ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic')
personal_info(ssn='101-71-4702', first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao')
personal_info(ssn='101-84-0356', first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish')
personal_info(ssn='104-22-0928', first_name='Justinian', last_name='Kunzelmann', gender='Male', language='Dhivehi')
personal_info(ssn='104-84-7144', first_name='Claudianus', last_name='Brixey', gender='Male', language='Afrikaans')
personal_info(ssn='105-27-5541', first_name='Federico', last_name='Aggett', gender='Male', language='Chinese')
personal_info(ssn='105-85-7486', first_name='Angelina', last_name='McAvey', gender='Female', language='Punjabi')
personal_info(ssn='105-91-5022', first_name='Moselle', last_name='Apfel', gender='Female', language='Latvian')
personal_info(ssn='105-91-7777', first_name='Audi', last_name='Roach', gender='Female', language='E

In [7]:
# Creating an iterator for employment.csv using read_file() and printing the first 5 elements
employment_info = read_file('employment.csv', employment_data_types)
for row in islice(employment_info, 10):
    print(row)

employment(employer='Stiedemann-Bailey', department='Research and Development', employee_id='29-0890771', ssn='100-53-9824')
employment(employer='Nicolas and Sons', department='Sales', employee_id='41-6841359', ssn='101-71-4702')
employment(employer='Connelly Group', department='Research and Development', employee_id='98-7952860', ssn='101-84-0356')
employment(employer='Upton LLC', department='Marketing', employee_id='56-9817552', ssn='104-22-0928')
employment(employer='Zemlak-Olson', department='Business Development', employee_id='46-2886707', ssn='104-84-7144')
employment(employer='"Kohler', department=' Bradtke and Davis"', employee_id='Support', ssn='80-0975518')
employment(employer='"Roberts', department=' Torphy and Dach"', employee_id='Human Resources', ssn='77-4895332')
employment(employer='Lind-Jast', department='Marketing', employee_id='79-6418731', ssn='105-91-5022')
employment(employer='Bashirian-Lueilwitz', department='Engineering', employee_id='44-3328799', ssn='105-91-77

In [8]:
# Creating an iterator for update_status.csv using read_file() and printing the first 5 elements
updatestatus_info = read_file('update_status.csv', updstatus_data_types)
for row in islice(updatestatus_info, 10):
    print(row)

update_status(ssn='100-53-9824', last_updated=datetime.date(2017, 10, 7), created=datetime.date(2016, 1, 24))
update_status(ssn='101-71-4702', last_updated=datetime.date(2017, 1, 23), created=datetime.date(2016, 1, 27))
update_status(ssn='101-84-0356', last_updated=datetime.date(2017, 10, 4), created=datetime.date(2016, 9, 21))
update_status(ssn='104-22-0928', last_updated=datetime.date(2017, 3, 28), created=datetime.date(2016, 4, 15))
update_status(ssn='104-84-7144', last_updated=datetime.date(2018, 2, 19), created=datetime.date(2016, 3, 15))
update_status(ssn='105-27-5541', last_updated=datetime.date(2017, 7, 24), created=datetime.date(2016, 7, 23))
update_status(ssn='105-85-7486', last_updated=datetime.date(2018, 2, 14), created=datetime.date(2016, 12, 15))
update_status(ssn='105-91-5022', last_updated=datetime.date(2018, 3, 24), created=datetime.date(2016, 3, 24))
update_status(ssn='105-91-7777', last_updated=datetime.date(2017, 5, 11), created=datetime.date(2016, 5, 31))
update_st

In [9]:
# Creating an iterator for vehicles.csv using read_file() and printing the first 5 elements
vehicles_info = read_file('vehicles.csv', vehicles_data_types)
for row in islice(vehicles_info, 10):
    print(row)


vehicles(ssn='100-53-9824', vehicle_make='Oldsmobile', vehicle_model='Bravada', model_year=1993)
vehicles(ssn='101-71-4702', vehicle_make='Ford', vehicle_model='Mustang', model_year=1997)
vehicles(ssn='101-84-0356', vehicle_make='GMC', vehicle_model='Yukon', model_year=2005)
vehicles(ssn='104-22-0928', vehicle_make='Oldsmobile', vehicle_model='Intrigue', model_year=2000)
vehicles(ssn='104-84-7144', vehicle_make='Ford', vehicle_model='Crown Victoria', model_year=2008)
vehicles(ssn='105-27-5541', vehicle_make='Ford', vehicle_model='Mustang', model_year=2001)
vehicles(ssn='105-85-7486', vehicle_make='Chrysler', vehicle_model='300', model_year=2008)
vehicles(ssn='105-91-5022', vehicle_make='Isuzu', vehicle_model='Hombre Space', model_year=2000)
vehicles(ssn='105-91-7777', vehicle_make='Chevrolet', vehicle_model='Silverado 3500', model_year=2004)
vehicles(ssn='106-35-1938', vehicle_make='GMC', vehicle_model='Sonoma Club', model_year=1992)


## **Goal 2**

Create a single iterable that combines all the columns from all the iterators.

The iterable should yield named tuples containing all the columns. Make sure that the SSN's across the files match!

All the files are guaranteed to be in SSN sort order, and every SSN is unique, and every SSN appears in every file.

Make sure the SSN is not repeated 4 times - one time per row is enough!

In [10]:
header = ['ssn', 'first_name', 'last_name', 'gender', 'language','vehicle_make', 'vehicle_model', 'model_year',
             'employer', 'department', 'employee_id','last_updated', 'created']
        

class Combine:
    def __init__(self, personal_info,employment,update_status,vehicles):
        self.personal_info = personal_info
        self.employment = employment
        self.update_status = update_status
        self.vehicles = vehicles
        
    def __iter__(self):
        return Combine.combine_data(self.personal_info,self.vehicles,self.employment,self.update_status)  
    
    @staticmethod
    def combine_data(personal_info,vehicles,employment,update_status):
        
        personal_info = read_file(personal_info,personalinfo_data_types)
        employment = read_file(employment,employment_data_types)
        update_status = read_file(update_status,updstatus_data_types)
        vehicles = read_file(vehicles,vehicles_data_types)

        #creating a namedtuple
        information = namedtuple('Combination',header)

        for i in range(1000):
            data = next(personal_info)
            ssn, *data1 = next(employment)
            *data2, ssn = next(update_status)
            ssn, *data3 = next(vehicles)
            info = information(*data,*data1, *data2, *data3)
            yield info
        
        
files = 'personal_info.csv','employment.csv','update_status.csv','vehicles.csv'
combine = Combine(*files)


In [11]:
from itertools import islice

for row in islice(combine, 5):
    print(row, end="\n\n")

Combination(ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', vehicle_make='Research and Development', vehicle_model='29-0890771', model_year='100-53-9824', employer='100-53-9824', department=datetime.date(2017, 10, 7), employee_id='Oldsmobile', last_updated='Bravada', created=1993)

Combination(ssn='101-71-4702', first_name='Cayla', last_name='MacDonagh', gender='Female', language='Lao', vehicle_make='Sales', vehicle_model='41-6841359', model_year='101-71-4702', employer='101-71-4702', department=datetime.date(2017, 1, 23), employee_id='Ford', last_updated='Mustang', created=1997)

Combination(ssn='101-84-0356', first_name='Nomi', last_name='Lipprose', gender='Female', language='Yiddish', vehicle_make='Research and Development', vehicle_model='98-7952860', model_year='101-84-0356', employer='101-84-0356', department=datetime.date(2017, 10, 4), employee_id='GMC', last_updated='Yukon', created=2005)

Combination(ssn='104-22-0928', first

In [12]:
combine_iter = iter(combine)
next(combine_iter)

Combination(ssn='100-53-9824', first_name='Sebastiano', last_name='Tester', gender='Male', language='Icelandic', vehicle_make='Research and Development', vehicle_model='29-0890771', model_year='100-53-9824', employer='100-53-9824', department=datetime.date(2017, 10, 7), employee_id='Oldsmobile', last_updated='Bravada', created=1993)