Initializing Libraries and Global Variables

In [294]:
import pickle
import pandas as pd
import psycopg2
import os
import itertools

In [295]:
# Input locations: the directory plus the fixed-width extract and pickled codebook inside it.
# NOTE(review): this is an absolute local path; consider making it configurable.
path = r'D:\\Projects\\ipums\\garrard_test\\'
dataname = r'ky_garrard_1870_88v_test_usa_00006.dat'
cbkname = r'ky_1870_cbk.pickle'
datafile = f'{path}{dataname}'
cbkfile = f'{path}{cbkname}'

# Database naming used further down the notebook.
table_name = 'census_1870'
prefix = 'cn1870_'

Pickle Functions

In [296]:
#Pickle functions
def load_pickle(filename, objectname=None):
    """Load and return the object pickled in ``path + filename``.

    Args:
        filename: file name relative to the module-level ``path`` directory.
        objectname: ignored; kept (now optional) so existing two-argument
            calls keep working. Rebinding a parameter cannot hand the loaded
            object back to the caller, so the object is returned instead.

    Returns:
        The unpickled object.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(path + filename, "rb") as infile:
        loaded = pickle.load(infile)
    print('Loaded from:', filename)
    return loaded

def save_pickle(filename, objectname):
    """Pickle ``objectname`` into ``path + filename`` and print the file name.

    Args:
        filename: file name relative to the module-level ``path`` directory.
        objectname: the object to serialize.
    """
    target = path + filename
    with open(target, "wb") as sink:
        pickle.dump(objectname, sink)
    print('Saved to:', filename)

Loading the Codebook and Creating the Slice List

In [297]:
# Read the pickled codebook; its 'slice_obj' column supplies one slice per field.
# NOTE(review): read_pickle unpickles the file, so it must come from a trusted source.
cbk = pd.read_pickle(cbkfile)
# Field lengths as strings; not used again in the visible part of the notebook.
x = cbk['length'].astype('str')
slice_list = cbk['slice_obj'].tolist()
# Show only the first ten slices as a sanity check.
print('slice_list:\n', slice_list[:10])

slice_list:
 [slice(0, 4, None), slice(4, 12, None), slice(12, 14, None), slice(14, 16, None), slice(16, 20, None), slice(20, 21, None), slice(21, 23, None), slice(23, 25, None), slice(25, 27, None), slice(27, 31, None)]


Database Functions (psycopg2)

In [298]:
def open_database(database):
    """Open and return a psycopg2 connection to ``database``.

    User, password, host and port are taken from the PGUSER, PGPASSWORD,
    PGHOST and PGPORT environment variables when they are set, and fall back
    to the values this notebook has always used otherwise.

    Args:
        database: name of the database to connect to.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # NOTE(review): the fallback password is still embedded here for backward
    # compatibility; set PGPASSWORD and remove the literal before sharing.
    conn = psycopg2.connect(
        dbname=database,
        user=os.environ.get('PGUSER', 'postgres'),
        password=os.environ.get('PGPASSWORD', '3701'),
        host=os.environ.get('PGHOST', 'localhost'),
        port=os.environ.get('PGPORT', '5432'),
    )
    return conn

def create_cursor():
    """Return a new cursor obtained from the module-level ``conn``."""
    return conn.cursor()

def set_census_schema():
    """Run ``SET SCHEMA 'census'`` on the module-level ``cursor``."""
    statement = "SET SCHEMA 'census'"
    cursor.execute(statement)

def close_database():
    """Commit pending work on the module-level ``conn`` and close it.

    If the commit fails, the transaction is rolled back and the original
    exception is re-raised. The connection is closed in every case so a
    failed commit does not leave it open.
    """
    try:
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

Data Processing Functions

In [299]:
def process_line(line, slices):
    """Cut one fixed-width record into its field strings.

    Args:
        line: one record as read from the data file. A trailing newline is
            optional; a line without one is treated as if it had one.
        slices: iterable of slice objects, one per field, applied to the
            line in order.

    Returns:
        list of str: the extracted fields in slice order. Extraction stops
        at the first field whose characters use up the rest of the line,
        terminator included; that field and any later ones are not returned.
    """
    # The character budget below counts the line terminator. Without this
    # normalization a final line lacking "\n" would hit a budget of zero on
    # its last field and silently drop it.
    if not line.endswith('\n'):
        line += '\n'
    remaining = len(line)
    fields = []
    for field_slice in slices:
        field = line[field_slice]
        remaining -= len(field)
        if remaining <= 0:
            break
        fields.append(field)
    return fields

def insert_tuple(tablename, ph, tupledata):
    """Insert one row into ``tablename`` and print the row(s) the database returns.

    The whole tuple is bound as a single query parameter, so the statement
    carries exactly one ``%s`` placeholder regardless of the field count.

    Args:
        tablename: target table name. It is concatenated into the SQL text
            as-is, so it must come from trusted configuration.
        ph: kept for backward compatibility and no longer used. A string with
            one ``%s`` per field cannot match the single bound parameter and
            made the statement fail.
        tupledata: tuple holding the column values of the new row.

    Returns:
        list: the rows produced by ``returning *``.

    Raises:
        Any exception raised by the cursor or connection; the transaction is
        rolled back before the exception is re-raised.
    """
    query = "insert into " + tablename + " values %s returning *"
    print(query)
    try:
        # Note the trailing comma: the tuple itself is the one parameter.
        cursor.execute(query, (tupledata,))
        rs = cursor.fetchall()
        conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in a failed transaction.
        conn.rollback()
        raise
    for row in rs:
        print(row)
    return rs

In [300]:
#cursor.execute("ROLLBACK")

Open the Database

In [301]:
# Connect to the 'test' database. `conn` is the module-level connection that
# create_cursor(), insert_tuple() and close_database() read as a global.
conn = open_database('test')
# create_cursor() uses the global `conn`, so it must run after the line above.
cursor = create_cursor()
# Executes SET SCHEMA 'census' through the global `cursor`.
set_census_schema()

Process the Datafile (line by line) and Upload it to the Database

In [302]:
#input (
#p = process from zero (set rec_count=0) - are you sure? This will wipe out resume data (y = proceed, anything else = repeat input)
#r = read json, resume from (processed_count + 1) - - starting from record # (processed_count). are you sure?
#x = break
#else = repeat input loop)
rec_count = 0
# Number of records to read in this run. It was previously assigned only in a
# later cell, so this cell failed with NameError on a fresh kernel.
records = 4
# Stays 0 when nothing is read, so the summary below can always print.
total_fields = 0

# islice yields at most `records` lines and stops early at end of file.
with open(datafile) as f:
    for line in itertools.islice(f, records):
        data = tuple(process_line(line, slice_list))
        print(type(data),data)
        total_fields = len(data)
        # One placeholder only: insert_tuple binds the whole tuple as a single parameter.
        ph = '%s '
        #insert_tuple(table_name, ph, data)
        rec_count += 1
print('Total fields:',total_fields)
print('Total records processed:', rec_count)
#save rec_count as processed_count in the json file

In [371]:
#filename - first a scalar, then a list to process more (* in a directory, or list of filenames)
#records is the number of records to process in this cycle (capped at the end of file)
rec_count = 1          # 1-based number of the record about to be read
processed_count = 0    # records handled by earlier runs; these are read and skipped
records = 4
end_range = processed_count + 1 + records
# Stays 0 when no record is processed, so the summary below can always print.
total_fields = 0
# now proceeding to process the records
with open(datafile) as f:
    # Read at most end_range - 1 lines (skipped + new); islice also stops at end of file.
    for line in itertools.islice(f, max(end_range - 1, 0)):
        print('record number:', rec_count)
        if rec_count <= processed_count:
            print('skipped')
            rec_count += 1
            continue
        data = tuple(process_line(line, slice_list))
        print(type(data), data)
        total_fields = len(data)
        # One placeholder only: insert_tuple binds the whole tuple as a single parameter.
        ph = '%s '
        #insert_tuple(table_name, ph, data)
        rec_count += 1
print('Total fields:',total_fields)
print('Start record:',processed_count+1)
print('End record:',rec_count-1)
print('Total records processed:', (rec_count - processed_count - 1))
#save rec_count as processed_count in the json file

record number: 1
skipped
record number: 2
skipped
record number: 3
skipped
record number: 4
skipped
record number: 5
skipped
record number: 6
skipped
record number: 7
skipped
record number: 8
skipped
record number: 9
<class 'tuple'> ('1870', '02095399', '06', '13', '0006', '2', '32', '51', '21', '0810', '2100810', '1', '000', '0000', '0000', '0000000', '01', '1', '0000000', '0001597', '840', '1', '0', '000', '00', '1', '02', '0', '1', '0', '0', '00073261', '0006', '2F3F2E08-B1F6-4876-A9AD-0ABCE5DEDAA6', '0003', '01', '05', '00', '0', '0', '00', '0', '0', '00', '0', '0', '0', '0', '99', '99', '10', '1001', '10', '1', '002', '99', '00', '1868', '1', '1', '100', '0', '000', '021', '02100', '1', '0', '0', '1', '0', '0', '0999', '999', '000', '00000000', '00000000', '03', 'EEEA2A48-4968-4392-9766-D28B1727C59D', '01', '                     ', '0', '0', '0', '0', '0', '0', '0', '3.0-1.1')
record number: 10
<class 'tuple'> ('1870', '02095399', '06', '13', '0006', '2', '32', '51', '21', '0810',

Commit and Close


In [None]:
# Commit pending work and close the module-level connection `conn`.
close_database()

Inserting the tuples as a single value  

query = """  
    insert into t values %s  
    returning *  
"""
my_tuple = (2, 'b')  

cursor.execute(query, (my_tuple,)) # Notice the comma after my_tuple  
rs = cursor.fetchall()  
conn.commit()  
for row in rs:  
    print(row)  