In [1]:
import os
import pandas as pd
from itertools import islice
import re

### 1 - Load and parse raw OPD data files

In [2]:
input_dir = os.path.join('..', 'data', 'raw')
output_dir = os.path.join('..', 'data', 'interim')

# Layout entries whose name ends in a digit followed by 'D' are left out of the parse
DROPPED_NAME_PATTERN = re.compile(r'\dD$')

# Extra 'BLANK' fields appended to the layout of specific years (only 2009 needs them).
# NOTE(review): presumably these cover column ranges the 2009 helper file does not
# declare -- confirm against the 2009 documentation.
EXTRA_FIELDS = {
    '2009': [('BLANK1', '79-80'), ('BLANK2', '159'), ('BLANK3', '300-301')],
}


def read_layout_lines(sps_path):
    """Return the non-blank, stripped lines of the DATA LIST section of a helper file.

    The section is everything after the (last) line starting with 'DATA LIST' and
    before the first following line starting with '.'.

    Raises:
        ValueError: if the file has no 'DATA LIST' line or no terminating '.' line.
            Failing here avoids silently reusing bounds found in a previous file.
    """
    with open(sps_path, 'r', encoding='windows-1252') as file:
        lines = file.readlines()

    start = None
    end = None
    for i, line in enumerate(lines):
        if line.startswith('DATA LIST'):
            start = i + 1
        elif start is not None and line.startswith('.'):
            # Only a '.' that comes after 'DATA LIST' closes the section
            end = i
            break

    if start is None or end is None:
        raise ValueError(f'No complete DATA LIST section found in `{sps_path}`')

    stripped = (line.strip() for line in lines[start:end])
    return [line for line in stripped if line]


def parse_range(column_range):
    """Turn a column range such as '79-80' or '159' into an integer (start, end) pair.

    A single number yields start == end. `int` is used instead of `eval` so the text
    of the helper file is never executed and zero-padded numbers are accepted.
    """
    first, _, last = column_range.replace(' ', '').partition('-')
    start = int(first)
    end = int(last) if last else start
    return start, end


def build_layout(layout_lines, year):
    """Build the layout table (name, range, start, end, width) for one year.

    Args:
        layout_lines: lines of the form '<name> <range>', optionally carrying ' (A)'.
        year: four-digit year string, used to look up extra fields in EXTRA_FIELDS.

    Returns:
        A DataFrame sorted by 'start' with a fresh 0..n-1 index.

    Raises:
        ValueError: if a kept line has a name but no column range.
    """
    fields = []
    for line in layout_lines:
        # Split each line on the first whitespace: variable name, then column range
        parts = line.replace(' (A)', '').split(maxsplit=1)
        if not parts or DROPPED_NAME_PATTERN.search(parts[0]):
            continue
        if len(parts) != 2:
            raise ValueError(f'Cannot read a column range from layout line: {line!r}')
        fields.append((parts[0], parts[1].replace(' ', '')))
    fields.extend(EXTRA_FIELDS.get(year, []))

    records = []
    for name, column_range in fields:
        start, end = parse_range(column_range)
        records.append({
            'name': name,
            'range': column_range,
            'start': start,
            'end': end,
            'width': abs(end - start) + 1,
        })

    layout = pd.DataFrame(records, columns=['name', 'range', 'start', 'end', 'width'])
    # Stable sort keeps the file order for entries that share a start column
    return layout.sort_values('start', kind='stable').reset_index(drop=True)


# Get all .sps files in the input directory as helper files for parsing the raw OPD data files
helper_files = [
    os.path.join(input_dir, file_name)
    for file_name in sorted(os.listdir(input_dir))
    if file_name.endswith('.sps')
]
print(f'Helper files to be loaded: \n{helper_files}\n')

# Make sure the export target exists before the first file is written
os.makedirs(output_dir, exist_ok=True)

for helper_file in helper_files:
    # Take the year from the file name only, so digits in the directory path cannot match
    year_match = re.search(r'\d{4}', os.path.basename(helper_file))
    if year_match is None:
        raise ValueError(f'No four-digit year found in helper file name `{helper_file}`')
    year = year_match.group()

    ## Load the helper file into a pandas DataFrame
    helper = build_layout(read_layout_lines(helper_file), year)

    ## Load the raw OPD data file into a pandas DataFrame using the helper
    opd_file = os.path.join(input_dir, f'OPD{year}')
    print(f'Parsing and converting `{opd_file}`')
    print('...')

    # Helper columns are 1-based and inclusive; read_fwf colspecs are 0-based half-open.
    # Explicit positions (rather than cumulative widths) keep every field aligned even
    # when the layout leaves a gap between two variables.
    colspecs = [
        (int(start) - 1, int(start) - 1 + int(width))
        for start, width in zip(helper['start'], helper['width'])
    ]
    opd_df = pd.read_fwf(opd_file, colspecs=colspecs, names=helper['name'].tolist(), header=None)

    # Save the parsed OPD data to the interim directory
    output_file = os.path.join(output_dir, f'opd{year}.parquet')
    opd_df.to_parquet(output_file)
    print(f'`{output_file}` exported\n')
        

Helper files to be loaded: 
['../data/raw/opd2006.sps', '../data/raw/opd2007.sps', '../data/raw/opd2008.sps', '../data/raw/opd2009.sps', '../data/raw/opd2010.sps', '../data/raw/opd2011.sps']

Parsing and converting `../data/raw/OPD2006`
...
`../data/interim/opd2006.parquet` exported

Parsing and converting `../data/raw/OPD2007`
...
`../data/interim/opd2007.parquet` exported

Parsing and converting `../data/raw/OPD2008`
...
`../data/interim/opd2008.parquet` exported

Parsing and converting `../data/raw/OPD2009`
...
`../data/interim/opd2009.parquet` exported

Parsing and converting `../data/raw/OPD2010`
...
`../data/interim/opd2010.parquet` exported

Parsing and converting `../data/raw/OPD2011`
...
`../data/interim/opd2011.parquet` exported

