## Run script

### Import data

In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('scrape_sic'))
import scrape_sic_osha as scrape
import pickle
sys.path.insert(0, os.path.abspath('.'))

In [2]:
# Save all
# NOTE(review): both helpers live in scrape_sic_osha; presumably they scrape the
# OSHA SIC manual and pickle the results to disk -- confirm in that module.
divisions = scrape.save_divisions()
# Despite the singular name, `file_name` is a collection: later cells iterate
# over it, opening each entry as a pickle file and finally deleting it.
file_name = scrape.save_all_majors()

# Sanity check: number of division records and number of per-major files.
len(divisions), len(file_name)

(93, 83)

In [3]:
# NOTE(review): this repeats the save_all_majors() call already made in the
# previous cell and rebinds `file_name`; looks like a leftover re-run --
# confirm whether it is needed before removing it.
file_name = scrape.save_all_majors()

# Same sanity check as before: record count and file count.
len(divisions), len(file_name)

(93, 83)

### Append data

In [4]:
# Load data
# Files are opened in `with` blocks so every handle is closed as soon as its
# pickle has been read (the previous bare open() calls leaked one handle per file).
# NOTE: pickle.load can execute arbitrary code; only load pickles that were
# produced by this notebook's own scrape step.
with open('divisions_raw.pkl', 'rb') as fh:
    divisions = pickle.load(fh)

# Each entry of `file_name` is a path to a pickle holding a list of records;
# flatten them all into a single list.
majors_all = []
for f in file_name:
    with open(f, 'rb') as fh:
        majors_all.extend(pickle.load(fh))

# Sanity check: number of division records and total records across majors.
len(divisions), len(majors_all)

(93, 1421)

In [5]:
# Build one flat list: every major-level record first, then the division
# records. New list objects are created, so `majors_all` itself is untouched.
combined = list(majors_all) + list(divisions)
len(combined)

1514

### Long to wide format

In [6]:
# Index every record by its full description so parents can be looked up by name.
# If two records shared a full_desc the later one would win; comparing this
# length with len(combined) shows whether that happened.
d_combined = dict((record.full_desc, record) for record in combined)
len(d_combined)

1514

In [7]:
# Spot check: the parent_desc of the second combined record should resolve to
# an entry of d_combined (which is keyed by full_desc).
d_combined[combined[1].parent_desc]

ind_group(full_desc=u'Industry Group 011: Cash Grains', parent_desc='Major Group 01: Agricultural Production Crops', link=None)

In [8]:
# Spot check: fetch one major-group record directly by its full description.
d_combined['Major Group 10: Metal Mining']

ind_group(full_desc='Major Group 10: Metal Mining', parent_desc='Division B: Mining', link=u'sic_manual.display?id=6&tab=group')

#### SIC list

In [9]:
# Show what clean_desc returns for one record's full_desc. The cells below read
# index 0 as the code, index 1 as the level tag (e.g. 'SIC4') and index 2 as
# the description text.
scrape.clean_desc(combined[2].full_desc)

['0112', 'SIC4', 'Rice']

In [10]:
# Keep only the records whose cleaned description is tagged as a
# four-digit SIC code, then report how many there are.
sic = [
    entry
    for entry in combined
    if scrape.clean_desc(entry.full_desc)[1] == 'SIC4'
]
len(sic)

1005

In [11]:
# Build one wide row per four-digit SIC record:
#   (SIC4_cd, SIC4_desc, ind_cd, ind_desc, maj_cd, maj_desc, div_cd, div_desc)
# The loop variable is deliberately NOT called `sic`, so the `sic` list built in
# the previous cell is no longer overwritten when this cell runs.
wide = []
for record in combined:
    record_fdesc = scrape.clean_desc(record.full_desc)
    if record_fdesc[1] != 'SIC4':
        continue  # only four-digit SIC entries start a row

    # Code and description of the SIC4 entry itself.
    row = [record_fdesc[0], record_fdesc[2]]

    # Walk up three levels via parent_desc: industry group -> major group ->
    # division, appending each ancestor's code and description in that order.
    # A missing parent raises KeyError, exactly as the unrolled version did.
    node = record
    for _ in range(3):
        node = d_combined[node.parent_desc.strip()]
        node_fdesc = scrape.clean_desc(node.full_desc)
        row.extend((node_fdesc[0], node_fdesc[2]))

    wide.append(tuple(row))

# Sanity check: number of rows and number of columns per row.
len(wide), len(wide[1])

(1005, 8)

### Save and cleanup

In [12]:
import csv

In [13]:
# Save data
# Pickle streams are bytes, so the target must be opened in binary mode. This
# matches the 'rb' used wherever this notebook reads pickles; text mode 'w'
# fails on Python 3 and can corrupt the stream on Windows.
with open('osha_combined.pkl', 'wb') as f:
    pickle.dump(wide, f)

# Header order mirrors the tuples appended to `wide`.
# NOTE(review): the csv module wants 'wb' on Python 2 and newline='' on
# Python 3 to avoid blank lines on Windows; the mode is left unchanged because
# the target interpreter is not clear from this notebook -- confirm and adjust.
with open('osha_combined.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(('SIC4_cd', 'SIC4_desc', 'ind_cd', 'ind_desc', 'maj_cd', 'maj_desc', 'div_cd', 'div_desc'))
    writer.writerows(wide)

In [14]:
# Clean up (OPTIONAL)
# Delete the intermediate pickles: one file per entry of `file_name`, plus the
# raw divisions file. The deletions are plain statements rather than `assert`
# expressions: asserts are stripped under `python -O`, which would silently
# skip the cleanup, and os.remove() always returns None so the assertion
# verified nothing. A missing file still raises OSError, as before.
for path in file_name:
    os.remove(path)
os.remove('divisions_raw.pkl')