In [305]:
import csv
import re

In [306]:
# Read the whole semicolon-delimited file into memory as a list of rows
# (each row is a list of strings); the first row is the header.
# newline='' is what the csv module asks for when a file object is handed to
# csv.reader, so that newlines embedded in quoted fields are parsed correctly.
with open('H1B_FY_2015.csv', newline='') as f:
    lines = list(csv.reader(f, delimiter=';'))

In [307]:
# Display the first row (the header) to see which column names are available.
lines[0]

['',
 'CASE_NUMBER',
 'CASE_STATUS',
 'CASE_SUBMITTED',
 'DECISION_DATE',
 'VISA_CLASS',
 'EMPLOYMENT_START_DATE',
 'EMPLOYMENT_END_DATE',
 'EMPLOYER_NAME',
 'EMPLOYER_ADDRESS1',
 'EMPLOYER_ADDRESS2',
 'EMPLOYER_CITY',
 'EMPLOYER_STATE',
 'EMPLOYER_POSTAL_CODE',
 'EMPLOYER_COUNTRY',
 'EMPLOYER_PROVINCE',
 'EMPLOYER_PHONE',
 'EMPLOYER_PHONE_EXT',
 'AGENT_ATTORNEY_NAME',
 'AGENT_ATTORNEY_CITY',
 'AGENT_ATTORNEY_STATE',
 'JOB_TITLE',
 'SOC_CODE',
 'SOC_NAME',
 'NAIC_CODE',
 'TOTAL WORKERS',
 'FULL_TIME_POSITION',
 'PREVAILING_WAGE',
 'PW_UNIT_OF_PAY',
 'PW_WAGE_LEVEL',
 'PW_WAGE_SOURCE',
 'PW_WAGE_SOURCE_YEAR',
 'PW_WAGE_SOURCE_OTHER',
 'WAGE_RATE_OF_PAY',
 'WAGE_UNIT_OF_PAY',
 'H-1B_DEPENDENT',
 'WILLFUL VIOLATOR',
 'WORKSITE_CITY',
 'WORKSITE_COUNTY',
 'WORKSITE_STATE',
 'WORKSITE_POSTAL_CODE']

In [308]:
def header_index(arg, header):
    """Return the position of the first header entry that matches a regex.

    Args:
        arg: Regular expression, applied with ``re.match`` (so it is anchored
            at the start of each entry).
        header: Iterable of column-name strings.

    Returns:
        Zero-based index of the first entry matching ``arg``.

    Raises:
        NameError: If no entry matches.
    """
    position = next(
        (idx for idx, column in enumerate(header) if re.match(arg, column)),
        None,
    )
    if position is None:
        raise NameError('column not found')
    return position

def populate_indices(header):
    """Locate the columns used by the analysis in a header row.

    Each column is looked up with ``header_index`` using a regex, and the
    first matching header entry wins.

    Args:
        header: Sequence of column-name strings.

    Returns:
        dict with the keys ``'status'``, ``'soc'``, ``'job_name'`` and
        ``'job_state'``, each mapped to a column index.

    Raises:
        NameError: Propagated from ``header_index`` when a column is missing.
    """
    column_patterns = (
        ('status', '.*STATUS.*'),
        ('soc', '.*SOC_CODE.*'),
        ('job_name', '.*SOC_NAME.*'),
        # NOTE(review): 'job_state' is resolved from EMPLOYER_STATE, not
        # WORKSITE_STATE -- confirm this is the intended column.
        ('job_state', '.*EMPLOYER_STATE.*'),
    )
    return {key: header_index(pattern, header) for key, pattern in column_patterns}

In [309]:
# Resolve the column positions once from the header row; the bare name on the
# last line displays the resulting mapping.
indices = populate_indices(lines[0])
indices

{'status': 2, 'soc': 22, 'job_name': 23, 'job_state': 12}

In [310]:
def check_soc_hyphen(soc):
    """Make sure a SOC code string has a '-' at index 2.

    Strings shorter than six characters are returned untouched. For longer
    strings the character at index 2 decides the outcome:

    * ``'-'``: returned as is;
    * ``'.'``: that character is replaced by ``'-'``;
    * anything else: a ``'-'`` is inserted in front of it.

    Args:
        soc: SOC code string.

    Returns:
        The string with the separator normalised as described above.
    """
    if len(soc) < 6:
        return soc

    separator = soc[2]
    if separator == '-':
        return soc
    if separator == '.':
        return soc[:2] + '-' + soc[3:]
    return soc[:2] + '-' + soc[2:]

def clean_soc(soc):
    """Normalise a raw SOC code value.

    Removes every space character, normalises the separator at index 2 via
    ``check_soc_hyphen``, and keeps at most the first seven characters.

    Args:
        soc: Raw SOC code string.

    Returns:
        The cleaned string, at most seven characters long.
    """
    without_spaces = ''.join(soc.split(' '))
    return check_soc_hyphen(without_spaces)[:7]

In [311]:
# Tallies over the rows that are CERTIFIED, have a well-formed SOC code and a
# non-empty state value.
occupation_dict = {}   # SOC code -> longest job name seen for that code
occupation_count = {}  # SOC code -> number of counted rows
state_count = {}       # state value -> number of counted rows
certified_count = 0    # total number of counted rows

# Raw string literal: '\d' inside a plain string is an invalid escape
# sequence and triggers a warning on recent Python versions. The pattern is
# compiled once here instead of being looked up on every row.
soc_pattern = re.compile(r'\d{2}-\d{4}')

for line in lines[1:]:
    if line[indices['status']] != 'CERTIFIED':
        continue

    soc = clean_soc(line[indices['soc']])
    job_name = line[indices['job_name']]
    job_state = line[indices['job_state']]
    # Skip rows whose cleaned SOC code does not start with NN-NNNN, or whose
    # state value is empty.
    if not soc_pattern.match(soc) or not job_state:
        continue

    certified_count += 1

    # Keep the longest name seen per code; on equal length the earlier one stays.
    if soc not in occupation_dict or len(job_name) > len(occupation_dict[soc]):
        occupation_dict[soc] = job_name
    occupation_count[soc] = occupation_count.get(soc, 0) + 1

    state_count[job_state] = state_count.get(job_state, 0) + 1
    

In [312]:
# Display the SOC code -> job name mapping built by the aggregation loop.
occupation_dict

{'25-1032': 'ENGINEERING TEACHERS, POSTSECONDARY',
 '17-2072': 'ELECTRONICS ENGINEERS, EXCEPT COMPUTER',
 '15-1131': 'SOFTWARE DEVELOPERS, APPLICATIONS',
 '15-1132': 'COMPUTER SOFTWARE ENGINEERS, APPLICATIONS',
 '15-2031': 'OPERATIONS RESEARCH CONSULTANTS',
 '15-1121': 'COMPUTER SOFTWARE ENGINEERS, APPLICATIONS',
 '15-1199': 'COMPUTER OCCUPATIONS, ALL OTHER: INFORMATION TECHN',
 '15-1141': 'DATABASE ADMINISTRATORS',
 '27-4032': 'FILM AND VIDEO EDITORS',
 '13-2051': 'FINANCIAL ANALYSTS',
 '17-2081': 'WATER/WASTEWATER ENGINEERS',
 '27-3031': 'PUBLIC RELATIONS SPECIALISTS',
 '15-2041': 'CLINICAL DATA MANAGERS',
 '17-2141': 'MECHANICAL ENGINEERING',
 '15-1142': 'NETWORK AND COMPUTER SYSTEMS ADMINISTRATORS*',
 '17-2021': 'AGRICULTURAL ENGINEERS',
 '13-2011': 'OPERATIONS RESEARCH ANALYSTS',
 '17-2171': 'PETROLEUM ENGINEERS',
 '19-2032': 'MATERIALS SCIENTISTS',
 '17-1011': 'ARCHITECTS, EXCEPT LANDSCAPE AND NAVAL',
 '15-1134': 'WEB DEVELOPERS',
 '13-1161': 'MARKET RESEARCH ANALYSTS AND MARKETI

In [313]:
# Display the per-SOC-code row counts built by the aggregation loop.
occupation_count

{'25-1032': 846,
 '17-2072': 5370,
 '15-1131': 81165,
 '15-1132': 88900,
 '15-2031': 6113,
 '15-1121': 108276,
 '15-1199': 53028,
 '15-1141': 7515,
 '27-4032': 75,
 '13-2051': 8199,
 '17-2081': 388,
 '27-3031': 1046,
 '15-2041': 3212,
 '17-2141': 7350,
 '15-1142': 9621,
 '17-2021': 83,
 '13-2011': 9915,
 '17-2171': 381,
 '19-2032': 667,
 '17-1011': 941,
 '15-1134': 5123,
 '13-1161': 6691,
 '15-1021': 18,
 '27-1024': 1889,
 '19-4041': 100,
 '19-1013': 375,
 '17-2041': 997,
 '11-3071': 397,
 '15-1143': 655,
 '19-2031': 1764,
 '21-1015': 105,
 '25-1021': 474,
 '11-9199': 718,
 '17-2071': 5091,
 '27-1025': 473,
 '29-1128': 42,
 '25-1031': 93,
 '13-2099': 1194,
 '29-1123': 3474,
 '11-3021': 4866,
 '13-1081': 1403,
 '25-3099': 255,
 '19-4061': 357,
 '25-1071': 1361,
 '25-9031': 486,
 '15-1133': 15382,
 '11-1011': 548,
 '17-2121': 197,
 '17-2011': 255,
 '25-2021': 1244,
 '25-4012': 126,
 '15-1034': 310,
 '13-1111': 12065,
 '11-2021': 2327,
 '17-2112': 2859,
 '23-1011': 1250,
 '29-1069': 4836,

In [314]:
# Print how many distinct state values were counted, then display the
# per-state counts themselves.
print(len(state_count))
state_count

55


{'OK': 940,
 'TX': 84626,
 'VA': 18338,
 'CA': 86066,
 'NJ': 79950,
 'WI': 3398,
 'NY': 37508,
 'MI': 19792,
 'IL': 33119,
 'NC': 17456,
 'CT': 5257,
 'DE': 2080,
 'WA': 12568,
 'MD': 23115,
 'PA': 23437,
 'MN': 3647,
 'IA': 1600,
 'KS': 1628,
 'CO': 3100,
 'AR': 1565,
 'MA': 17622,
 'FL': 15519,
 'OH': 8259,
 'LA': 1488,
 'NV': 823,
 'MS': 432,
 'MO': 3763,
 'GA': 15023,
 'SC': 982,
 'MP': 52,
 'RI': 961,
 'IN': 2536,
 'NM': 703,
 'SD': 225,
 'AZ': 3019,
 'DC': 1958,
 'TN': 3195,
 'ID': 425,
 'OR': 1607,
 'AL': 1221,
 'UT': 1262,
 'KY': 1893,
 'HI': 329,
 'GU': 423,
 'NE': 1128,
 'ME': 439,
 'ND': 357,
 'AK': 89,
 'NH': 1074,
 'WV': 274,
 'PR': 121,
 'VT': 587,
 'VI': 67,
 'MT': 81,
 'WY': 125}

In [315]:
# Display the total number of rows counted by the aggregation loop; the
# analysis cells pass it as the denominator for the percentage columns.
certified_count

547252

In [316]:
def ratio_formatted(job_count, total):
    """Express ``job_count`` as a percentage of ``total``, e.g. ``'19.8%'``.

    Args:
        job_count: Numerator; converted with ``float()`` before dividing.
        total: Denominator.

    Returns:
        The percentage rounded to one decimal place with ``round()``,
        rendered as a string with a trailing ``'%'``.

    Raises:
        ZeroDivisionError: If ``total`` is zero.
    """
    percentage = 100 * float(job_count) / total
    return '{}%'.format(round(percentage, 1))

def occupation_analysis(count, names, total, top_n=10):
    """Build a ';'-separated report of the most frequent occupations.

    Args:
        count: Mapping of occupation key -> number of certified applications.
        names: Mapping of the same keys -> name shown in the first column.
        total: Denominator used for the percentage column.
        top_n: Maximum number of data rows to report. Defaults to 10, the
            value that used to be hard-coded. Expected to be non-negative.

    Returns:
        A string made of the header line followed by up to ``top_n`` rows of
        ``name;count;percentage``, joined with newlines, largest count first.

    Raises:
        KeyError: If ``names`` is a dict that lacks one of the reported keys.
    """
    # Stable ascending sort followed by a reversal: largest counts come first
    # and keys with equal counts appear in reverse of their order in ``count``.
    ranked = sorted(count.items(), key=lambda kv: kv[1])
    ranked.reverse()

    report = ['TOP_OCCUPATIONS;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE']
    for key, occurrences in ranked[:top_n]:
        fields = (names[key], str(occurrences), ratio_formatted(occurrences, total))
        report.append(';'.join(fields))
    return "\n".join(report)
        

In [317]:
# Print the occupation report; print() is used so the embedded newlines
# render as separate lines.
print(occupation_analysis(occupation_count, occupation_dict, certified_count))

TOP_OCCUPATIONS;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE
COMPUTER SOFTWARE ENGINEERS, APPLICATIONS;108276;19.8%
COMPUTER SOFTWARE ENGINEERS, APPLICATIONS;88900;16.2%
SOFTWARE DEVELOPERS, APPLICATIONS;81165;14.8%
COMPUTER OCCUPATIONS, ALL OTHER: INFORMATION TECHN;53028;9.7%
SOFTWARE DEVELOPPERS, SYSTEMS SOFTWARE;15382;2.8%
MANAGEMENT ANALYSTS;12065;2.2%
OPERATIONS RESEARCH ANALYSTS;9915;1.8%
NETWORK AND COMPUTER SYSTEMS ADMINISTRATORS*;9621;1.8%
FINANCIAL ANALYSTS;8199;1.5%
DATABASE ADMINISTRATORS;7515;1.4%


In [320]:
def state_analysis(count, total, top_n=10):
    """Build a ';'-separated report of the states with the most applications.

    Args:
        count: Mapping of state value -> number of certified applications.
        total: Denominator used for the percentage column.
        top_n: Maximum number of data rows to report. Defaults to 10, the
            value that used to be hard-coded. Expected to be non-negative.

    Returns:
        A string made of the header line followed by up to ``top_n`` rows of
        ``state;count;percentage``, joined with newlines, largest count first.
    """
    # Stable ascending sort followed by a reversal: largest counts come first
    # and states with equal counts appear in reverse of their order in ``count``.
    ranked = sorted(count.items(), key=lambda kv: kv[1])
    ranked.reverse()

    report = ['TOP_STATES;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE']
    for state, occurrences in ranked[:top_n]:
        fields = (state, str(occurrences), ratio_formatted(occurrences, total))
        report.append(';'.join(fields))
    return "\n".join(report)

In [321]:
# Print the state report; print() is used so the embedded newlines render as
# separate lines.
print(state_analysis(state_count, certified_count))

TOP_STATES;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE
CA;86066;15.7%
TX;84626;15.5%
NJ;79950;14.6%
NY;37508;6.9%
IL;33119;6.1%
PA;23437;4.3%
MD;23115;4.2%
MI;19792;3.6%
VA;18338;3.4%
MA;17622;3.2%
