# Technical Job Group DataFrame

In [1]:
import pandas as pd

## Source

In [2]:
Demo_df = pd.read_csv('../data_files/metro_demographic_by_city_state.csv', index_col=[0], header=0)
BLS = pd.read_csv('../data_files/bls_50metro_2020_clean.csv', index_col=[0], header=0)

## Prepare BLS_df with only these BLS columns

In [3]:
BLS_df = BLS[['area_title', 'occ_title', 'tot_emp', 'emp_prse', 'jobs_1000']]
BLS_df.head()

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.0
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045


## Prepare Demographic_df

1. Drop city and state, already exist in BLS_df
2. Rename "area_name" to "area_title" to match BLS_df

In [4]:
Demo_df = Demo_df.drop(columns=["city","state"])
Demo_df = Demo_df.rename(columns={"area_name": "area_title"})
Demo_df.head()

Unnamed: 0,area_title,population,median_age,average_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",85075,27,49583,9368,12687,6935,25576,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,"Akron, OH",94894,33,42943,25508,10769,5611,33755,13742,31013,2164,54490,31493,1761,134,22,4765,65
2,"Albany, GA",64884,37,59234,13830,9365,7235,18004,9037,21217,948,20685,40100,1737,96,24,1241,53
3,"Albuquerque, NM",319991,35,61589,42306,66219,56998,66455,25103,105387,8437,153880,9545,126499,13606,213,6795,1016
4,"Alexandria, LA",59184,38,57892,10955,7768,5315,21019,9059,16030,1856,23883,30783,1494,206,7,924,31


## Define occ_title group

These are our predefined group that we can pass into PostGres later. For now we pass into our **isin** filter next step.

For final presentation, these job_groups can be a select drop-down to see the same analysis run against several different job categories.

i.e.: Healthcare, Foodservice, Creative...

We don't have to be tied to just Technical jobs

In [5]:
job_groups = {
    'technical' : ['Computer and Information Systems Managers', 'Computer and Mathematical Occupations', 'Computer Systems Analysts','Computer User Support Specialists', 'Network and Computer Systems Administrators', 'Software Developers and Software Quality Assurance Analysts and Testers', 'Computer Occupations, All Other'],
    'healthcare' : ['Registered Nurses', 'Nurse Practitioners', 'Dental Hygienists', 'Clinical Laboratory Technologists and Technicians']
}

## Pass in the list containing the occ_titles we wish to have returned

Creates a filtered DF where occ_title is in our Titles list defined above

In [6]:
filteredBLS_df = BLS_df.loc[BLS_df['occ_title'].isin(job_groups['technical'])]

### Merge the filtered BLS with the Demographic DF on 'area_title'

This will append demographic data to each BLS row matching on 'area_title'

Note: All of Abilene, TX has the same demographic data. Each Abilene row represents a slice of the Abilene area. The demographic stats represents the entire area. So we need to group all of Abilene into a single row.

In [7]:
merged = pd.merge(filteredBLS_df, Demo_df, on=['area_title'])
merged.head()

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000,population,median_age,average_income,family_poverty,educational_attainment_bachelors,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",Computer and Information Systems Managers,70.0,30.2,1.054,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,"Abilene, TX",Computer and Mathematical Occupations,820.0,15.3,12.372,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
2,"Abilene, TX",Computer Systems Analysts,110.0,23.0,1.674,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
3,"Abilene, TX",Computer User Support Specialists,220.0,14.5,3.374,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
4,"Abilene, TX",Network and Computer Systems Administrators,150.0,28.1,2.227,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200


### Group by area_title

#### ToDo: properly calculate media_age, average_household_income, jobs_100 based on the population 

In [8]:
merged_grouped = merged.groupby(['area_title'], as_index=False).agg(
    area_title = ('area_title', lambda x: x.iloc[0]),
    jobs_1000 = ('jobs_1000', 'sum'),
    tot_emp = ('tot_emp', 'sum'),
    emp_prse = ('emp_prse', 'sum'),
    population = ('population', 'sum'),
    median_age = ('median_age', 'mean'),
    average_household_income = ('average_income', 'mean'),
    family_poverty = ('family_poverty', 'sum'),
    educational_attainment_bachelors = ('educational_attainment_bachelors', 'sum'),
    educational_attainment_graduate = ('educational_attainment_graduate', 'sum'),
    educational_attainment_high_school = ('educational_attainment_high_school', 'sum'),
    educational_attainment_no_diploma = ('educational_attainment_no_diploma', 'sum'),
    educational_attainment_some_college = ('educational_attainment_some_college', 'sum'),
    race_asian = ('race_asian', 'sum'),
    race_white = ('race_white', 'sum'),
    race_black = ('race_black', 'sum'),
    race_hispanic = ('race_hispanic', 'sum'),
    race_native = ('race_native', 'sum'),
    race_islander = ('race_islander', 'sum'),
    race_two = ('race_two', 'sum'),
    race_other = ('race_other', 'sum'))

# Final Dataframe 'DRAFT'

Much work to be done during aggregation. This current draft is enough to get us playing with models.

### Check our final DF

In [9]:
# Number of metropolitan areas and column count
merged_grouped.shape

(155, 21)

In [10]:
merged_grouped.dtypes

area_title                              object
jobs_1000                              float64
tot_emp                                float64
emp_prse                               float64
population                               int64
median_age                               int64
average_household_income                 int64
family_poverty                           int64
educational_attainment_bachelors         int64
educational_attainment_graduate          int64
educational_attainment_high_school       int64
educational_attainment_no_diploma        int64
educational_attainment_some_college      int64
race_asian                               int64
race_white                               int64
race_black                               int64
race_hispanic                            int64
race_native                              int64
race_islander                            int64
race_two                                 int64
race_other                               int64
dtype: object

In [11]:
merged_grouped.head(10)

Unnamed: 0,area_title,jobs_1000,tot_emp,emp_prse,population,median_age,average_household_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",23.206,1540.0,175.5,595525,27,49583,65576,88809,48545,...,75166,203007,15771,373695,60179,127687,3668,105,13020,1400
1,"Akron, OH",48.893,15220.0,72.4,569364,33,42943,153048,64614,33666,...,82452,186078,12984,326940,188958,10566,804,132,28590,390
2,"Albany, GA",20.677,1210.0,110.2,389304,37,59234,82980,56190,43410,...,54222,127302,5688,124110,240600,10422,576,144,7446,318
3,"Albuquerque, NM",44.266,16570.0,78.3,2239937,35,61589,296142,463533,398986,...,175721,737709,59059,1077160,66815,885493,95242,1491,47565,7112
4,"Alexandria, LA",7.441,420.0,25.1,118368,38,57892,21910,15536,10630,...,18118,32060,3712,47766,61566,2988,412,14,1848,62
5,"Amarillo, TX",19.341,2190.0,79.6,1378349,35,67009,187187,191191,91063,...,247982,475615,49308,764267,95760,432621,4501,1015,29624,1253
6,"Ames, IA",64.999,2830.0,198.3,484323,22,35781,16394,136955,137018,...,10780,97230,47691,391027,11781,16464,1225,238,15799,98
7,"Anchorage, AK",31.069,5110.0,81.2,6069567,33,102234,334110,1334907,765198,...,403557,2140761,566433,3553074,328545,543333,420504,145761,497259,14658
8,"Ann Arbor, MI",83.208,17330.0,79.0,904190,29,74733,39249,272454,376488,...,26614,156044,108885,659127,62020,33964,1617,371,36855,1351
9,"Asheville, NC",22.527,4150.0,61.5,268704,44,65450,21930,59694,33468,...,26484,78798,636,236526,9666,17484,510,294,3474,114


In [12]:
# write to csv
merged_grouped.to_csv('../data_files/TechnicalJobs_DF_draft.csv')