# Technical Job Group DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
titles = pd.read_csv('../data_files/Jobs_titles_2020.csv', index_col=[0], header=0)

In [3]:
titles

Unnamed: 0,0
0,All Occupations
1,Management Occupations
2,General and Operations Managers
3,Public Relations and Fundraising Managers
4,Administrative Services and Facilities Managers
...,...
729,Floor Sanders and Finishers
730,Manufactured Building and Mobile Home Installers
731,Agricultural Engineers
732,Astronomers


## Source

In [4]:
Demo_df = pd.read_csv('../data_files/metro_demographic_by_city_state.csv', index_col=[0], header=0)
BLS = pd.read_csv('../data_files/bls_metro_2020_clean.csv', index_col=[0], header=0)

## Prepare BLS_df with only these BLS columns

In [5]:
BLS_df = BLS[['area_title', 'occ_title', 'tot_emp', 'emp_prse', 'jobs_1000']]
BLS_df.head(100)

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.000
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045
...,...,...,...,...,...
122,"Abilene, TX",Phlebotomists,120.0,25.2,1.816
123,"Abilene, TX","Healthcare Support Workers, All Other",80.0,37.1,1.270
124,"Abilene, TX",Protective Service Occupations,1830.0,1.6,27.738
125,"Abilene, TX",First-Line Supervisors of Correctional Officers,100.0,0.0,1.500


## Prepare Demographic_df

1. Drop city and state, already exist in BLS_df
2. Rename "area_name" to "area_title" to match BLS_df

In [6]:
Demo_df = Demo_df.drop(columns=["city","state"])
Demo_df = Demo_df.rename(columns={"area_name": "area_title"})
Demo_df.head()

Unnamed: 0,area_title,population,median_age,average_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",85075,27,49583,9368,12687,6935,25576,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,"Akron, OH",94894,33,42943,25508,10769,5611,33755,13742,31013,2164,54490,31493,1761,134,22,4765,65
2,"Albany, GA",64884,37,59234,13830,9365,7235,18004,9037,21217,948,20685,40100,1737,96,24,1241,53
3,"Albuquerque, NM",319991,35,61589,42306,66219,56998,66455,25103,105387,8437,153880,9545,126499,13606,213,6795,1016
4,"Alexandria, LA",59184,38,57892,10955,7768,5315,21019,9059,16030,1856,23883,30783,1494,206,7,924,31


## Define occ_title group

These are our predefined group that we can pass into PostGres later. For now we pass into our **isin** filter next step.

For final presentation, these job_groups can be a select drop-down to see the same analysis run against several different job categories.

i.e.: Healthcare, Foodservice, Creative...

We don't have to be tied to just Technical jobs

In [7]:
job_groups = {
    'technical' : ['Management Occupations','Computer and Information Systems Managers', 'Computer and Mathematical Occupations', 'Computer Systems Analysts','Computer User Support Specialists', 'Network and Computer Systems Administrators', 'Software Developers and Software Quality Assurance Analysts and Testers', 'Computer Occupations, All Other', 'Computer Hardware Engineers', 'Information Security Analysts', 'Computer Network Support Specialists', 'Computer Network Architects', 'Database Administrators and Architects', 'Computer Programmers', 'Web Developers and Digital Interface Designers', 'Data Scientists and Mathematical Science Occupations, All Other', 'Computer Numerically Controlled Tool Operators', 'Computer Numerically Controlled Tool Programmers', 'Computer and Information Research Scientists' ],
    'healthcare' : ['Registered Nurses', 'Nurse Practitioners', 'Dental Hygienists', 'Clinical Laboratory Technologists and Technicians']
}

## Flag BLS row as istech if occ_title isin list

Creates a filtered DF where occ_title is in our Titles list defined above

In [8]:
#filteredBLS_df = BLS_df.loc[BLS_df['occ_title'].isin(job_groups['technical'])]

filteredBLS_df = BLS_df
filteredBLS_df['istech'] = np.where(filteredBLS_df["occ_title"].isin(job_groups['technical']), 1, 0)
filteredBLS_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredBLS_df['istech'] = np.where(filteredBLS_df["occ_title"].isin(job_groups['technical']), 1, 0)


Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000,istech
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.000,0
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068,1
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036,0
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549,0
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045,0
...,...,...,...,...,...,...
140045,"Worcester, MA-CT","Laborers and Freight, Stock, and Material Move...",4770.0,7.8,17.944,0
140046,"Worcester, MA-CT",Machine Feeders and Offbearers,180.0,17.5,0.660,0
140047,"Worcester, MA-CT","Packers and Packagers, Hand",1220.0,21.6,4.592,0
140048,"Worcester, MA-CT",Stockers and Order Fillers,4300.0,4.5,16.161,0


In [16]:
filteredBLS_df.head(10)

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000,istech
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.0,0
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068,1
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036,0
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549,0
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045,0
6,"Abilene, TX",Computer and Information Systems Managers,70.0,30.2,1.054,1
8,"Abilene, TX",Industrial Production Managers,50.0,22.3,0.761,0
9,"Abilene, TX","Transportation, Storage, and Distribution Mana...",60.0,18.6,0.89,0
10,"Abilene, TX",Human Resources Managers,60.0,45.1,0.966,0
11,"Abilene, TX",Construction Managers,160.0,14.2,2.433,0


### Group by area_title

#### ToDo: properly calculate media_age, average_household_income, jobs_100 based on the population 

In [10]:
merged_grouped = filteredBLS_df.groupby(['area_title', 'istech'], as_index=False).agg(
    area_title = ('area_title', lambda x: x.iloc[0]),
    jobs_1000 = ('jobs_1000', 'mean'),
    tot_emp = ('tot_emp', 'sum'),
    emp_prse = ('emp_prse', 'mean')
    )
merged_grouped

Unnamed: 0,istech,area_title,jobs_1000,tot_emp,emp_prse
0,0,"Abilene, TX",12.048520,182230.0,18.251528
1,1,"Abilene, TX",8.409250,4450.0,22.500000
2,0,"Aguadilla-Isabela, PR",19.277743,113760.0,17.231618
3,1,"Aguadilla-Isabela, PR",14.907600,3240.0,13.120000
4,0,"Akron, OH",7.321090,859710.0,19.450928
...,...,...,...,...,...
787,1,"Youngstown-Warren-Boardman, OH-PA",3.957533,11480.0,24.280000
788,0,"Yuba City, CA",15.261576,114460.0,21.675294
789,1,"Yuba City, CA",6.723333,2680.0,16.644444
790,0,"Yuma, AZ",13.517143,166330.0,18.127586


### Merge the filtered BLS with the Demographic DF on 'area_title'

This will append demographic data to each BLS row matching on 'area_title'

Note: All of Abilene, TX has the same demographic data. Columns jobs_1000, tot_emp, and emp_prse are sums or means (math still needs work) from BLS rows. The demographic stat columns are identical because our demographic data is tied to area_title.


In [11]:
merged = pd.merge(merged_grouped, Demo_df, on=['area_title'])
merged.head(2)

Unnamed: 0,istech,area_title,jobs_1000,tot_emp,emp_prse,population,median_age,average_income,family_poverty,educational_attainment_bachelors,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,0,"Abilene, TX",12.04852,182230.0,18.251528,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,1,"Abilene, TX",8.40925,4450.0,22.5,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200


# Final Dataframe 'DRAFT'

Much work to be done during aggregation. This current draft is enough to get us playing with models.

### Check our final DF

In [12]:
# Number of metropolitan areas and column count
merged.shape

(310, 22)

In [13]:
merged.dtypes

istech                                   int64
area_title                              object
jobs_1000                              float64
tot_emp                                float64
emp_prse                               float64
population                               int64
median_age                               int64
average_income                           int64
family_poverty                           int64
educational_attainment_bachelors         int64
educational_attainment_graduate          int64
educational_attainment_high_school       int64
educational_attainment_no_diploma        int64
educational_attainment_some_college      int64
race_asian                               int64
race_white                               int64
race_black                               int64
race_hispanic                            int64
race_native                              int64
race_islander                            int64
race_two                                 int64
race_other   

In [14]:
merged

Unnamed: 0,istech,area_title,jobs_1000,tot_emp,emp_prse,population,median_age,average_income,family_poverty,educational_attainment_bachelors,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,0,"Abilene, TX",12.048520,182230.0,18.251528,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,1,"Abilene, TX",8.409250,4450.0,22.500000,85075,27,49583,9368,12687,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
2,0,"Akron, OH",7.321090,859710.0,19.450928,94894,33,42943,25508,10769,...,13742,31013,2164,54490,31493,1761,134,22,4765,65
3,1,"Akron, OH",6.404353,33910.0,18.882353,94894,33,42943,25508,10769,...,13742,31013,2164,54490,31493,1761,134,22,4765,65
4,0,"Albany, GA",13.685161,153970.0,18.692708,64884,37,59234,13830,9365,...,9037,21217,948,20685,40100,1737,96,24,1241,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,1,"Winston-Salem, NC",5.070765,22180.0,14.558824,33082,27,34263,1864,9344,...,1215,10993,1351,22926,5736,2099,57,0,846,67
306,0,"Yakima, WA",10.979800,252670.0,17.385490,46258,32,52787,7262,4255,...,12443,13692,285,20719,340,23274,494,10,1095,41
307,1,"Yakima, WA",4.708300,4250.0,13.860000,46258,32,52787,7262,4255,...,12443,13692,285,20719,340,23274,494,10,1095,41
308,0,"Yuba City, CA",15.261576,114460.0,21.675294,40959,33,60353,6881,4382,...,9092,14499,3892,19336,874,14832,258,195,1504,68


In [15]:
# write to csv
merged_grouped.to_csv('../data_files/TechnicalJobs_Not_DF_draft.csv')