# Technical Job Group DataFrame

In [31]:
import pandas as pd
import numpy as np

In [35]:
titles = pd.read_csv('../data_files/Jobs_titles_2020.csv', index_col=[0], header=0)

In [36]:
titles

Unnamed: 0,0
0,All Occupations
1,Management Occupations
2,General and Operations Managers
3,Public Relations and Fundraising Managers
4,Administrative Services and Facilities Managers
...,...
729,Floor Sanders and Finishers
730,Manufactured Building and Mobile Home Installers
731,Agricultural Engineers
732,Astronomers


## Source

In [3]:
Demo_df = pd.read_csv('../data_files/metro_demographic_by_city_state.csv', index_col=[0], header=0)
BLS = pd.read_csv('../data_files/bls_metro_2020_clean.csv', index_col=[0], header=0)

## Prepare BLS_df with only these BLS columns

In [4]:
BLS_df = BLS[['area_title', 'occ_title', 'tot_emp', 'emp_prse', 'jobs_1000']]
BLS_df.head()

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.0
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045


## Prepare Demographic_df

1. Drop city and state, already exist in BLS_df
2. Rename "area_name" to "area_title" to match BLS_df

In [5]:
Demo_df = Demo_df.drop(columns=["city","state"])
Demo_df = Demo_df.rename(columns={"area_name": "area_title"})
Demo_df.head()

Unnamed: 0,area_title,population,median_age,average_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",85075,27,49583,9368,12687,6935,25576,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,"Akron, OH",94894,33,42943,25508,10769,5611,33755,13742,31013,2164,54490,31493,1761,134,22,4765,65
2,"Albany, GA",64884,37,59234,13830,9365,7235,18004,9037,21217,948,20685,40100,1737,96,24,1241,53
3,"Albuquerque, NM",319991,35,61589,42306,66219,56998,66455,25103,105387,8437,153880,9545,126499,13606,213,6795,1016
4,"Alexandria, LA",59184,38,57892,10955,7768,5315,21019,9059,16030,1856,23883,30783,1494,206,7,924,31


## Define occ_title group

These are our predefined group that we can pass into PostGres later. For now we pass into our **isin** filter next step.

For final presentation, these job_groups can be a select drop-down to see the same analysis run against several different job categories.

i.e.: Healthcare, Foodservice, Creative...

We don't have to be tied to just Technical jobs

In [19]:
job_groups = {
    'technical' : ['Management Occupations','Computer and Information Systems Managers', 'Computer and Mathematical Occupations', 'Computer Systems Analysts','Computer User Support Specialists', 'Network and Computer Systems Administrators', 'Software Developers and Software Quality Assurance Analysts and Testers', 'Computer Occupations, All Other'],
    'healthcare' : ['Registered Nurses', 'Nurse Practitioners', 'Dental Hygienists', 'Clinical Laboratory Technologists and Technicians']
}

## Flag BLS row as istech if occ_title isin list

Creates a filtered DF where occ_title is in our Titles list defined above

In [39]:
#filteredBLS_df = BLS_df.loc[BLS_df['occ_title'].isin(job_groups['technical'])]

filteredBLS_df = BLS_df
filteredBLS_df['istech'] = np.where(filteredBLS_df["occ_title"].isin(job_groups['technical']), 1, 0)
filteredBLS_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredBLS_df['istech'] = np.where(filteredBLS_df["occ_title"].isin(job_groups['technical']), 1, 0)


Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000,istech
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.000,0
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068,1
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036,0
4,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549,0
5,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045,0
...,...,...,...,...,...,...
140045,"Worcester, MA-CT","Laborers and Freight, Stock, and Material Move...",4770.0,7.8,17.944,0
140046,"Worcester, MA-CT",Machine Feeders and Offbearers,180.0,17.5,0.660,0
140047,"Worcester, MA-CT","Packers and Packagers, Hand",1220.0,21.6,4.592,0
140048,"Worcester, MA-CT",Stockers and Order Fillers,4300.0,4.5,16.161,0


### Merge the filtered BLS with the Demographic DF on 'area_title'

This will append demographic data to each BLS row matching on 'area_title'

Note: All of Abilene, TX has the same demographic data. Each Abilene row represents a slice of the Abilene area. The demographic stats represents the entire area. So we need to group all of Abilene into a single row.

In [41]:
merged = pd.merge(filteredBLS_df, Demo_df, on=['area_title'])
merged

Unnamed: 0,area_title,occ_title,tot_emp,emp_prse,jobs_1000,istech,population,median_age,average_income,family_poverty,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,"Abilene, TX",All Occupations,66060.0,1.9,1000.000,0,85075,27,49583,9368,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
1,"Abilene, TX",Management Occupations,2910.0,4.5,44.068,1,85075,27,49583,9368,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
2,"Abilene, TX",General and Operations Managers,1320.0,7.4,20.036,0,85075,27,49583,9368,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
3,"Abilene, TX",Public Relations and Fundraising Managers,40.0,31.5,0.549,0,85075,27,49583,9368,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
4,"Abilene, TX",Administrative Services and Facilities Managers,140.0,12.3,2.045,0,85075,27,49583,9368,...,10738,29001,2253,53385,8597,18241,524,15,1860,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43146,"Waterbury, CT",Cleaners of Vehicles and Equipment,110.0,29.0,1.738,0,80603,37,43669,18030,...,18980,22290,2386,26320,15617,33398,12,41,2413,416
43147,"Waterbury, CT","Laborers and Freight, Stock, and Material Move...",820.0,8.7,12.987,0,80603,37,43669,18030,...,18980,22290,2386,26320,15617,33398,12,41,2413,416
43148,"Waterbury, CT","Packers and Packagers, Hand",230.0,24.3,3.677,0,80603,37,43669,18030,...,18980,22290,2386,26320,15617,33398,12,41,2413,416
43149,"Waterbury, CT",Stockers and Order Fillers,950.0,16.3,15.074,0,80603,37,43669,18030,...,18980,22290,2386,26320,15617,33398,12,41,2413,416


### Group by area_title

#### ToDo: properly calculate media_age, average_household_income, jobs_100 based on the population 

In [26]:
merged_grouped = merged.groupby(['area_title', 'istech'], as_index=False).agg(
    area_title = ('area_title', lambda x: x.iloc[0]),
    jobs_1000 = ('jobs_1000', 'sum'),
    tot_emp = ('tot_emp', 'sum'),
    emp_prse = ('emp_prse', 'sum'),
    population = ('population', 'sum'),
    median_age = ('median_age', 'mean'),
    average_household_income = ('average_income', 'mean'),
    family_poverty = ('family_poverty', 'sum'),
    educational_attainment_bachelors = ('educational_attainment_bachelors', 'sum'),
    educational_attainment_graduate = ('educational_attainment_graduate', 'sum'),
    educational_attainment_high_school = ('educational_attainment_high_school', 'sum'),
    educational_attainment_no_diploma = ('educational_attainment_no_diploma', 'sum'),
    educational_attainment_some_college = ('educational_attainment_some_college', 'sum'),
    race_asian = ('race_asian', 'sum'),
    race_white = ('race_white', 'sum'),
    race_black = ('race_black', 'sum'),
    race_hispanic = ('race_hispanic', 'sum'),
    race_native = ('race_native', 'sum'),
    race_islander = ('race_islander', 'sum'),
    race_two = ('race_two', 'sum'),
    race_other = ('race_other', 'sum'))

# Final Dataframe 'DRAFT'

Much work to be done during aggregation. This current draft is enough to get us playing with models.

### Check our final DF

In [27]:
# Number of metropolitan areas and column count
merged_grouped.shape

(310, 22)

In [28]:
merged_grouped.dtypes

istech                                   int64
area_title                              object
jobs_1000                              float64
tot_emp                                float64
emp_prse                               float64
population                               int64
median_age                               int64
average_household_income                 int64
family_poverty                           int64
educational_attainment_bachelors         int64
educational_attainment_graduate          int64
educational_attainment_high_school       int64
educational_attainment_no_diploma        int64
educational_attainment_some_college      int64
race_asian                               int64
race_white                               int64
race_black                               int64
race_hispanic                            int64
race_native                              int64
race_islander                            int64
race_two                                 int64
race_other   

In [29]:
merged_grouped.head(10)

Unnamed: 0,istech,area_title,jobs_1000,tot_emp,emp_prse,population,median_age,average_household_income,family_poverty,educational_attainment_bachelors,...,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other
0,0,"Abilene, TX",2759.111,182230.0,4179.6,19482175,27,49583,2145272,2905323,...,2459002,6641229,515937,12225165,1968713,4177189,119996,3435,425940,45800
1,1,"Abilene, TX",67.274,4450.0,180.0,680600,27,49583,74944,101496,...,85904,232008,18024,427080,68776,145928,4192,120,14880,1600
2,0,"Akron, OH",2768.697,862410.0,7578.7,36723978,33,42943,9871596,4167603,...,5318154,12002031,837468,21087630,12187791,681507,51858,8514,1844055,25155
3,1,"Akron, OH",100.228,31210.0,75.3,664258,33,42943,178556,75383,...,96194,217091,15148,381430,220451,12327,938,154,33355,455
4,0,"Albany, GA",2627.551,153970.0,3589.0,12457728,37,59234,2655360,1798080,...,1735104,4073664,182016,3971520,7699200,333504,18432,4608,238272,10176
5,1,"Albany, GA",70.332,4120.0,114.9,454188,37,59234,96810,65555,...,63259,148519,6636,144795,280700,12159,672,168,8687,371
6,0,"Albuquerque, NM",2787.072,1043470.0,7033.3,137276139,35,61589,18149274,28407951,...,10769187,45211023,3619473,66014520,4094805,54268071,5836974,91377,2915055,435864
7,1,"Albuquerque, NM",92.397,34590.0,80.7,2559928,35,61589,338448,529752,...,200824,843096,67496,1231040,76360,1011992,108848,1704,54360,8128
8,0,"Alexandria, LA",2717.298,155230.0,3833.3,11955168,38,57892,2212910,1569136,...,1829918,3238060,374912,4824366,6218166,301788,41612,1414,186648,6262
9,1,"Alexandria, LA",50.144,2860.0,30.0,177552,38,57892,32865,23304,...,27177,48090,5568,71649,92349,4482,618,21,2772,93


In [12]:
# write to csv
merged_grouped.to_csv('../data_files/TechnicalJobs_Not_DF_draft.csv')