In [181]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import concurrent.futures
import us
from ipumspy import readers
from scripts.clean_ipums import get_CPS
import plotly.graph_objects as go
import re

# Final Analysis for Research Brief

## Data

We request monthly CPS data from 1990 to 2024, covering economic and educational outcomes along with demographic characteristics such as race, geography, disability status, and parental status.

Note that this will take some time, given the number of datasets we're requesting from IPUMS. We're multithreading though, which should speed this up quite a bit.

In [None]:
load_dotenv()  # expects a local .env file containing IPUMS_API_KEY="xxx"
os.makedirs('finaldata', exist_ok=True)  # make dir for files

my_vars = {'AGE', 'SEX', 'RACE', 'HISPAN', 'NCHILD',
           'EMPSTAT', 'LABFORCE', 'SCHLCOLL', 'STATEFIP'}  # vars requested for every year
start, end = 1990, 2024  # year range (inclusive)
# years < 2008 don't have 'DIFFANY', which causes exceptions when requesting data
vars_dict = {yr: my_vars if yr < 2008 else my_vars | {'DIFFANY'}
             for yr in range(start, end + 1)}

# Submit one single-year CPS request per year; at most two run concurrently.
failed_years = {}  # year -> exception, kept so failed requests can be inspected/retried
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as exe:
    future_to_year = {
        exe.submit(get_CPS,          # request function
                   yr,               # year
                   vars_dict[yr],    # variables for that year
                   f'NEET_{yr}',     # file name
                   'finaldata'): yr  # data directory
        for yr in range(start, end + 1)
    }
    for future in concurrent.futures.as_completed(future_to_year):
        yr = future_to_year[future]  # year, for error reporting
        try:
            future.result()  # re-raises whatever get_CPS raised in the worker thread
        except Exception as exc:
            # report the actual error instead of silently discarding it
            failed_years[yr] = exc
            print(f'Year {yr} generated an exception: {exc!r}')

if failed_years:
    print(f'Requests failed for years: {sorted(failed_years)}')

Let's now load the data in. We can make some specifications which should speed this up a bit as well:
- only respondents aged 16-24
- only respondents not surveyed in June, July and August
- only respondents not part of the ASEC
- only respondents with recorded enrollment and employment statuses

In [20]:
# Read each year's extract in chunks, keep only the analysis sample, and
# concatenate everything once at the end (repeatedly concatenating onto a
# growing frame re-copies all earlier years on every iteration).
import warnings
warnings.filterwarnings(action='ignore',
                        category=readers.CitationWarning)  # keep the citation reminder out of the cell output

months = [i for i in range(1, 13) if i not in [6, 7, 8]]  # exclude summer months


def _keep_rows(chunk):
    '''Return the rows of one CPS chunk that belong to the analysis sample.'''
    in_sample = (
        (chunk['AGE'] >= 16) &
        (chunk['AGE'] <= 24) &
        (chunk['MONTH'].isin(months)) &
        # ASECFLAG of 2 or missing is treated as "not part of the ASEC"
        ((chunk['ASECFLAG'] == 2) | (chunk['ASECFLAG'].isnull())) &
        # recorded employment and enrollment statuses only
        (chunk['EMPSTAT'].isin(range(10, 37))) &
        (chunk['SCHLCOLL'].isin(range(1, 6)))
    )
    return chunk[in_sample]


def _load_year(yr):
    '''Read the NEET_<yr> extract from finaldata/ and return its filtered rows.'''
    ddi = readers.read_ipums_ddi(os.path.join('finaldata', f'NEET_{yr}.xml'))
    chunks = readers.read_microdata_chunked(
        ddi,
        filename=os.path.join('finaldata', f'NEET_{yr}.dat.gz'),
        chunksize=10000)
    return pd.concat([_keep_rows(chunk) for chunk in chunks], ignore_index=True)


timetrends = pd.concat([_load_year(yr) for yr in range(start, end + 1)],
                       ignore_index=True)  # master df across all years

In [21]:
# Row counts per ASECFLAG value, including missing values.
timetrends['ASECFLAG'].value_counts(dropna=False)

ASECFLAG
<NA>    4094808
2        505280
Name: count, dtype: Int64

Let's quickly check to see if everything looks right.

In [22]:
# Row counts per survey year, in chronological order.
timetrends['YEAR'].value_counts().sort_index()

YEAR
1990    167253
1991    162629
1992    156885
1993    151236
1994    146105
1995    140486
1996    125573
1997    126238
1998    125980
1999    127317
2000    128052
2001    137020
2002    147484
2003    146914
2004    145181
2005    144497
2006    144000
2007    141154
2008    139853
2009    140537
2010    140636
2011    137225
2012    133728
2013    131956
2014    131935
2015    129468
2016    127044
2017    123214
2018    116837
2019    110647
2020    100619
2021     97712
2022     92702
2023     90768
2024     91203
Name: count, dtype: Int64

In [23]:
# Row counts per month; months 6-8 should be absent after the load filter.
timetrends['MONTH'].value_counts().sort_index()

MONTH
1     515986
2     512933
3     505280
4     513984
5     512350
9     510744
10    511733
11    510247
12    506831
Name: count, dtype: Int64

In [24]:
# Row counts per age; only ages 16-24 should appear after the load filter.
timetrends['AGE'].value_counts().sort_index()

AGE
16    573079
17    566895
18    521789
19    487507
20    483386
21    486910
22    488240
23    492329
24    499953
Name: count, dtype: Int64

Okay, that all looks good. Let's map our variables now.

In [25]:
# Shared masks for the two outcome flags
not_enrolled = timetrends['SCHLCOLL'] == 5
not_employed = timetrends['EMPSTAT'].isin(range(20, 37))
out_of_labor_force = timetrends['LABFORCE'] == 1

# NEET: EMPSTAT 20-36 and SCHLCOLL 5
timetrends['NEET'] = 'not_neet'
timetrends.loc[not_employed & not_enrolled, 'NEET'] = 'neet'

# NLE: like NEET, but only those not in school and not in the labor force
# (the column reuses the 'neet' / 'not_neet' labels)
timetrends['NLE'] = 'not_neet'
timetrends.loc[out_of_labor_force & not_enrolled, 'NLE'] = 'neet'

# Gender
gender_codes = {1: 'men', 2: 'women'}
timetrends['sex'] = timetrends['SEX'].map(gender_codes)

# Race: the first matching rule wins, anything unmatched becomes 'other'
non_hispanic = timetrends['HISPAN'] == 0
race_rules = [
    ('aian', (timetrends['RACE'] == 300) & non_hispanic),
    ('white', (timetrends['RACE'] == 100) & non_hispanic),
    ('black', (timetrends['RACE'] == 200) & non_hispanic),
    ('asian', (timetrends['RACE'].isin(range(650, 653))) & non_hispanic),
    ('hispanic', (timetrends['HISPAN'] > 0) & (timetrends['HISPAN'] < 902)),
]
race_choices = [label for label, _ in race_rules]
race_codes = [mask for _, mask in race_rules]
timetrends['race_cat'] = np.select(race_codes, race_choices, default='other')

# State: zero-padded STATEFIP string -> state name
state_names = us.states.mapping('fips', 'name')  # dictionary with STATEFIP as key
state_names['11'] = 'District of Columbia'  # DC isn't included, so manually add it
timetrends['state'] = (timetrends['STATEFIP']
                       .astype(str)
                       .str.zfill(2)
                       .map(state_names))

# Kids: NCHILD 0 -> 'no_child', 1-9 -> 'has_child'
child_dict = {n: ('no_child' if n == 0 else 'has_child') for n in range(10)}
timetrends['kids'] = timetrends['NCHILD'].map(child_dict)

# Disability
disability_dict = {1: 'no_dis', 2: 'has_dis'}
timetrends['dis'] = timetrends['DIFFANY'].map(disability_dict)

# Looking-for-work status, from EMPSTAT code ranges; anything else stays ''
timetrends['looking_for_work'] = ''
empstat_ranges = {'employed': range(10, 13),
                  'looking': range(20, 23),
                  'not_looking': range(30, 37)}
for status_label, empstat_codes in empstat_ranges.items():
    timetrends.loc[timetrends['EMPSTAT'].isin(empstat_codes), 'looking_for_work'] = status_label

Let's get our NEET calculator function now.

In [26]:
def get_neet(df=None,
             query_str=None,
             groupings=None):
    '''Return weighted NEET rates (in percent) for each group.

    Parameters
    ----------
    df : DataFrame, optional
        Person-level data with a 'WTFINL' weight column. Defaults to the
        notebook-level ``timetrends`` frame, looked up at call time so a
        reloaded ``timetrends`` is picked up without redefining this function.
    query_str : str, optional
        ``DataFrame.query`` expression used to subset ``df`` first.
    groupings : list of str
        Columns to group by. The LAST entry must be the status column whose
        values are 'neet' / 'not_neet' (e.g. 'NEET' or 'NLE'); at least one
        other column must precede it.

    Returns
    -------
    Series named 'neet_rate', indexed by every grouping except the last.

    Raises
    ------
    ValueError
        If ``groupings`` is empty.
    '''
    if df is None:
        df = timetrends
    if not groupings:
        raise ValueError('groupings must name at least one grouping column '
                         'followed by the NEET status column')
    if query_str:
        df = df.query(query_str)
    # summed weights, one column per status label; a group with no
    # observations for a label has zero weight there, not a missing value
    agg_wt = df.groupby(groupings)['WTFINL'].sum().unstack(fill_value=0)
    neet_rate = agg_wt['neet'] / (agg_wt['neet'] + agg_wt['not_neet']) * 100
    return neet_rate.rename('neet_rate')

## Analysis

### 1. Overall Time Trend

In [27]:
# NEET rate by year (rows) and sex (columns); the argument-less .reindex()
# that used to sit in this chain was a no-op and has been dropped.
overall = get_neet(groupings=['YEAR', 'sex', 'NEET']).unstack()
overall

sex,men,women
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
1990,10.017256,18.554956
1991,11.501997,19.691814
1992,11.518769,18.98212
1993,10.82505,18.30388
1994,10.695621,17.644251
1995,10.512726,17.586979
1996,10.425318,16.629541
1997,10.019295,15.197452
1998,9.366412,14.474526
1999,9.280989,14.400116


In [28]:
fig = go.Figure()
colors = {'men': 'skyblue', 'women': 'pink'}

# one line per sex, men first
for sex_label, line_color in colors.items():
    trace = go.Scatter(
        x=overall.index,
        y=overall[sex_label].round(2),
        mode='lines+markers',
        name=sex_label.title(),
        line={'color': line_color, 'width': 3},
    )
    fig.add_trace(trace)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET rates for 16-24 y.o. (1990-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'xaxis': {'title': {'text': '<b>Year'}},
    'yaxis': {'title': {'text': '<b>NEET rate (%)'}},
    'legend': {'orientation': 'h'},
    'width': 650,
    'height': 400,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
fig.update_yaxes(range=[5, 25])
fig.show()

### 2. By Disability and Parental Status

#### Disability

In [29]:
# NEET rate by year (rows); the two unstack calls move sex, then dis,
# into the columns, giving (sex, dis) column pairs.
disability = get_neet(groupings=['YEAR','dis', 'sex', 'NEET']).unstack().unstack()
disability

sex,men,men,women,women
dis,has_dis,no_dis,has_dis,no_dis
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2008,35.383201,11.490264,31.522891,13.810852
2009,37.390557,13.507928,36.844796,14.496779
2010,36.908343,13.968382,37.937467,14.622856
2011,41.337963,12.571788,36.292516,14.421134
2012,39.829024,11.959576,36.648988,13.91066
2013,40.311276,12.823393,39.476145,14.720988
2014,38.80265,11.920432,37.664481,13.938124
2015,38.138646,10.994551,36.490799,13.053425
2016,36.59179,10.609934,38.695747,12.024602
2017,37.037077,9.638086,33.962683,11.201026


In [30]:
fig = go.Figure()
colors_dis = {('men', 'has_dis'): 'darkblue', ('men', 'no_dis'): 'teal',
              ('women', 'has_dis'): 'darkred', ('women', 'no_dis'): 'salmon'}

# one line per (sex, disability status) column, drawn in dict order
for group_key, line_color in colors_dis.items():
    trace = go.Scatter(
        x=disability.index,
        y=disability[group_key].round(2),
        mode='lines+markers',
        name=str(group_key),
        line={'color': line_color, 'width': 3},
    )
    fig.add_trace(trace)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET rates for 16-24 y.o. by disability status (2008-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'xaxis': {'title': {'text': '<b>Year'}},
    'yaxis': {'title': {'text': '<b>NEET rate (%)'}},
    'legend': {'orientation': 'v',
               'title': {'text': '<b>group</b><br>(click to toggle lines)'}},
    'width': 700,
    'height': 450,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
fig.update_yaxes(range=[0, 45])
fig.show()

#### Parent Status

In [31]:
# NEET rate by year (rows); the two unstack calls move sex, then kids,
# into the columns, giving (sex, kids) column pairs.
parents = get_neet(groupings=['YEAR','kids', 'sex', 'NEET']).unstack().unstack()
parents

sex,men,men,women,women
kids,has_child,no_child,has_child,no_child
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1990,12.682835,9.825847,51.240403,10.047968
1991,14.447692,11.297857,53.760222,10.758971
1992,15.510384,11.23848,51.506941,10.384993
1993,14.324418,10.574446,49.096433,10.062685
1994,13.164635,10.511311,46.836758,9.672197
1995,13.748654,10.247533,45.963538,9.667492
1996,13.434661,10.168428,42.756678,9.589749
1997,12.738879,9.815989,39.909948,9.006265
1998,11.762226,9.185394,38.724209,8.450738
1999,12.24281,9.057339,38.10843,8.660436


In [32]:
fig = go.Figure()
colors_parents = {('men', 'has_child'): 'darkblue', ('men', 'no_child'): 'teal',
                  ('women', 'has_child'): 'darkred', ('women', 'no_child'): 'salmon'}

# one line per (sex, parental status) column, drawn in dict order
for group_key, line_color in colors_parents.items():
    trace = go.Scatter(
        x=parents.index,
        y=parents[group_key].round(2),
        mode='lines+markers',
        name=str(group_key),
        line={'color': line_color, 'width': 3},
    )
    fig.add_trace(trace)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET rates for 16-24 y.o. by parental status (1990-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'xaxis': {'title': {'text': '<b>Year'}},
    'yaxis': {'title': {'text': '<b>NEET rate (%)'}},
    'legend': {'orientation': 'v',
               'title': {'text': '<b>group</b><br>(click to toggle lines)'}},
    'width': 700,
    'height': 450,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
fig.update_yaxes(range=[0, 55])
fig.show()

### 3. By Race

In [33]:
# NEET rate by year (rows); the two unstack calls move sex, then race_cat,
# into the columns, giving (sex, race_cat) column pairs.
race = get_neet(groupings=['YEAR', 'race_cat', 'sex', 'NEET']).unstack().unstack()
race

sex,men,men,men,men,men,men,women,women,women,women,women,women
race_cat,aian,asian,black,hispanic,other,white,aian,asian,black,hispanic,other,white
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1990,17.792061,6.088405,19.42446,12.196839,8.286094,7.851776,28.946565,12.246194,28.6311,30.824047,12.052036,14.283629
1991,16.483241,7.679942,21.526662,14.435081,11.583538,9.019072,32.591962,14.559935,30.000238,31.563147,18.751884,15.223325
1992,17.195371,7.144802,22.002682,15.105539,14.590362,8.787811,31.072298,13.8638,29.498148,30.691338,25.411884,14.387876
1993,14.737228,6.680116,20.802851,13.984624,11.611785,8.259422,32.492373,13.758611,27.065046,31.834467,15.510439,13.555402
1994,15.303534,6.242215,19.754737,13.54542,13.252633,8.356211,26.106834,11.244947,25.016533,31.537457,22.784708,13.417921
1995,17.434698,5.826515,19.904843,14.114436,9.602006,7.968466,21.210123,13.481402,23.902497,31.889579,15.395688,13.492568
1996,17.679509,6.999938,19.731376,13.625734,17.360115,7.877675,21.834056,12.408812,23.21799,29.969848,10.380543,12.530164
1997,18.918615,6.412437,21.206018,12.417704,14.274826,7.161822,28.796965,11.422062,21.387577,25.886326,18.09019,11.533193
1998,17.84263,7.631807,18.444077,11.843369,10.421197,6.889832,28.058792,8.663242,19.246553,25.443863,20.757087,11.072472
1999,18.67859,7.22905,17.8814,11.552977,8.890147,6.98945,22.73404,9.188241,18.455769,26.193738,11.876421,11.047331


In [34]:
fig = go.Figure()
# (sex, race) columns to draw; 'aian' and 'other' are left off the chart
race_cols = [(sex_label, race_label)
             for sex_label in ['men', 'women']
             for race_label in ['white', 'black', 'asian', 'hispanic']]

# no explicit colours here: each trace takes the template's default colour
for group_key in race_cols:
    trace = go.Scatter(
        x=race.index,
        y=race[group_key].round(2),
        mode='lines+markers',
        name=str(group_key),
        line={'width': 3},
    )
    fig.add_trace(trace)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET rates for 16-24 y.o. by race (1990-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'xaxis': {'title': {'text': '<b>Year'}},
    'yaxis': {'title': {'text': '<b>NEET rate (%)'}},
    'legend': {'orientation': 'v',
               'title': {'text': '<b>group</b><br>(click to toggle lines)'}},
    'width': 800,
    'height': 500,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
fig.update_yaxes(range=[0, 35])
fig.show()

### 4. By State

In [35]:
# NEET rate by year (rows); the two unstack calls move sex, then state,
# into the columns, giving (sex, state) column pairs.
states = get_neet(groupings=['YEAR', 'state', 'sex', 'NEET']).unstack().unstack()
states

sex,men,men,men,men,men,men,men,men,men,men,...,women,women,women,women,women,women,women,women,women,women
state,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1990,17.789268,13.022989,10.059219,12.342674,10.690877,5.876648,9.112945,10.004748,12.339956,9.60494,...,11.378314,21.614681,21.539057,15.071003,14.243732,12.599972,17.098819,28.250846,8.481596,16.616566
1991,11.663873,12.982497,12.172687,10.351329,11.44855,10.175277,8.094354,13.335176,16.793386,10.194651,...,14.267746,22.657062,22.346719,12.486357,13.926093,16.748431,18.26651,31.528908,11.336364,18.468111
1992,12.384915,13.447323,12.435461,9.905128,13.086376,8.351902,12.344283,8.70518,18.748204,10.899828,...,11.679153,20.090656,22.012445,14.518012,10.817444,17.184255,16.229321,30.541229,11.087206,15.09017
1993,11.685396,13.70549,9.523923,12.639078,12.891591,10.481227,6.910364,10.468923,21.047283,10.288782,...,10.286613,24.544091,20.322224,9.672224,13.677609,14.513299,17.061825,31.178731,10.266209,14.711009
1994,13.639536,11.499796,9.396569,10.112075,11.630305,9.935657,6.684156,8.525706,17.346486,11.951006,...,9.783449,18.214721,21.100333,12.849798,13.691108,13.983891,18.238544,28.842993,11.770656,19.527381
1995,13.732239,11.559305,10.584245,8.833456,11.349198,7.229902,7.908366,10.757934,18.863394,11.679286,...,10.343264,20.931518,22.060431,13.488495,10.002356,14.37194,18.879716,25.306337,10.225357,15.658386
1996,12.198311,17.469593,8.719728,9.626303,10.843127,7.525818,8.352465,11.013097,16.893991,10.676622,...,9.552327,23.846975,20.398258,13.725827,9.644935,12.742118,18.578542,21.935146,7.651578,14.713319
1997,10.101662,11.984272,8.228975,12.934259,10.473096,7.435272,10.723387,7.900764,18.669108,9.628984,...,9.191642,17.8494,19.926939,12.339494,11.148554,13.403425,17.242176,20.336519,7.206106,16.268701
1998,10.409624,15.237285,10.162739,14.290726,10.715124,6.043558,10.929228,9.015923,19.117288,8.32183,...,8.147759,17.235852,20.211256,11.53637,9.553135,13.170305,15.019358,20.046147,5.879988,15.409041
1999,11.527018,11.048588,9.279325,13.895766,9.771514,7.561349,9.957125,9.568248,13.592445,8.413869,...,9.381097,13.238876,18.946305,12.646978,10.267651,11.51612,14.016034,21.768976,7.420473,12.233364


In [36]:
# Plot a handful of states, one subplot each, on a 2x2 grid.
from plotly.subplots import make_subplots

# state name -> (row, col) position in the grid
states_and_coords = {'California': (1, 1), 'New York': (1, 2),
                     'West Virginia': (2, 1), 'Alaska': (2, 2)}
sex_line_colors = {'men': 'skyblue', 'women': 'pink'}

fig = make_subplots(rows=2, cols=2, subplot_titles=tuple(states_and_coords))
for state_name, (grid_row, grid_col) in states_and_coords.items():
    # men first, then women, within each subplot
    for sex_label, line_color in sex_line_colors.items():
        trace = go.Scatter(
            x=states.index,
            y=states[(sex_label, state_name)].round(2),
            mode='lines+markers',
            name=sex_label.title(),
            line={'width': 3, 'color': line_color},
        )
        fig.add_trace(trace, row=grid_row, col=grid_col)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET rates for 16-24 y.o. by state (1990-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'showlegend': False,
    'width': 750,
    'height': 500,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
fig.update_yaxes(range=[0, 35])
fig.show()

### 5. By NEET Status

Here, we'll break down the male NEETs by those who are looking for work, and those who are not looking for work.

In [54]:
# Denominator: total weight of all men per year.
men_rows = timetrends.query('sex == "men"')
weighted_sums = men_rows.groupby('YEAR')['WTFINL'].sum()

# Numerators: weight of male NEETs per year, split by job-search status.
male_neet_weights = (
    men_rows.query('NEET == "neet"')
    .groupby(['YEAR', 'looking_for_work'])['WTFINL']
    .sum()
    .unstack()
)

# Each status as a percentage of all men in that year.
neets_work = (
    male_neet_weights[['looking', 'not_looking']]
    .div(weighted_sums, axis=0)
    .mul(100)
    .rename(columns={'looking': 'looking_rate',
                     'not_looking': 'not_looking_rate'})
)
neets_work

looking_for_work,looking_rate,not_looking_rate
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
1990,5.561085,4.456172
1991,6.602031,4.899966
1992,6.871597,4.647172
1993,6.23261,4.59244
1994,5.646683,5.048937
1995,5.328408,5.184318
1996,5.345382,5.079936
1997,4.847548,5.171746
1998,4.355443,5.010969
1999,4.252663,5.028326


In [67]:
# column -> fill colour; dict order is the stacking order (first = bottom)
neets_work_dict = {'not_looking_rate': 'darkblue', 'looking_rate': 'lightblue'}

fig = go.Figure()
for rate_col, fill_color in neets_work_dict.items():
    area = go.Scatter(
        x=neets_work.index,
        y=neets_work[rate_col].round(2),
        name=rate_col,
        mode='lines',
        stackgroup='one',
        line_color=fill_color,
    )
    fig.add_trace(area)

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>Male NEET rates for 16-24 y.o. by Labor Force Status (1990-2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'legend': {'orientation': 'v',
               'title': {'text': '<b>group</b><br>(click to toggle lines)'}},
    'xaxis': {'title': {'text': '<b>Year'}},
    'yaxis': {'title': {'text': '<b>NEET rate (%)'}},
    'width': 750,
    'height': 500,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 100},
}
fig.update_layout(**layout_opts)
# the cell ends on this expression so the returned figure is what gets displayed
fig.update_yaxes(range=[0, 20])

### 6. 2024 Compositions

In this section, we break down NEET and non-NEET men and women by whether they are seeking work and, for those not seeking work, by disability and parental status.

In [211]:
# 2024 weighted totals by sex (columns) for each group (rows).
# Job seekers are split by NEET status only.
looking_comp = (
    timetrends.query('YEAR==2024 and looking_for_work=="looking"')
    .groupby(['NEET', 'looking_for_work', 'sex'])['WTFINL']
    .sum()
    .unstack()
)
# Non-seekers are additionally split by disability and parental status.
not_looking_comp = (
    timetrends.query('YEAR==2024 and looking_for_work=="not_looking"')
    .groupby(['dis', 'kids', 'NEET', 'looking_for_work', 'sex'])['WTFINL']
    .sum()
    .unstack()
)

# Build each 'group' label by joining the row's index values with commas.
# Joining the tuple directly avoids stringifying it and regex-stripping the
# punctuation, which would corrupt any label containing a space or a quote.
looking_comp['group'] = [','.join(key) for key in looking_comp.index]
not_looking_comp['group'] = [','.join(key) for key in not_looking_comp.index]

comp_merged = pd.concat([looking_comp, not_looking_comp], ignore_index=True)
comp_merged

sex,men,women,group
0,6352238.0,4516327.0,"neet,looking"
1,2978539.0,2694375.0,"not_neet,looking"
2,44740.34,226713.6,"has_dis,has_child,neet,not_looking"
3,23481.02,45321.29,"has_dis,has_child,not_neet,not_looking"
4,3235481.0,2150068.0,"has_dis,no_child,neet,not_looking"
5,3462376.0,2556237.0,"has_dis,no_child,not_neet,not_looking"
6,300278.4,3581211.0,"no_dis,has_child,neet,not_looking"
7,162866.4,720578.1,"no_dis,has_child,not_neet,not_looking"
8,11378530.0,11309110.0,"no_dis,no_child,neet,not_looking"
9,59620240.0,57989900.0,"no_dis,no_child,not_neet,not_looking"


For now, let's just plot the NEETs.

In [255]:
# Keep only the NEET groups (labels that do not contain 'not_neet').
neets_col = [grp for grp in comp_merged['group'] if 'not_neet' not in grp]
neets_comp = comp_merged.loc[comp_merged['group'].isin(neets_col)].copy()

# Convert each sex's weights to a share of that sex's total 2024 NEET weight.
for sex_label in ('men', 'women'):
    sex_neet_total = timetrends.query(
        f'YEAR==2024 and sex=="{sex_label}" and NEET=="neet"')['WTFINL'].sum()
    neets_comp[sex_label] = (neets_comp[sex_label] / sex_neet_total) * 100

fig = go.Figure()
# two bars per group: men first, then women
for group_label, men_share, women_share in zip(neets_comp['group'],
                                               neets_comp['men'],
                                               neets_comp['women']):
    tick_label = group_label.replace(',', '<br>')
    fig.add_trace(go.Bar(y=[men_share],
                         x=[f'({tick_label}<br><b>men</b>)']))
    fig.add_trace(go.Bar(y=[women_share],
                         x=[f'({tick_label}<br><b>women</b>)']))

layout_opts = {
    'font': {'family': 'Georgia, serif'},
    'title': {'text': '<b>NEET composition for 16-24 y.o. (2024)</b>',
              'subtitle': {'text': '(<b>excluding summer months</b>)'}},
    'showlegend': False,
    'xaxis': {'title': {'text': '<b>Group'}},
    'yaxis': {'title': {'text': '<b>Within-Gender Share (%)'}},
    'width': 1000,
    'height': 600,
    'template': 'plotly_white',
    'margin': {'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 50},
}
fig.update_layout(**layout_opts)
fig.show()