# Data Analysis at Scale on Filtered Data

Use the dataset that has already been cleaned and filtered.

In [1]:
import dask, dask.dataframe as dd, dask.array as da
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import pandas as pd
import re, csv, os
import numpy as np
from dask import delayed, persist
from dask.distributed import Client
from glob import glob

pd.set_option('display.max_columns', None)
csv.field_size_limit(10000000)

%matplotlib inline

In [2]:
# Start a local Dask cluster: two single-threaded workers, each capped at 3GB.
cluster_options = {
    'n_workers': 2,
    'threads_per_worker': 1,
    'memory_limit': '3GB',
}
client = Client(**cluster_options)
client

0,1
Client  Scheduler: tcp://127.0.0.1:59048  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 6.00 GB


In [2]:
# Smoke-test array for the cluster: 10000 x 10000 random floats in ~16 MiB chunks.
# NOTE(review): no seed is set, so the sum computed below differs between runs.
array_shape = (10000, 10000)
x = da.random.random(array_shape, chunks='16 MiB')
x

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 12.50 MB Shape (10000, 10000) (1250, 1250) Count 64 Tasks 64 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray


In [3]:
%%time

# Build a small lazy expression and reduce it, to confirm the workers respond.
column_means = x.mean(axis=0)
y = (x + x.T) - column_means
y.sum().compute()

CPU times: user 2.28 s, sys: 406 ms, total: 2.69 s
Wall time: 562 ms


50000057.17807767

In [4]:
# Drop the references to the smoke-test arrays before loading the real data.
del x, y

In [5]:
# Number of partitions a Dask frame is repartitioned into before being written.
partitions_out = 12
# partitions_in = 1000

# Directory holding the filtered CSV shards; results are written beneath it as well.
path = '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps'

In [6]:
# Columns to load from the CSV shards (passed to `usecols` when reading).
col_names = [
    'JobID',
    'CleanJobTitle',
    'CanonCity',
    'CanonState',
    'Source',
    'Latitude',
    'Longitude',
    'CanonJobTitle',
    'CanonCounty',
    'MSA',
    'LMA',
    'InternshipFlag',
    'ConsolidatedONET',
    'CanonSkillClusters',
    'CanonSkills',
    'CanonMinimumDegree',
    'CanonRequiredDegrees',
    'MinExperience',
    'ConsolidatedInferredNAICS',
    'BGTOcc',
    'YearsOfExperience',
    'CanonJobHours',
    'CanonJobType',
    'CanonPostalCode',
    'CanonYearsOfExperienceCanonLevel',
    'CanonYearsOfExperienceLevel',
    'ConsolidatedTitle',
    'Language',
    'BGTSubOcc',
    'ConsolidatedDegreeLevels',
    'MinDegreeLevel',
    'EmployerClean',
    'clean_text',
    'JobDate',
]

In [7]:
# Explicit dtypes for the CSV reader. Code-like fields (postal code, ONET, NAICS,
# JobID, ...) are read as strings so they are never reinterpreted as numbers.
# The builtin `str` replaces `np.str`: `np.str` was only an alias of `str`, was
# deprecated in NumPy 1.20 and later removed, so using it breaks on current NumPy.
# 'EmployerClean' and 'clean_text' are not listed here; their dtype is inferred.
dtypes = {
    'JobID': str,
    'CleanJobTitle': str,
    'CanonCity': str,
    'CanonState': str,
    'Source': str,
    'Latitude': np.float32,
    'Longitude': np.float32,
    'CanonJobTitle': str,
    'CanonCounty': str,
    'MSA': str,
    'LMA': str,
    'InternshipFlag': np.bool_,
    'ConsolidatedONET': str,
    'CanonSkillClusters': str,
    'CanonSkills': str,
    'CanonMinimumDegree': str,
    'CanonRequiredDegrees': str,
    'MinExperience': str,
    'ConsolidatedInferredNAICS': str,
    'BGTOcc': str,
    'YearsOfExperience': str,
    'CanonJobHours': str,
    'CanonJobType': str,
    'CanonPostalCode': str,
    'CanonYearsOfExperienceCanonLevel': str,
    'CanonYearsOfExperienceLevel': str,
    'ConsolidatedTitle': str,
    'Language': str,
    'BGTSubOcc': str,
    'ConsolidatedDegreeLevels': str,
    'MinDegreeLevel': str,
    'JobDate': str,
}

In [8]:
# Lazily read every shard matching 'da*.csv' under `path` into one Dask DataFrame.
csv_pattern = os.path.join(path, 'da*.csv')
read_options = {
    'engine': 'python',
    'dtype': dtypes,
    'assume_missing': True,
    # NOTE(review): `error_bad_lines` is deprecated from pandas 1.3 onwards
    # (replacement: on_bad_lines='skip'); kept as-is for the pinned environment.
    'error_bad_lines': False,
    # No byte-based splitting of the input files (alternative tried: "250MB").
    'blocksize': None,
    'usecols': col_names,
}
ddf = dd.read_csv(csv_pattern, **read_options)
ddf

Unnamed: 0_level_0,JobID,CleanJobTitle,CanonCity,CanonState,Source,Latitude,Longitude,CanonJobTitle,CanonCounty,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,CanonMinimumDegree,CanonRequiredDegrees,MinExperience,ConsolidatedInferredNAICS,BGTOcc,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MinDegreeLevel,EmployerClean,clean_text,JobDate
npartitions=12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
,object,object,object,object,object,float32,float32,object,object,object,object,bool,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [9]:
%%time

# Peek at the last five rows to confirm the shards parse with the chosen dtypes.
ddf.tail(n=5)

CPU times: user 1.55 s, sys: 164 ms, total: 1.72 s
Wall time: 1.84 s


Unnamed: 0,JobID,CleanJobTitle,CanonCity,CanonState,Source,Latitude,Longitude,CanonJobTitle,CanonCounty,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,CanonMinimumDegree,CanonRequiredDegrees,MinExperience,ConsolidatedInferredNAICS,BGTOcc,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MinDegreeLevel,EmployerClean,clean_text,JobDate
21425,38550738204,Grounds/Light Maintenance,Ellenton,FL,Job Board,27.532301,-82.5009,Unknown,Manatee,35840: Metropolitan Statistical Area,MT123584,False,49907100,Architecture and Construction: Carpentry;Speci...,{'Carpentry': 'Architecture and Construction: ...,Higher Secondary Certificate,Higher Secondary Certificate,1,Unknown,49-9071.91,1-3 years,fulltime,permanent,34222,1-6,mid,Grounds/Light Maintenance,en,Building and General Maintenance Technician,12,12,Equity Lifestyle Properties,Job Information Equity Lifestyle Properties Gr...,2019-06-24
21426,38550738965,Checker - Gate,Augusta,GA,Job Board,33.461201,-81.971703,Unknown,Richmond,12260: Metropolitan Statistical Area,MT131226,False,43511100,Supply Chain and Logistics: Inventory Maintena...,{'Inventory Maintenance': 'Supply Chain and Lo...,Higher Secondary Certificate,Higher Secondary Certificate,0.5,312111,43-5111.00,6 months,Unknown,Unknown,30901,0-1,low,Checker,en,Inventory / Supply Specialist,12,12,Coca-Cola Enterprises Inc.,Job Information Georgia Employer CHECKER - GAT...,2019-06-24
21427,38550738114,Software Engineer,Santa Monica,CA,Job Board,34.0149,-118.490997,Software Development Engineer,Los Angeles,31080: Metropolitan Statistical Area|348: Comb...,DV063108|MT063110,False,15113200,Marketing and Public Relations: Advertising;Sp...,{'Ad Serving': 'Marketing and Public Relations...,Bachelor of Science,Bachelor of Science|Doctor of Philosophy|Maste...,1,541511,15-1131.00,1-4 years|1-4 years,fulltime,permanent,90401,1-6,mid,Software Development Engineer,en,Software Developer / Engineer,16|21|18,16,Houzz,"Software Engineer Santa Monica, California - U...",2019-06-24
21428,38550738900,Group X Instructor,Montgomery,AL,Job Board,32.374298,-86.323898,Unknown,Montgomery,33860: Metropolitan Statistical Area,MT013386,False,39903100,Health Care: General Medicine;Specialized Skil...,{'Anatomy': 'Health Care: General Medicine;Spe...,Unknown,Unknown,Unknown,713940,39-9031.00,Minimum of six months,Unknown,Unknown,36101,Unknown,Unknown,Group X Instructor,en,Personal Trainer / Fitness Instructor,Unknown,Unknown,Gold's Gym,Job Information Gold's Gym Group X Instructor ...,2019-06-24
21429,38550738883,Junior Application Server Systems Administrator,Greenbelt,MD,Job Board,39.000999,-76.876801,Server Systems Administrator,Prince George's,47900: Metropolitan Statistical Area|548: Comb...,DV114789|MT114790,False,15114200,Business: Business Process and Analysis;Specia...,{'Business Process': 'Business: Business Proce...,Bachelor of Science,Bachelor of Science,Unknown,722,15-1142.00,Unknown,Unknown,Unknown,20768,Unknown,Unknown,Server Systems Administrator,en,Systems Administrator,16,16,Paradyme Management,Job Information Paradyme Management Junior App...,2019-06-24


## Measure 1

In [None]:
# Coarse flags: each one is True when the lower-cased posting text contains at
# least one of the space-delimited phrases in the corresponding pattern.
lowered_text = ddf['clean_text'].str.lower()

down_pattern = ' will supervise | supervising | guiding | mentoring | leading | lead | overseeing | will guide | be in charge of | mentor | coaching | mentoring | coordinating | building teams | build team | guiding | advising | setting performance standard | sets performance standard | resolving conflict | resolves conflict | responsibility for outcomes | responsible for outcomes | directing | appointing | instructing | recruiting | managing | approve | approving | assign | assigning | delegate | delegating | control | controlling | review | reviewing | arbitrate | arbitrating | command | commanding | govern | governing '
up_pattern = ' reports to | report to | reporting to | answers to | answer to | managed by | responds to | respond to | directed by | receives guidance | receive guidance | supervised by | assists | assist | support | supports | supporting | helps | help | helping '

downward = lowered_text.str.contains(down_pattern, regex=True)
upward = lowered_text.str.contains(up_pattern, regex=True)

# Attach both flags as new columns; `ddf` itself is left untouched.
ddf0 = ddf.assign(downward=downward, upward=upward)

In [11]:
# Keyword phrases, each padded with one space on both sides so that only whole,
# space-delimited occurrences match.
#
# Every phrase appears exactly once. The earlier version listed ' guiding ' and
# ' mentoring ' twice, which made any column selection built from this list pick
# those columns twice and therefore double-count them in per-row sums.

# Phrases describing authority over others.
down_words = [' will supervise ', ' supervising ', ' guiding ', ' mentoring ', ' leading ',
              ' lead ', ' overseeing ', ' will guide ', ' be in charge of ', ' mentor ',
              ' coaching ', ' coordinating ', ' building teams ', ' build team ',
              ' advising ', ' setting performance standard ', ' sets performance standard ',
              ' resolving conflict ', ' resolves conflict ', ' responsibility for outcomes ',
              ' responsible for outcomes ', ' directing ', ' appointing ', ' instructing ',
              ' recruiting ', ' managing ', ' approve ', ' approving ', ' assign ', ' assigning ',
              ' delegate ', ' delegating ', ' control ', ' controlling ', ' review ', ' reviewing ',
              ' arbitrate ', ' arbitrating ', ' command ', ' commanding ', ' govern ', ' governing ']

# Phrases describing reporting to, or assisting, someone else.
up_words = [' reports to ', ' report to ', ' reporting to ', ' answers to ', ' answer to ',
            ' managed by ', ' responds to ', ' respond to ', ' directed by ', ' receives guidance ',
            ' receive guidance ', ' supervised by ', ' assists ', ' assist ', ' support ',
            ' supports ', ' supporting ', ' helps ', ' help ', ' helping ']

In [12]:
from typing import List

def get_indicators(data: pd.DataFrame, column: str, words: List[str]) -> pd.DataFrame:
    """Add one boolean column per keyword, flagging rows whose text contains it.

    Parameters
    ----------
    data : frame (or Dask partition) holding the text column.
    column : name of the text column to search.
    words : keyword phrases; each is matched literally against the lower-cased
        text, including any padding spaces it carries.

    Returns
    -------
    A copy of ``data`` with one extra column per keyword, named ``word.strip()``.
    An existing column with that name is replaced in the copy. Rows whose text is
    missing keep the missing marker produced by ``str.contains``.
    """
    # Work on a copy: functions handed to map_partitions should not mutate the
    # partition they receive.
    result = data.copy()
    # Lower-case once instead of once per keyword.
    lowered = result[column].str.lower()
    for word in words:
        # regex=False: keywords are plain phrases, so characters such as '.' or
        # '(' must not be interpreted as regular-expression syntax.
        result[word.strip()] = lowered.str.contains(word, regex=False)
    return result

In [13]:
# Apply the per-keyword flagging to every partition: first the downward list,
# then the upward list on top of that result.
ddf1 = ddf0.map_partitions(
    get_indicators,
    column='clean_text',
    words=down_words,
)
ddf2 = ddf1.map_partitions(
    get_indicators,
    column='clean_text',
    words=up_words,
)

## Measure 2

In [14]:
# Column labels of the indicator columns (keywords without their padding spaces).
# dict.fromkeys removes repeated labels while keeping first-seen order: a label
# listed twice would be selected twice by `.loc[:, labels]` and counted twice in
# the row sums below.
up_stripped = list(dict.fromkeys(word.strip() for word in up_words))
down_stripped = list(dict.fromkeys(word.strip() for word in down_words))

In [15]:
# Row-wise totals over the indicator columns of each keyword family.
up_flags = ddf2.loc[:, up_stripped]
down_flags = ddf2.loc[:, down_stripped]

up_instances = up_flags.sum(axis=1)
down_instances = down_flags.sum(axis=1)

ddf3 = ddf2.assign(up_instances=up_instances, down_instances=down_instances)

In [16]:
from typing import Union

def get_words(word: str, string: str, width: int = 60) -> Union[str, None]:
    """Return a snippet of ``string`` starting at the first occurrence of ``word``.

    Parameters
    ----------
    word : phrase to look for (case-sensitive, matched literally).
    string : text to search. Anything that is not a ``str`` (for example the NaN
        pandas uses for missing text) yields ``None`` instead of raising.
    width : maximum snippet length, counted from the start of the match
        (default 60, the previously hard-coded value).

    Returns
    -------
    The snippet (shorter than ``width`` when the text ends first), or ``None``
    when ``word`` does not occur.
    """
    if not isinstance(string, str):
        return None
    # A single search serves as both the membership test and the position lookup.
    start = string.find(word)
    if start == -1:
        return None
    return string[start:start + width]

def get_some_text(data: pd.DataFrame, column: str, list_of_words: List[str]) -> pd.DataFrame:
    """Add one snippet column per keyword, taken from the text in ``column``.

    For every keyword the new column holds whatever ``get_words`` returns for
    that row's text.

    Parameters
    ----------
    data : frame (or Dask partition) holding the text column.
    column : name of the text column to search.
    list_of_words : keyword phrases, passed to ``get_words`` unchanged.

    Returns
    -------
    A copy of ``data`` with one column per keyword, named ``word.strip()``.
    An existing column with that name is replaced in the copy, so indicator
    columns of the same name end up holding snippets instead.
    """
    # Work on a copy: functions handed to map_partitions should not mutate the
    # partition they receive.
    result = data.copy()
    texts = result[column]
    for word in list_of_words:
        # Bind the keyword as a default argument so the lambda does not rely on
        # the loop variable's later value.
        result[word.strip()] = texts.apply(lambda text, keyword=word: get_words(keyword, text))
    return result

In [17]:
# Extract the text snippets partition by partition: downward keywords first,
# then upward keywords. The snippet columns reuse the stripped keyword names.
ddf4 = ddf3.map_partitions(
    get_some_text,
    column='clean_text',
    list_of_words=down_words,
)
ddf5 = ddf4.map_partitions(
    get_some_text,
    column='clean_text',
    list_of_words=up_words,
)
ddf5.head()

Unnamed: 0,JobID,CleanJobTitle,CanonCity,CanonState,Source,Latitude,Longitude,CanonJobTitle,CanonCounty,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,CanonMinimumDegree,CanonRequiredDegrees,MinExperience,ConsolidatedInferredNAICS,BGTOcc,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MinDegreeLevel,EmployerClean,clean_text,JobDate,downward,upward,will supervise,supervising,guiding,mentoring,leading,lead,overseeing,will guide,be in charge of,mentor,coaching,coordinating,building teams,build team,advising,setting performance standard,sets performance standard,resolving conflict,resolves conflict,responsibility for outcomes,responsible for outcomes,directing,appointing,instructing,recruiting,managing,approve,approving,assign,assigning,delegate,delegating,control,controlling,review,reviewing,arbitrate,arbitrating,command,commanding,govern,governing,reports to,report to,reporting to,answers to,answer to,managed by,responds to,respond to,directed by,receives guidance,receive guidance,supervised by,assists,assist,support,supports,supporting,helps,help,helping,up_instances,down_instances
0,38513295895,Obstetrics/Gynecology Physician,Robbinsville,NJ,Job Board,40.2967,-74.651001,Obstetrician/Gynecologist,Mercer,45940: Metropolitan Statistical Area,MT344594,False,29106400,Administration: Administrative Support;Special...,{'Administrative Support': 'Administration: Ad...,Unknown,Unknown,Unknown,62,29-1062.00,Unknown,fulltime,permanent,8691,Unknown,Unknown,Obstetrician/Gynecologist,en,Obstetrician / Gynecologist,Unknown,Unknown,Penn Medicine Princeton Health,Penn Medicine Princeton Health - Job Descripti...,2019-04-02,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,support and Magnet-designated nurses. - Date ...,,,,,,1,0
1,38513295910,Neurologist,Terre Haute,IN,Job Board,39.466599,-87.413803,Neurologist,Vigo,45460: Metropolitan Statistical Area,MT184546,False,29106904,Health Care: Neurology;Specialized Skills,{'Neurology': 'Health Care: Neurology;Speciali...,Unknown,Unknown,Unknown,622110,29-1062.00,Unknown,fulltime,permanent,47801,Unknown,Unknown,Neurologist,en,Neurologist,Unknown,Unknown,Hospital Corporation of America,HCA - Hospital Corporation of America - Job De...,2019-04-02,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0
2,38513295942,General Adult Cardiology,Monroe,NC,Job Board,34.985199,-80.549698,Unknown,Union,16740: Metropolitan Statistical Area,MT371674,False,29114100,Health Care: Cardiology;Specialized Skills,{'Cardiology': 'Health Care: Cardiology;Specia...,Unknown,Unknown,Unknown,622110,29-1141.00,Unknown,fulltime,permanent,28111,Unknown,Unknown,General Adult Cardiology,en,Registered Nurse,Unknown,Unknown,Carolinas HealthCare System,"Carolinas HealthCare System, now Atrium Health...",2019-04-02,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0
3,38513295684,Pulmonary/Critical Care/Sleep Physician For Le...,Wichita,KS,Job Board,37.6898,-97.341499,Critical Care Physician,Sedgwick,48620: Metropolitan Statistical Area,MT204862,False,29106900,Health Care: Emergency and Intensive Care;Spec...,{'Critical Care': 'Health Care: Emergency and ...,Unknown,Unknown,Unknown,622110,29-1062.00,Unknown,fulltime,permanent,67201,Unknown,Unknown,Critical Care Physician,en,"Physician, Other",Unknown,Unknown,Hospital Corporation of America,HCA - Hospital Corporation of America - Job De...,2019-04-02,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0
4,38513295621,"Administrative Assistant, Parish Health Minist...",Cincinnati,OH,Company,39.107201,-84.500298,Administrative Assistant,Hamilton,17140: Metropolitan Statistical Area,MT391714,False,43601300,Administration: Administrative Support;Special...,{'Administrative Support': 'Administration: Ad...,Higher Secondary Certificate,Bachelor's|Higher Secondary Certificate,5,6233,43-6013.00,Minimum of 5 years,parttime,permanent,45201,1-6,mid,Administrative Assistant,en,Medical Secretary,16|12,12,Ers,"Administrative Assistant, Parish Health Minist...",2019-04-02,True,True,,,,,,lead volunteers and assists with projects as ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,assists with projects as needed Assists with ...,assist with event planning and to promote the...,support our mission to enrich the lives of ol...,,,,,,4,1


## Measure 3

In [19]:
# Parse the posting date, then derive the calendar week and year from it.
JobDate = dd.to_datetime(ddf5['JobDate'])
ddf6 = ddf5.assign(JobDate=JobDate)

job_date_parts = ddf6['JobDate'].dt
# NOTE(review): `.dt.week` is deprecated from pandas 1.1 onwards
# (replacement: `.dt.isocalendar().week`); kept for the pinned environment.
weeks, years = job_date_parts.week, job_date_parts.year
ddf7 = ddf6.assign(weeks=weeks, years=years)

In [20]:
# Title columns whose non-null values are counted for each group.
title_columns = ['CleanJobTitle', 'ConsolidatedTitle', 'CanonJobTitle']

# NOTE(review): duplicates are dropped on 'CleanJobTitle' alone, i.e. across all
# employers, so a title posted by several firms survives for only one of them.
# Confirm this is intended rather than subset=['EmployerClean', 'CleanJobTitle'].
ddf_deduplicated = ddf7.drop_duplicates(subset=['CleanJobTitle'])

# Same count at three granularities: per firm, per firm-year, per firm-week.
firm_full_sample = (
    ddf_deduplicated
    .groupby('EmployerClean')[title_columns]
    .count()
    .reset_index()
)
firm_year = (
    ddf_deduplicated
    .groupby(['EmployerClean', 'years'])[title_columns]
    .count()
    .reset_index()
)
firm_week = (
    ddf_deduplicated
    .groupby(['EmployerClean', 'weeks'])[title_columns]
    .count()
    .reset_index()
)

In [22]:
%%time

# Evaluate the three lazy aggregations in one dask.compute call; the names are
# rebound to the computed results.
(firm_full_sample,
 firm_year,
 firm_week) = dask.compute(firm_full_sample, firm_year, firm_week)

CPU times: user 3min 32s, sys: 34.8 s, total: 4min 7s
Wall time: 4min 27s


In [27]:
# First five firm-week rows.
firm_week.head()

Unnamed: 0,EmployerClean,weeks,CleanJobTitle,ConsolidatedTitle,CanonJobTitle
0,1 Source,14,1,1,1
1,1-800-GOT-JUNK?,14,9,9,9
2,1-800-GOT-JUNK?,25,6,6,6
3,1010Data,14,1,1,1
4,10Th Magnitude,14,2,2,2


## Measure 4

### Part 1

In [29]:
# Flag postings whose BGTOcc code starts with '11'.
# NOTE(review): presumably the management major group of the occupation code;
# confirm against the BGTOcc codebook.
# na=False makes the flag a plain boolean even when BGTOcc is missing, so it can
# be used directly as a mask without comparing against True.
occu_condition = ddf7['BGTOcc'].str.startswith('11', na=False)
managers_dummy_df = ddf7.assign(managerial_occu=occu_condition)
managers_only_df = managers_dummy_df[managers_dummy_df['managerial_occu']]

In [31]:
# Manager postings per firm, location, week and individual occupation code.
managers_group1 = managers_only_df.groupby(
    ['EmployerClean', 'CanonState', 'CanonCounty', 'CanonPostalCode', 'weeks', 'BGTOcc']
)
individual_managers = (
    managers_group1[['CleanJobTitle', 'ConsolidatedTitle', 'CanonJobTitle']]
    .count()
    .reset_index()
)

### Part 2

In [32]:
# Manager postings per firm, location and week, pooled over occupation codes.
managers_group2 = managers_only_df.groupby(
    ['EmployerClean', 'CanonState', 'CanonCounty', 'CanonPostalCode', 'weeks']
)
all_managers = (
    managers_group2[['CleanJobTitle', 'ConsolidatedTitle', 'CanonJobTitle']]
    .count()
    .reset_index()
)

### Part 3

In [34]:
# All postings (not only managers) per firm, location and week.
firm_loc_week_group = ddf7.groupby(
    ['EmployerClean', 'CanonState', 'CanonCounty', 'CanonPostalCode', 'weeks']
)
firm_loc_week_df = (
    firm_loc_week_group[['CleanJobTitle', 'ConsolidatedTitle', 'CanonJobTitle']]
    .count()
    .reset_index()
)

In [35]:
%%time

# Evaluate the three lazy aggregations in one dask.compute call; the names are
# rebound to the computed results.
(individual_managers,
 all_managers,
 firm_loc_week_df) = dask.compute(individual_managers, all_managers, firm_loc_week_df)

CPU times: user 3min 30s, sys: 34.8 s, total: 4min 5s
Wall time: 4min 26s


In [38]:
# First five firm-location-week rows.
firm_loc_week_df.head(n=5)

Unnamed: 0,EmployerClean,CanonState,CanonCounty,CanonPostalCode,weeks,CleanJobTitle,ConsolidatedTitle,CanonJobTitle
0,1-800-GOT-JUNK?,IN,Marion,46201,14,1,1,1
1,1-800-GOT-JUNK?,UT,Salt Lake,84101,14,1,1,1
2,2020 Companies,CA,Los Angeles,90247,14,1,1,1
3,2020 Companies,CA,San Bernardino,91708,14,1,1,1
4,2020 Companies,CA,Solano,95687,14,1,1,1


## Save all Files

In [43]:
def save_files(new_dir_name, data, new_file_name, pandas_or_dask=True,
               base_dir=None, npartitions=None):
    """Write ``data`` as CSV into ``<base_dir>/<new_dir_name>/``.

    Parameters
    ----------
    new_dir_name : sub-directory to write into; created if it does not exist.
    data : the frame to save (anything with a compatible ``to_csv``).
    new_file_name : file name without the ``.csv`` extension.
    pandas_or_dask : truthy -> ``data`` is written as the single file
        ``<new_file_name>.csv``. Falsy -> ``data`` is repartitioned and written
        as ``<new_file_name>*.csv``, where Dask replaces the ``*`` with the
        partition number, starting from 0.
    base_dir : root output directory; defaults to the notebook-level ``path``.
    npartitions : number of output partitions in the Dask case; defaults to the
        notebook-level ``partitions_out``.

    Returns
    -------
    None. The index is never written (``index=False``).
    """
    root = path if base_dir is None else base_dir
    target_dir = os.path.join(root, new_dir_name)
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(target_dir, exist_ok=True)

    if pandas_or_dask:
        data.to_csv(os.path.join(target_dir, new_file_name + '.csv'), index=False)
    else:
        n_out = partitions_out if npartitions is None else npartitions
        (data
         .repartition(npartitions=n_out)
         .to_csv(os.path.join(target_dir, new_file_name + '*.csv'), index=False)
         )

In [44]:
%%time

# The keyword frame is still a lazy Dask frame, so it is written in partitions.
save_files(new_dir_name='measure_2', data=ddf5, new_file_name='keywords_', pandas_or_dask=False)

# The remaining results were already computed and are each saved as a single CSV.
pandas_outputs = [
    ('measure_3', firm_full_sample, 'firm_full_sample'),
    ('measure_3', firm_year, 'firm_year'),
    ('measure_3', firm_week, 'firm_week'),
    ('measure_4', individual_managers, 'individual_managers'),
    ('measure_4', all_managers, 'all_managers'),
    ('measure_4', firm_loc_week_df, 'firm_loc_week_df'),
]
for dir_name, frame, file_name in pandas_outputs:
    save_files(new_dir_name=dir_name, data=frame, new_file_name=file_name, pandas_or_dask=True)

CPU times: user 3min 57s, sys: 53.6 s, total: 4min 51s
Wall time: 4min 59s
