# Synthesizing data

In [225]:
# Input CSV paths, relative to the notebook's working directory.
# Both filenames carry the same 2020-11-23 date stamp.
# NOTE(review): the '.anno' suffix presumably marks a hand-annotated MLA export — confirm.
WIKI_FN = '../wiki/data.jobcensus.wiki.2020-11-23.csv'
MLA_FN = '../mla/data.jobcensus.mla.2020-11-23.anno.csv'

In [226]:
import pandas as pd,os,sys
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

## Load data

In [227]:
# Read both inputs in one pass (wiki first, then MLA); the trailing 0 marks
# the as-loaded frames that the later cells derive from.
df_wiki0, df_mla0 = (pd.read_csv(csv_path) for csv_path in (WIKI_FN, MLA_FN))

In [228]:
# Label every row with the dataset it came from so the two sources remain
# distinguishable once the frames are stacked together.
for frame, source_label in ((df_wiki0, 'Wiki'), (df_mla0, 'JIL')):
    frame['JobSource'] = source_label

## Rename cols

In [229]:
# Keep the MLA rows whose in_wiki flag is anything other than 'y', restricted
# to the six columns used downstream, and rename them to the shared Job* schema.
mla_keep = df_mla0.in_wiki != 'y'
df_mla = df_mla0.loc[
    mla_keep,
    ['title', 'year', 'link', 'JobType', 'field', 'JobSource'],
].rename(columns={
    'title': 'JobTitle',
    'year': 'JobYear',
    'link': 'JobLink',
    'field': 'JobField',
})
df_mla

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource
0,Mellon Post-Doctoral Fellowship in Latinx Lite...,2020,https://joblist.mla.org/job-details/2878/mello...,Postdoc,Latinx,JIL
1,Early Career Fellowship in Creative Writing: R...,2020,https://joblist.mla.org/job-details/2877/early...,Postdoc,Creative Writing,JIL
2,English Education - Assistant Professor,2020,https://joblist.mla.org/job-details/2747/engli...,TT,TESOL,JIL
3,Tenure-track Assistant Professor in Rhetoric a...,2020,https://joblist.mla.org/job-details/2745/tenur...,TT,Rhetoric/Composition,JIL
4,"Assistant Professor, English",2020,https://joblist.mla.org/job-details/2696/assis...,TT,Anglophone,JIL
...,...,...,...,...,...,...
82,Spanish Language Murphy Visiting Fellowship in...,2020,https://joblist.mla.org/job-details/2880/spani...,Non-TT,Spanish,JIL
83,Assistant Professor of Portuguese,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Portuguese,JIL
84,Assistant Professor of Russian,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Russian,JIL
87,Tenure Track Assistant Professor - Creative Wr...,2020,https://joblist.mla.org/job-details/2694/tenur...,TT,Creative Writing,JIL


In [230]:
# Keep the wiki rows flagged IsUni == 'y', blank out missing values, and
# rename the selected columns to the shared Job* schema.
wiki_keep = df_wiki0.IsUni == 'y'
df_wiki = (
    df_wiki0
    .loc[wiki_keep, ['section_title', 'year', 'section_links', 'JobType', 'page_group', 'JobSource']]
    .fillna('')
    .rename(columns={
        'section_title': 'JobTitle',
        'year': 'JobYear',
        'section_links': 'JobLink',
        'page_group': 'JobField',
    })
)
df_wiki

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource
10,"Bard College (Annandale-on-Hudson, NY)",2011,,TT,Medieval,Wiki
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki
12,"Boston College (Chestnut Hill, MA)",2011,,TT,Medieval,Wiki
13,"Brown University (Providence, RI)",2011,http://chronicle.com/jobs/0000646693-01/,Unknown,Medieval,Wiki
14,"Central Methodist University (Fayette, MO)",2011,http://www.higheredjobs.com/faculty/details.cf...,TT,Medieval,Wiki
...,...,...,...,...,...,...
15648,Johns Hopkins University.,2020,https://apply.interfolio.com/80430,Unknown,Jewish Studies,Wiki
15649,University of Portland (USA:OR).,2020,https://jobs.chronicle.com/job/288460/associat...,TT,Jewish Studies,Wiki
15650,University of British Columbia (Canada),2020,,TT,Jewish Studies,Wiki
15651,University of Portland (USA:OR).,2020,https://jobs.chronicle.com/job/288460/associat...,TT,Native American Studies,Wiki


In [231]:
# Spot-check: wiki postings from 2020 filed under Comparative Literature.
from_2020 = df_wiki.JobYear == 2020
in_complit = df_wiki.JobField == 'Comparative Literature'
df_wiki[from_2020 & in_complit]

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource
15529,"UC Berkeley (Berkeley, CA).",2020,https://aprecruit.berkeley.edu/JPF02679,TT,Comparative Literature,Wiki
15530,Université de Caen (FRA:CAL).,2020,https://euraxess.ec.europa.eu/jobs/537836,Unknown,Comparative Literature,Wiki
15531,"Cambridge University (Cambridge, UK)",2020,https://www.jobs.cam.ac.uk/job/27641/,Unknown,Comparative Literature,Wiki
15532,"Stanford University (Stanford, CA).",2020,https://gender.stanford.edu/fellowships/postdo...,Unknown,Comparative Literature,Wiki


### Clean links

In [232]:
def _normalize_mla_link(link):
    """Reduce an MLA job URL to its '<scheme>://<host>/job-details/<id>' prefix.

    Parameters
    ----------
    link : str or missing value
        Raw JobLink cell.

    Returns
    -------
    The URL truncated right after the numeric job id when a
    'job-details/<id>' path segment is present. Otherwise the URL with its
    last two '/'-separated pieces removed (the previous behaviour). Values
    that are not strings (e.g. NaN for a missing link) are returned unchanged
    instead of raising AttributeError.
    """
    if not isinstance(link, str):
        return link
    parts = link.strip().split('/')
    if 'job-details' in parts:
        id_pos = parts.index('job-details') + 1
        # Anchor on the id itself so the result does not depend on whether
        # the URL carries a slug and/or a trailing slash.
        if id_pos < len(parts) and parts[id_pos].isdigit():
            return '/'.join(parts[:id_pos + 1])
    return '/'.join(parts[:-2])

df_mla['JobLink_norm'] = df_mla.JobLink.apply(_normalize_mla_link)
# df_mla.JobLink_norm.value_counts()

In [233]:
def clean_wiki_links(x):
    """Return a normalized MLA job-list URL found in a wiki link cell.

    Parameters
    ----------
    x : object
        Link cell; may hold several URLs separated by ' | '. It is coerced
        with ``str`` before being split.

    Returns
    -------
    For the first URL containing 'joblist.mla.org': that URL truncated right
    after the numeric id of its 'job-details/<id>' segment, regardless of
    whether a slug or trailing slash follows. If no such segment is present,
    the URL with its last two '/'-separated pieces removed (the previous
    behaviour). If no URL in the cell mentions 'joblist.mla.org', ``x`` is
    returned unchanged.
    """
    for raw_link in str(x).split(' | '):
        link = raw_link.strip()
        if 'joblist.mla.org' not in link:
            continue
        parts = link.split('/')
        if 'job-details' in parts:
            id_pos = parts.index('job-details') + 1
            # Anchoring on the id keeps it even when the URL has no slug or
            # no trailing slash; a fixed [:-2] slice would drop it there.
            if id_pos < len(parts) and parts[id_pos].isdigit():
                return '/'.join(parts[:id_pos + 1])
        return '/'.join(parts[:-2])
    return x
# Derive the comparison key for each wiki row from its raw link cell.
df_wiki['JobLink_norm'] = df_wiki.JobLink.map(clean_wiki_links)
# df_wiki.JobLink_norm.value_counts()

### Find overlap

In [234]:
# Stack the wiki rows on top of the MLA rows. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# The defaults match append's: original index labels are kept (so labels can
# repeat across the two sources) and columns are aligned by name.
df_all = pd.concat([df_wiki, df_mla])
df_all

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource,JobLink_norm
10,"Bard College (Annandale-on-Hudson, NY)",2011,,TT,Medieval,Wiki,
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki,
12,"Boston College (Chestnut Hill, MA)",2011,,TT,Medieval,Wiki,
13,"Brown University (Providence, RI)",2011,http://chronicle.com/jobs/0000646693-01/,Unknown,Medieval,Wiki,http://chronicle.com/jobs/0000646693-01/
14,"Central Methodist University (Fayette, MO)",2011,http://www.higheredjobs.com/faculty/details.cf...,TT,Medieval,Wiki,http://www.higheredjobs.com/faculty/details.cf...
...,...,...,...,...,...,...,...
82,Spanish Language Murphy Visiting Fellowship in...,2020,https://joblist.mla.org/job-details/2880/spani...,Non-TT,Spanish,JIL,https://joblist.mla.org/job-details/2880
83,Assistant Professor of Portuguese,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Portuguese,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
84,Assistant Professor of Russian,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Russian,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
87,Tenure Track Assistant Professor - Creative Wr...,2020,https://joblist.mla.org/job-details/2694/tenur...,TT,Creative Writing,JIL,https://joblist.mla.org/job-details/2694


In [258]:
# Sanity check: the filter below should return 4 rows -->
# df_all[(df_all.JobField=='Comparative Literature') & (df_all.JobYear==2020)]

## Save field names

In [259]:
# Write a starter alias table: one row per distinct JobField with its row
# count and a FieldAlias column initialised to the field name itself.
# The file is only created when absent, so an existing copy is never overwritten.
field_alias_fn = 'data.jobcensus.field_aliases.csv'
if not os.path.exists(field_alias_fn):
    df_aliases = (
        df_all.JobField.value_counts()
        .rename_axis('JobField')
        .reset_index(name='Count')
        .assign(FieldAlias=lambda tbl: tbl['JobField'])
    )
    (
        df_aliases
        .sort_values('JobField')
        .loc[:, ['JobField', 'FieldAlias', 'Count']]
        .to_csv(field_alias_fn, index=False)
    )

In [260]:
# Edit them!

# Save

In [256]:
# Stamp the output file with today's date in zero-padded ISO form (YYYY-MM-DD).
# Joining year/month/day by hand produced names like '2021-1-5', which neither
# match the '2020-11-23' style of the input filenames nor sort chronologically.
now = datetime.now()
date = now.strftime('%Y-%m-%d')
ofn = f'data.jobcensus.synthesized.{date}.csv'
# The index is omitted: its labels come from the two source frames and can repeat.
df_all.to_csv(ofn, index=False)