# Synthesizing data

In [81]:
# Input snapshots; both paths are read with pd.read_csv in the "Load data" section.
# Wiki-side CSV: loaded into df_wiki0 and tagged JobSource='Wiki'.
WIKI_FN = '../wiki/data.jobcensus.wiki.2020-11-23.csv'
# MLA-side CSV: loaded into df_mla0 and tagged JobSource='JIL'.
# NOTE(review): the '.anno' suffix presumably marks a hand-annotated export — confirm.
MLA_FN = '../mla/data.jobcensus.mla.2020-11-23.anno.csv'

In [82]:
import pandas as pd,os,sys
from datetime import datetime
import warnings,re
warnings.filterwarnings('ignore')

## Load data

In [83]:
# Read both raw snapshots in one pass (wiki first, then MLA).
df_wiki0, df_mla0 = (pd.read_csv(csv_path) for csv_path in (WIKI_FN, MLA_FN))

In [84]:
# Stamp every row with the dataset it came from; the column is carried into the combined frame.
for raw_frame, source_label in ((df_wiki0, 'Wiki'), (df_mla0, 'JIL')):
    raw_frame['JobSource'] = source_label

## Rename columns

In [85]:
# Keep the MLA rows whose in_wiki flag is not 'y', restricted to the columns we need,
# and rename the MLA-specific column names to the shared Job* names.
df_mla = (
    df_mla0
    .loc[df_mla0.in_wiki != 'y', ['title', 'year', 'link', 'JobType', 'field', 'JobSource']]
    .rename(columns={
        'title': 'JobTitle',
        'year': 'JobYear',
        'link': 'JobLink',
        'field': 'JobField',
    })
)
df_mla

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource
0,Mellon Post-Doctoral Fellowship in Latinx Lite...,2020,https://joblist.mla.org/job-details/2878/mello...,Postdoc,Latinx,JIL
1,Early Career Fellowship in Creative Writing: R...,2020,https://joblist.mla.org/job-details/2877/early...,Postdoc,Creative Writing,JIL
2,English Education - Assistant Professor,2020,https://joblist.mla.org/job-details/2747/engli...,TT,TESOL,JIL
3,Tenure-track Assistant Professor in Rhetoric a...,2020,https://joblist.mla.org/job-details/2745/tenur...,TT,Rhetoric/Composition,JIL
4,"Assistant Professor, English",2020,https://joblist.mla.org/job-details/2696/assis...,TT,Anglophone,JIL
...,...,...,...,...,...,...
82,Spanish Language Murphy Visiting Fellowship in...,2020,https://joblist.mla.org/job-details/2880/spani...,Non-TT,Spanish,JIL
83,Assistant Professor of Portuguese,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Portuguese,JIL
84,Assistant Professor of Russian,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Russian,JIL
87,Tenure Track Assistant Professor - Creative Wr...,2020,https://joblist.mla.org/job-details/2694/tenur...,TT,Creative Writing,JIL


In [86]:
# df_wiki0[(df_wiki0.page_group=='Ethnic Studies') & (df_wiki0.IsUni!='y')]

In [87]:
# Keep the wiki rows flagged IsUni == 'y', blank out missing values,
# and rename the wiki-specific column names to the shared Job* names.
df_wiki = (
    df_wiki0
    .loc[df_wiki0.IsUni == 'y',
         ['section_title', 'year', 'section_links', 'JobType', 'page_group', 'JobSource']]
    .fillna('')
    .rename(columns={
        'section_title': 'JobTitle',
        'year': 'JobYear',
        'section_links': 'JobLink',
        'page_group': 'JobField',
    })
)
df_wiki

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource
3,Generalist & Non-National Categories,2011,https://www.wikia.org/ | https://www.wikia.org...,Unknown,English Literature,Wiki
4,AY 2011-12 English Lit Salaries,2011,,TT,English Lit Salaries,Wiki
5,See Also,2011,http://chronicle.com/article/Average-Faculty-S...,Unknown,English Lit Salaries,Wiki
10,"Bard College (Annandale-on-Hudson, NY)",2011,,TT,Medieval,Wiki
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki
...,...,...,...,...,...,...
15662,Johns Hopkins University.,2020,https://apply.interfolio.com/80430,Unknown,Jewish Studies,Wiki
15663,University of Portland (USA:OR).,2020,https://jobs.chronicle.com/job/288460/associat...,TT,Jewish Studies,Wiki
15664,University of British Columbia (Canada),2020,,TT,Jewish Studies,Wiki
15665,University of Portland (USA:OR).,2020,https://jobs.chronicle.com/job/288460/associat...,TT,Native American Studies,Wiki


In [88]:
# df_wiki[(df_wiki.JobYear==2020) & (df_wiki.JobField=='Comparative Literature')]

In [89]:
#df_wiki[df_wiki.JobField=='Ethnic Studies']

### Clean links

In [90]:
# Normalise each MLA link by dropping its last two '/'-separated pieces
# (e.g. '.../job-details/2880/<slug>/' -> '.../job-details/2880').
# The vectorised .str accessor propagates missing links as NaN instead of
# raising AttributeError the way a per-row x.split('/') would on a float NaN.
df_mla['JobLink_norm'] = df_mla['JobLink'].str.split('/').str[:-2].str.join('/')
# df_mla.JobLink_norm.value_counts()

In [91]:
def clean_wiki_links(x):
    """Return a normalised MLA link for a wiki link cell, or the cell unchanged.

    The cell is treated as a ' | '-separated list of URLs. The first URL that
    contains 'joblist.mla.org' is returned with its last two '/'-separated
    pieces removed. If no such URL is present, the original value ``x`` is
    returned as-is (not stringified).
    """
    mla_candidates = (url for url in str(x).split(' | ') if 'joblist.mla.org' in url)
    first_mla = next(mla_candidates, None)
    if first_mla is None:
        return x
    return '/'.join(first_mla.split('/')[:-2])
# Normalised link column for the wiki rows; cells without an MLA link pass through unchanged.
df_wiki['JobLink_norm'] = df_wiki['JobLink'].map(clean_wiki_links)
# df_wiki.JobLink_norm.value_counts()

### Find overlap

In [92]:
# Stack the wiki rows on top of the MLA rows. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0. As with append,
# the original row labels are kept, so index values may repeat across sources.
df_all = pd.concat([df_wiki, df_mla])
df_all

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource,JobLink_norm
3,Generalist & Non-National Categories,2011,https://www.wikia.org/ | https://www.wikia.org...,Unknown,English Literature,Wiki,https://www.wikia.org/ | https://www.wikia.org...
4,AY 2011-12 English Lit Salaries,2011,,TT,English Lit Salaries,Wiki,
5,See Also,2011,http://chronicle.com/article/Average-Faculty-S...,Unknown,English Lit Salaries,Wiki,http://chronicle.com/article/Average-Faculty-S...
10,"Bard College (Annandale-on-Hudson, NY)",2011,,TT,Medieval,Wiki,
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki,
...,...,...,...,...,...,...,...
82,Spanish Language Murphy Visiting Fellowship in...,2020,https://joblist.mla.org/job-details/2880/spani...,Non-TT,Spanish,JIL,https://joblist.mla.org/job-details/2880
83,Assistant Professor of Portuguese,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Portuguese,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
84,Assistant Professor of Russian,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Russian,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
87,Tenure Track Assistant Professor - Creative Wr...,2020,https://joblist.mla.org/job-details/2694/tenur...,TT,Creative Writing,JIL,https://joblist.mla.org/job-details/2694


In [93]:
# Drop duplicates: among rows sharing title, year and raw link, keep only the first.
# NOTE(review): this keys on the raw JobLink, not the JobLink_norm computed above —
# confirm that is the intended overlap criterion.
df_all = df_all.loc[~df_all.duplicated(subset=['JobTitle', 'JobYear', 'JobLink'])].copy()
df_all

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource,JobLink_norm
3,Generalist & Non-National Categories,2011,https://www.wikia.org/ | https://www.wikia.org...,Unknown,English Literature,Wiki,https://www.wikia.org/ | https://www.wikia.org...
4,AY 2011-12 English Lit Salaries,2011,,TT,English Lit Salaries,Wiki,
5,See Also,2011,http://chronicle.com/article/Average-Faculty-S...,Unknown,English Lit Salaries,Wiki,http://chronicle.com/article/Average-Faculty-S...
10,"Bard College (Annandale-on-Hudson, NY)",2011,,TT,Medieval,Wiki,
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki,
...,...,...,...,...,...,...,...
82,Spanish Language Murphy Visiting Fellowship in...,2020,https://joblist.mla.org/job-details/2880/spani...,Non-TT,Spanish,JIL,https://joblist.mla.org/job-details/2880
83,Assistant Professor of Portuguese,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Portuguese,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
84,Assistant Professor of Russian,2020,https://mlajoblist-js.madgexccstage.com/job-de...,TT,Russian,JIL,https://mlajoblist-js.madgexccstage.com/job-de...
87,Tenure Track Assistant Professor - Creative Wr...,2020,https://joblist.mla.org/job-details/2694/tenur...,TT,Creative Writing,JIL,https://joblist.mla.org/job-details/2694


In [94]:
# sanity check: should be 4 -->
# df_all[(df_all.JobField=='Comparative Literature') & (df_all.JobYear==2020)]

## Infer job types for unknowns

In [95]:
# Tokenizer used for whole-word checks. Compiled once rather than on every call,
# and written as a raw string so "\w" is not an invalid string escape.
_TITLE_WORD_RGX = re.compile(r"(\w[\w']*\w|\w)")

# Ordered substring rules: the first rule with any keyword contained in the
# lower-cased title decides the job type. Order matters, e.g. 'visiting' is
# checked before 'professor' so "Visiting Assistant Professor" is Non-TT.
_JOBTYPE_SUBSTRING_RULES = (
    (('post-doc', 'postdoc'), 'Postdoc'),
    (('visiting',), 'Non-TT'),
    (('professor', 'open-rank'), 'TT'),
    (('fellow',), 'Postdoc'),  # also covers 'fellowship'
    (('adjunct', 'part-time'), 'Non-TT'),
    (('director',), 'Non-TT'),
    (('instructor', 'freelance'), 'Non-TT'),
)


def get_jobtype(title,link=''):
    """Guess a job type ('Postdoc', 'TT', 'Non-TT' or 'Unknown') from a job title.

    Parameters
    ----------
    title : str
        Job title. Matching is case-insensitive. A non-string value (e.g. a
        NaN from a missing cell) is treated as an empty title instead of
        raising AttributeError.
    link : str, optional
        If no keyword matches and ``link`` is truthy, the decision is
        delegated to ``get_jobtype_from_mla(link)``.

    Returns
    -------
    str
        The inferred job type, or 'Unknown' when nothing matches and no link
        is given.
    """
    title = title.lower() if isinstance(title, str) else ''

    for keywords, job_type in _JOBTYPE_SUBSTRING_RULES:
        if any(keyword in title for keyword in keywords):
            return job_type

    # 'vap' must match as a whole word so that titles merely containing the
    # letters (e.g. "evaporation") are not misclassified.
    if 'vap' in _TITLE_WORD_RGX.findall(title): return 'Non-TT'
    if 'lecturer' in title: return 'Non-TT'

    # NOTE(review): get_jobtype_from_mla is not defined in the visible part of
    # this notebook; this branch raises NameError unless it exists elsewhere.
    if link: return get_jobtype_from_mla(link)
    return 'Unknown'

In [96]:
# Fix missing job type errors: only rows currently labelled 'Unknown' are
# re-derived from their title; every other label is left untouched.
resolved_job_types = []
for job_title, job_type in zip(df_all['JobTitle'], df_all['JobType']):
    if job_type == 'Unknown':
        resolved_job_types.append(get_jobtype(job_title))
    else:
        resolved_job_types.append(job_type)
df_all['JobType'] = resolved_job_types

In [97]:
# Rows whose job type could not be inferred from the title.
df_all.loc[df_all['JobType'] == 'Unknown']

Unnamed: 0,JobTitle,JobYear,JobLink,JobType,JobField,JobSource,JobLink_norm
3,Generalist & Non-National Categories,2011,https://www.wikia.org/ | https://www.wikia.org...,Unknown,English Literature,Wiki,https://www.wikia.org/ | https://www.wikia.org...
5,See Also,2011,http://chronicle.com/article/Average-Faculty-S...,Unknown,English Lit Salaries,Wiki,http://chronicle.com/article/Average-Faculty-S...
11,"Bowdoin College (Brunswick, ME)",2011,,Unknown,Medieval,Wiki,
13,"Brown University (Providence, RI)",2011,http://chronicle.com/jobs/0000646693-01/,Unknown,Medieval,Wiki,http://chronicle.com/jobs/0000646693-01/
25,Hebrew University of Jerusalem,2011,http://www.hum.huji.ac.il/new.php?cat=3223,Unknown,Medieval,Wiki,http://www.hum.huji.ac.il/new.php?cat=3223
...,...,...,...,...,...,...,...
15588,Discussion,2020,https://www.wikia.org/ | https://www.wikia.org...,Unknown,German,Wiki,https://www.wikia.org/ | https://www.wikia.org...
15595,Boston University (USA:MA).,2020,https://jobs.chronicle.com/job/291088/martin-l...,Unknown,African %26 African American Studies,Wiki,https://jobs.chronicle.com/job/291088/martin-l...
15641,University of Notre Dame (USA:IN).,2020,https://apply.interfolio.com/79997,Unknown,African %26 African American Studies,Wiki,https://apply.interfolio.com/79997
15660,Stanford University (USA:CA),2020,https://gender.stanford.edu/fellowships/postdo...,Unknown,Asian %26 Asian American Studies,Wiki,https://gender.stanford.edu/fellowships/postdo...


## Save field names

In [98]:
# Download names
field_alias_fn = 'data.jobcensus.field_aliases.csv'
# The alias table is written only when the file does not exist yet
# (presumably so that manual edits to it are not overwritten on re-run).
if not os.path.exists(field_alias_fn):
    field_counts = df_all['JobField'].value_counts()
    df_aliases = field_counts.rename_axis('JobField').reset_index(name='Count')
    # Each field starts out aliased to itself.
    df_aliases['FieldAlias'] = df_aliases['JobField']
    alias_table = df_aliases.sort_values('JobField')[['JobField', 'FieldAlias', 'Count']]
    alias_table.to_csv(field_alias_fn, index=False)

In [99]:
# Edit them!

## Save

In [100]:
# Stamp the output file with today's date in zero-padded ISO form (YYYY-MM-DD).
# Building it from now.year/now.month/now.day gave names such as '2020-1-5',
# which do not sort chronologically and are inconsistent with the
# 'YYYY-MM-DD' stamps used in the input file names.
now = datetime.now()
date = now.strftime('%Y-%m-%d')
ofn = f'data.jobcensus.synthesized.{date}.csv'
df_all.to_csv(ofn, index=False)