# Census for Academic Jobs

## Setup

In [None]:
# Install the wikia API client and the other packages this notebook imports
!pip install wikia tqdm pandas bs4 -q

In [None]:
import os,sys
import pandas as pd
import wikia
from wikia import WikiaError
from tqdm import tqdm
from datetime import datetime
import bs4

In [None]:
# Constants identifying the wiki being scraped.
# WIKI_NAME is the wiki identifier passed to wikia.page() by the scraping functions below.
WIKI_NAME = 'academicjobs'
# Name of the wiki's main page (kept for reference).
MAIN_PAGE_NAME = 'Academic_Jobs_Wiki'

## Find relevant pages

### Get the top level discipline pages

In [None]:
# Capture the current date once: `the_year` bounds the range of years to scrape,
# and `now` is reused at the end of the notebook to date-stamp the output file.
now=datetime.now()
the_year = now.year

In [None]:
# Starting years of the job seasons to scrape: 2011 through the current year, inclusive.
# The earliest year on the main page is 2007, but the formatting is standardized around 2011.
years = list(range(2011, the_year + 1))
years

In [None]:
# Discipline whose yearly pages are used as entry points.
# Change this to scrape a different discipline (e.g. history).
disc_name = 'English Literature'

In [None]:
# Build one discipline page title per start year, e.g. 'English Literature 2011-2012'.
# The 2013 season is the exception: its title ends in the two-digit year ('2013-14').
disc_pages=[f'{disc_name} {year}-{year+1 if year!=2013 else 14}' for year in years] # fix for 2013

# fix for 2020: also crawl the Ethnic Studies page for that season.
# `years` is extended in step so that zip(disc_pages, years) stays aligned.
# NOTE: `years += [2020]` mutates the list in place, so re-running this cell
# without re-running the cell that defines `years` appends another 2020 each time.
disc_pages += ['Ethnic Studies 2020-2021']
years += [2020]

# Debug helper: print markdown links to each discipline page.
# disc_pages,years
# for dp in disc_pages:
#     print(f'[{dp.split()[-1].split("-")[0]}](https://academicjobs.wikia.org/wiki/{dp.replace(" ","_")})', end=' | ')

### Get links from discipline pages

In [None]:
def get_links_from_disc_page(disc_page_name,year):
    """Return the wiki sub-page links found on one discipline page.

    The page is fetched through the wikia API, its rendered HTML is parsed,
    and every ``<a href>`` pointing at ``/wiki/<name>`` is considered.

    Parameters
    ----------
    disc_page_name : str
        Title of the discipline page, e.g. ``'English Literature 2019-2020'``.
    year : int or None
        Starting year of the job season. When truthy, only links whose name
        mentions ``year`` or ``year + 1`` and does NOT mention ``year - 1``
        are kept. When falsy (``None`` / ``0``) no year filtering is applied.

    Returns
    -------
    list of str
        Unique link names (the part after ``/wiki/``) in page order.
    """
    # get page from wikia and parse its rendered html
    page = wikia.page(WIKI_NAME, disc_page_name)
    dom = bs4.BeautifulSoup(page.html())

    # the page's own name as it appears in links (spaces become underscores)
    own_link = disc_page_name.replace(' ', '_')

    # Pre-compute the year markers once. The original one-line condition was
    # `year and (...) or (...)`, which Python parses as `(year and ...) or (...)`:
    # with a falsy year the right-hand side still ran `int(year) - 1`.
    if year:
        year = int(year)
        wanted_years = (str(year), str(year + 1))
        previous_year = str(year - 1)

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps the page order
    for link in dom('a'):
        try:
            href = link['href']
        except KeyError:
            # anchor without an href attribute
            continue
        if '/wiki/' not in href:
            continue
        wikilink = href.split('/wiki/')[1]
        # skip namespaced pages (Category:, File:, ...) and links with query strings
        if ':' in wikilink or '?' in wikilink:
            continue
        # skip self-links
        if wikilink == own_link:
            continue
        if year:
            # must belong to this season ...
            if not any(marker in wikilink for marker in wanted_years):
                continue
            # ... and not to the season that started the year before
            if previous_year in wikilink:
                continue

        if wikilink not in seen:
            seen.add(wikilink)
            links.append(wikilink)

    return links

In [None]:
# get_links_from_disc_page('Ethnic_Studies_2020-2021',year=2020)

In [None]:
# Get all links

def strip_year_from_page(page_name):
    """Drop the trailing ``_<years>`` token of a page name and un-underscore the rest.

    Everything after the last underscore is discarded; remaining underscores
    become spaces. A name with no underscore yields an empty string.
    """
    head, _, _ = page_name.rpartition('_')
    return head.replace('_', ' ')

def get_all_links():
    """Crawl every discipline page and return one record per sub-page link.

    Pairs each entry of the global ``disc_pages`` with the matching entry of
    ``years`` and asks ``get_links_from_disc_page`` for that page's links.

    Returns
    -------
    list of dict
        Each dict has the keys ``disc_page``, ``year``, ``page`` and
        ``page_group`` (the link name with its trailing year token removed).
    """
    page_year_pairs = list(zip(disc_pages, years))
    return [
        {
            'disc_page': disc_page,
            'year': disc_year,
            'page': wikilink,
            'page_group': strip_year_from_page(wikilink),
        }
        for disc_page, disc_year in tqdm(page_year_pairs)
        for wikilink in get_links_from_disc_page(disc_page, year=disc_year)
    ]

In [None]:
# Crawl all discipline pages (network access) and show how many link records came back.
LINK_LD = get_all_links()
len(LINK_LD)

### Clean links

In [None]:
# One row per link record: disc_page, year, page, page_group.
df_pages=pd.DataFrame(LINK_LD)
df_pages

In [None]:
# Print a markdown bullet list for the README:
# one bullet per page group, followed by a link for each of that group's pages.
for pg,pgdf in sorted(df_pages.groupby('page_group')):
    # decode the URL-escaped '&' (%26) and "'" (%27) for display
    print(f'''* {pg.replace("%26",'&').replace('%27',"'")}:''',end=' ')
    yrs=[]
    for dp in sorted(pgdf.page):
        # link label is the text between the last '_' and the first '-' after it
        # (the season's start year for names like 'Foo_2019-2020')
        yrs+=[f'''[{dp.split("_")[-1].split("-")[0]}](https://academicjobs.wikia.org/wiki/{dp.replace(" ","_")})''']
    print(' | '.join(yrs))

In [None]:
# Work around a bug where some pages are double counted (the same page can be
# linked from more than one discipline page): keep only the last row per page.
df_pages = df_pages.drop_duplicates('page',keep='last')
df_pages

In [None]:
# Testing
# df_pages[df_pages.page.str.contains('African')]

### Cleaning page names

## Step 3: Processing pages

In [None]:
def decide_if_school(title):
    """Guess from a section heading whether it names a school.

    Returns ``'y'`` (school), ``'n'`` (not a school) or ``''`` (undecided).
    The checks run in a fixed order and the first match wins.
    """
    title = str(title)
    #if title in not_unis: return 'n'

    # strongest school markers first
    for marker in ('College', 'Universit', 'UC '):
        if marker in title:
            return 'y'

    # 'Demographics' must be tested before 'State'
    if 'Demographics' in title:
        return 'n'
    if 'State' in title:
        return 'y'

    # a parenthesised, fully upper-case word such as '(MIT)'
    for word in title.split():
        if word.startswith('(') and word.endswith(')') and word == word.upper():
            return 'y'

    # headings that still carry raw bold markup are treated as non-schools
    if '<b>' in title:
        return 'n'

    return ''

In [None]:
def decide_if_tt(title,ad,nowtt):
    """Guess whether a job posting is tenure-track.

    Parameters
    ----------
    title : str
        Section heading of the posting.
    ad : str
        Text of the posting.
    nowtt : bool or None
        Tenure-track flag inherited from the enclosing page section
        (``True`` / ``False``), or ``None`` when no such section applies.

    Returns
    -------
    str
        ``'y'`` (tenure-track), ``'n'`` (not tenure-track) or ``''`` (unknown).
    """
    # explicit signals in the heading win over everything else
    if ' TT ' in title: return 'y'
    if 'Lecturer' in title.split() or 'Visiting Assistant Professor' in title: return 'n'

    # otherwise trust the page section the posting sits under, if any
    if nowtt is not None: return 'y' if nowtt else 'n'

    # last resort: look for hints in the ad text itself
    if "Visiting Assistant Professor" in ad: return 'n'
    ad_lower = ad.lower()
    # compare against lower-case needles: the previous "tenure Track" needle
    # could never match lower-cased text, so un-hyphenated ads were missed
    if "tenure-track" in ad_lower or "tenure track" in ad_lower: return "y"
    if "Assistant Professor" in ad or "Associate Professor" in ad or "Full Professor" in ad: return "y"

    return ''

In [None]:
def decide_job_type(IsTT,page_name):
    """Map the tenure-track flag and page name onto a job-type label.

    Returns ``'TT'``, ``'Postdoc'``, ``'Non-TT'`` or ``'Unknown'``. A
    tenure-track flag of ``'y'`` wins even on a page whose name contains
    'Postdoc'; 'Postdoc' in the page name wins over a flag of ``'n'``.
    """
    if IsTT == 'y':
        job_type = 'TT'
    elif 'Postdoc' in page_name:
        job_type = 'Postdoc'
    elif IsTT == 'n':
        job_type = 'Non-TT'
    else:
        job_type = 'Unknown'
    return job_type

In [None]:
# Links containing any of these substrings are not recorded as job-ad links.
bad_domains = ['bit.ly','fandom','youtube']

def parse_section(section_dom,section_title,now_isTT,page_name):
    """Turn one job section of a wiki page into a flat record.

    Parameters
    ----------
    section_dom : bs4 element
        Parsed HTML of the section body.
    section_title : str
        Heading of the section (may still contain HTML markup).
    now_isTT : bool or None
        Tenure-track flag inherited from the enclosing part of the page.
    page_name : str
        Name of the wiki page the section comes from.

    Returns
    -------
    dict or None
        ``None`` when the section has no text; otherwise a dict with the keys
        ``section_content``, ``section_links``, ``section_title``, ``IsTT``,
        ``IsUni``, ``JobType`` and ``JobID``.
    """
    from urllib.parse import urlsplit

    raw_text = section_dom.text.replace('Edit\n','')

    # collect the off-wiki links mentioned in the section
    external_links = []
    for anchor in section_dom('a'):
        try:
            url = anchor['href']
        except KeyError:
            continue

        # internal wiki links are not job ads
        if '/wiki/' in url:
            continue
        # ignore urls that have no path component
        if not urlsplit(url).path:
            continue
        if any(bad in url for bad in bad_domains):
            continue
        external_links.append(url)

    if not raw_text:
        return None

    # tidy the text: drop the edit marker, trim, then squeeze blank lines (three passes)
    cleaned = raw_text.replace('[edit | edit source]','').strip()
    for _ in range(3):
        cleaned = cleaned.replace('\n\n','\n')

    joined_links = ' | '.join(external_links)
    title_text = bs4.BeautifulSoup(section_title).text
    is_tt = decide_if_tt(title_text, cleaned, now_isTT)

    # save data for this job
    return {
        'section_content': cleaned,
        'section_links': joined_links,
        'section_title': title_text,
        'IsTT': is_tt,
        # the raw heading (markup included) is what the school check inspects
        'IsUni': decide_if_school(section_title),
        'JobType': decide_job_type(is_tt, page_name),
        # prefer the links as an identifier, fall back to the heading text
        'JobID': joined_links if joined_links else title_text,
    }


def process_page(page_name):
    """Parse one wiki page into a list of job records.

    Parameters
    ----------
    page_name : str
        Page name as it appears in wiki links (underscores, URL-escaped
        ``%26`` / ``%27``).

    Returns
    -------
    list of dict
        One record per job section, as produced by ``parse_section``;
        sections for which ``parse_section`` returned nothing are dropped.

    Notes
    -----
    If ``cache/<name>.html`` exists it is read instead of fetching the page.
    This function only reads cache files; it does not create them.
    Errors from ``wikia.page`` are not handled here (callers in this notebook
    catch ``WikiaError``).
    """
    # get page
    # decode the URL-escaped '&' and "'" to get the real page title
    page_name_q = page_name.replace('%26','&').replace('%27',"'")
    # '/' cannot be part of a file name, so it becomes '_' in the cache path
    page_name_safe = page_name_q.replace('/','_')
    cachefn=f'cache/{page_name_safe}.html'
    if not os.path.exists(cachefn):
        page = wikia.page(WIKI_NAME, page_name_q)
        # get html
        html=page.html()
    else:
        # print('Using cache')
        with open(cachefn) as f: html=f.read()
    
    
    # start data
    data = []
    # tenure-track flag set by the most recent 'Tenure-Track Positions' /
    # 'Visiting Positions' heading; None until one of them has been seen
    now_isTT=None
    # each chunk after a 'mw-headline' marker is one headed section;
    # the text before the first marker is skipped
    for section in html.split('mw-headline')[1:]:
        # heading text: what sits between the last '">' and the first '</span>'
        section_title=section.split('</span>')[0].split('">')[-1].strip()
        # headings that are not job postings
        if 'RECENT ACTIVITY' in section_title: continue
        if 'Demographics' in section_title: continue
        if 'Instructions' in section_title: continue
        if 'Word on the Street' in section_title: continue
        # grouping headings: remember the flag for the sections that follow
        if 'Tenure-Track Positions' in section_title:
            now_isTT=True
            continue
        if 'Visiting Positions' in section_title:
            now_isTT=False
            continue
        if section_title.startswith('Humanities and Social Sciences Postdocs'):
            #now_isTT=False
            continue
        
        if page_name in {'Spanish_and_Portuguese_2020-2021','Film_Studies_2020-2021','French_%26_Francophone_2020-2021'}:
            # these use a different format for no reason!
            # jobs are <li> items of ordered lists; the first bold text is the job title
            for ol in bs4.BeautifulSoup(section)('ol'):
                for p in ol('li'):
                    if '<b>' in str(p):
                        section_title_p=list(p('b'))[0].text
                        # no inherited tenure-track flag in this format
                        row=parse_section(p,section_title_p,None,page_name)
                        data.append(row)
        elif section_title.startswith('Jobs with 2020') or section_title.startswith('Jobs with 2021'):   # 2020 changed format!?
            # jobs are paragraphs; the first bold text is the job title
            for p in bs4.BeautifulSoup(section)('p'):
                if '<b>' in str(p):
                    section_title_p=list(p('b'))[0].text
                    row=parse_section(p,section_title_p,None,page_name)
                    data.append(row)
        else:
            # standard format: the section heading itself is the job title and
            # everything after the heading's closing </span> is the posting
            section_dom=bs4.BeautifulSoup(section.split('</span>',1)[-1])
            row=parse_section(section_dom,section_title,now_isTT,page_name)
            data.append(row)
    # parse_section returns None for empty sections; filter those out
    return [d for d in data if d]

In [None]:
#process_page('Restoration_/_18th_Century_British_2020-2021')

In [None]:
# process_page('French_%26_Francophone_2020-2021')

In [None]:
# process_page('Restoration_/_18th_Century_British_2019-2020')

In [None]:
# process_page('Comparative_Literature_2020-2021')
# process_page('Environmental_Literature_2020-2021')

In [None]:
# # process_page('African_%26_African_American_Studies_2020-2021')
# res=process_page('Early_Modern_/_Renaissance_2020-2021')
# for d in res: print(d['section_title'])

In [None]:
# res=process_page('Spanish_and_Portuguese_2020-2021')
# for d in res: print(d['section_title'])

## Sanity check for 2020 (formatting changed)

Pages that previously failed to parse:
* `Early_Modern_/_Renaissance_2020-2021` — fixed by reading the page from a local copy in `cache/` instead of fetching it


In [None]:
# Page names whose season starts in 2020 (used by the optional check below).
pages_2020=df_pages[df_pages.year==2020].page
# len(pages_2020)

In [None]:
# ok, looks good for now (11-23-2020)
# uncomment to check:

# for page in pages_2020:
#     try:
#         res=process_page(page)
#         print(page,[d['section_title'] for d in res],'\n')
#     except (IndexError,WikiaError) as e:
#         print(page,'!!!',e)

## Step 4: Gathering all pages' data

In [None]:
# Get all pages' data!!!!
def get_all_data():
    """Process every page listed in ``df_pages`` and collect the job records.

    Pages are visited in sorted name order. A page that raises ``WikiaError``
    is skipped (best effort), but the skip is reported so that missing pages
    do not silently vanish from the census.

    Returns
    -------
    pandas.DataFrame
        One row per job record, with an added ``page`` column holding the
        name of the page the record came from.
    """
    data_ld=[]
    for page in tqdm(sorted(df_pages.page)):
        try:
            page_data = process_page(page)
        except WikiaError as e:
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f'Skipping page {page!r}: {e}')
            continue
        if not page_data: continue
        for dx in page_data:
            if not dx: continue
            dx['page']=page
            data_ld.append(dx)
    return pd.DataFrame(data_ld)

In [None]:
# Big data crunching step: process every page in df_pages
# (fetches pages that have no local cache file).
df_data=get_all_data()
df_data

## Step 5: Postprocessing

In [None]:
# Attach the page-level columns (disc_page, year, page_group) to each job record,
# joining on the shared 'page' column (pandas' default inner join).
df = df_pages.merge(df_data,on='page') #.merge(df_aliases,on='page_group')

### Sanity checks

In [None]:
df[(df.page_group=='Comparative Literature') & (df.year==2020)]

In [None]:
df.IsUni.value_counts()

In [None]:
#df[df.IsUni==''].section_title

In [None]:
df.IsTT.value_counts()

In [None]:
# df.alias.value_counts().iloc[:5]

In [None]:
df.JobType.value_counts()

In [None]:
# Write the merged table to a CSV named after the date captured in `now`.
# NOTE: month and day are not zero-padded (e.g. '2020-11-3'), so the file names
# do not sort chronologically as plain strings.
date=f'{now.year}-{now.month}-{now.day}'
ofn=f'data.jobcensus.wiki.{date}.csv'
df.to_csv(ofn,index=False)