# MLA Joblist Parsing

In [276]:
# Directory whose files are listed and handed to parse_eml() further down.
EML_DIR='emls'

## Parsing `.eml` files

In [277]:
from datetime import datetime
import os,sys,json,pandas as pd
import json
import quopri
import requests,bs4

In [278]:
def parse_eml0(fn):
    """Parse the email file at `fn` with the `eml_parser` package.

    The file is read as raw bytes and passed to
    ``eml_parser.EmlParser().decode_email_bytes``; whatever that call returns
    is handed straight back to the caller.

    NOTE(review): `eml_parser` is not imported in the import cell visible in
    this notebook, so calling this presumably raises NameError unless the
    module is imported elsewhere -- confirm before using.
    """
    with open(fn, 'rb') as eml_file:
        payload = eml_file.read()
    return eml_parser.EmlParser().decode_email_bytes(payload)

In [279]:
def parse_eml(fn):
    """Extract the job postings listed in one job-alert email file.

    The text after the last ``quoted-printable`` marker and before the first
    ``----boundary`` marker is treated as the message body. The body is split
    on the encoded CRLF (``=0D=0A``), soft line breaks (``=`` at end of line)
    are removed, and any remaining quoted-printable escapes (e.g. ``=3D`` in
    URLs) are decoded. The first three body lines are skipped and everything
    from the closing "Good luck ..." line onward is dropped; what is left is
    read as blank-line-separated postings of up to three lines each.

    Parameters
    ----------
    fn : str
        Path of the email file to read.

    Returns
    -------
    list[dict]
        One ``{'title', 'link', 'desc'}`` dict per posting. Missing trailing
        lines become empty strings. Returns ``[]`` when the closing sentinel
        line is not found or when no postings are listed.
    """
    sentinel = 'Good luck with your job search, From MLA Career Services'

    # errors='replace' keeps a stray undecodable byte (e.g. in a header) from
    # aborting the whole run; the quoted-printable body itself is ASCII.
    with open(fn, encoding='utf-8', errors='replace') as f:
        txt = f.read()

    body = txt.split('quoted-printable')[-1].split('----boundary')[0].strip()
    lines = [
        _qp_unescape(chunk.replace('=\n', '')).strip()
        for chunk in body.split('=0D=0A')
    ]
    try:
        # Skip the three leading body lines and cut at the sign-off line.
        lines = lines[3:lines.index(sentinel)]
    except ValueError:
        return []

    # Postings are separated by blank lines.
    jobs = []
    for chunk in '\n'.join(lines).strip().split('\n\n'):
        fields = [ln for ln in chunk.split('\n') if ln]
        if not fields:
            continue  # nothing but blank lines (e.g. an alert with no jobs)
        # Pad short postings so a missing link/description does not raise.
        fields += [''] * (3 - len(fields))
        jobs.append({
            'title': fields[0],
            'link': fields[1],
            'desc': fields[2],
        })
    return jobs


def _qp_unescape(text):
    """Decode remaining quoted-printable ``=XX`` escapes in a single line."""
    return quopri.decodestring(text.encode('utf-8')).decode('utf-8', errors='replace')

In [280]:
# Try the parser on a single sample email; the path is built from EML_DIR so
# it stays in sync with the directory used for the full run below.
dat=parse_eml(os.path.join(EML_DIR,'3 new  jobs.eml[1][1].eml'))
# dat

## Parse all emails

In [281]:
# Parse every file in EML_DIR and pool the resulting job records.
# sorted() makes the row order reproducible (os.listdir order is arbitrary).
jobs=[]
for fn in sorted(os.listdir(EML_DIR)):
    path=os.path.join(EML_DIR,fn)
    # Skip hidden files (e.g. .DS_Store) and sub-directories.
    if fn.startswith('.') or not os.path.isfile(path):
        continue
    jobs.extend(parse_eml(path))

In [282]:
# Number of job records collected from all files in EML_DIR.
len(jobs)

90

In [283]:
# One row per parsed posting. The columns are pinned so that an empty `jobs`
# list still yields a frame with title/link/desc for the cells below.
df=pd.DataFrame(jobs,columns=['title','link','desc'])
df

Unnamed: 0,title,link,desc
0,Mellon Post-Doctoral Fellowship in Latinx Lite...,https://joblist.mla.org/job-details/2878/mello...,"Williamstown, MA | English | Full-time | Massa..."
1,Early Career Fellowship in Creative Writing: R...,https://joblist.mla.org/job-details/2877/early...,"Austin, TX | English | Full-time | Texas | Cre..."
2,English Education - Assistant Professor,https://joblist.mla.org/job-details/2747/engli...,"Fresno, CA | English | English education | Ful..."
3,Tenure-track Assistant Professor in Rhetoric a...,https://joblist.mla.org/job-details/2745/tenur...,"Boston College, Chestnut Hill Massachusetts 02..."
4,"Assistant Professor, English",https://joblist.mla.org/job-details/2696/assis...,http://employment.govst.edu/postings/5442 | En...
...,...,...,...
85,Assistant Professor(s) of Spanish,https://joblist.mla.org/job-details/2719/assis...,"Williamsport, PA | Cultural studies, Language ..."
86,Assistant Professor in English (Creative Writing),https://joblist.mla.org/job-details/2836/assis...,"St. Louis, MO | English | Full-time | Missouri..."
87,Tenure Track Assistant Professor - Creative Wr...,https://joblist.mla.org/job-details/2694/tenur...,"Davis, California | English | Full-time | Cali..."
88,Instructor in Human Rights,https://joblist.mla.org/job-details/2693/instr...,"Chicago, IL | English | Full-time | Illinois"


## Deduce info

### Deduce job type

In [284]:
def get_jobtype(title,link):
    """Classify a posting as 'Postdoc', 'Non-TT' or 'TT' from keywords in its title.

    Matching is case-insensitive and the first matching rule wins:

    1. 'post-doc' / 'postdoc'                 -> 'Postdoc'
    2. 'visiting' / 'adjunct' / 'part-time'   -> 'Non-TT'
    3. 'professor' / 'open-rank'              -> 'TT'
    4. 'fellow' (also covers 'fellowship')    -> 'Postdoc'
    5. 'director' / 'instructor' / 'freelance'-> 'Non-TT'

    The contingent qualifiers in rule 2 are tested before 'professor' so that
    titles such as "Adjunct Professor" are not counted as tenure-track.

    Parameters
    ----------
    title : str
        Posting title (or any text to classify).
    link : str
        URL of the posting. If no title rule matches and this is truthy, the
        result of ``get_jobtype_from_mla(link)`` is returned instead.

    Returns
    -------
    str
        'Postdoc', 'Non-TT', 'TT', or 'Unknown' when nothing matches and no
        link is given.
    """
    title=title.lower()
    if 'post-doc' in title or 'postdoc' in title: return 'Postdoc'
    # Contingent qualifiers must win over the generic 'professor' rule.
    if 'visiting' in title or 'adjunct' in title or 'part-time' in title: return 'Non-TT'
    if 'professor' in title or 'open-rank' in title: return 'TT'
    if 'fellow' in title: return 'Postdoc'
    if 'director' in title: return 'Non-TT'
    if 'instructor' in title or 'freelance' in title: return 'Non-TT'
    # Fall back to the posting page only when the title gave no answer.
    if link: return get_jobtype_from_mla(link)
    return 'Unknown'

In [285]:
def get_jobtype_from_mla(link, timeout=30):
    """Fetch a posting page and classify it from its 'Position type' entry.

    The page at `link` is downloaded and scanned for the first ``<dl>`` whose
    markup contains the text 'Position type'; the text of that list's first
    ``<dd>`` is then classified with ``get_jobtype``.

    Parameters
    ----------
    link : str
        URL of the posting page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    str
        The job type from ``get_jobtype``, or 'Unknown' if the request fails
        or no usable 'Position type' entry is found.
    """
    try:
        with requests.get(link, timeout=timeout) as r:
            html=r.text
    except requests.RequestException:
        # One unreachable posting should not abort classification of the rest.
        return 'Unknown'
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    dom=bs4.BeautifulSoup(html,'html.parser')
    for dl in dom('dl'):
        if 'Position type' in str(dl):
            dd=dl.find('dd')
            if dd is None:
                continue  # label present but no value in this list
            return get_jobtype(dd.text,'')
    return 'Unknown'

In [286]:
# get_jobtype_from_mla('https://joblist.mla.org/job-details/2671/critical-race-studies-racism-and-black-and-afrodiasporic-studies/?JbeBatchId=3D491&JobAlertId=3D9850&utm_source=3DJobAlert&utm_medium=3Demail&utm_campaign=3D2020-09-23')

In [287]:
# Classify each posting from its title, falling back to its link when the
# title is inconclusive, then tally the resulting categories.
df['JobType']=[get_jobtype(title,link) for title,link in zip(df['title'],df['link'])]
df['JobType'].value_counts()

TT         62
Non-TT     16
Postdoc    12
Name: JobType, dtype: int64

### Deduce field

In [288]:
# Display the frame before the field column is derived.
df

Unnamed: 0,title,link,desc,JobType
0,Mellon Post-Doctoral Fellowship in Latinx Lite...,https://joblist.mla.org/job-details/2878/mello...,"Williamstown, MA | English | Full-time | Massa...",Postdoc
1,Early Career Fellowship in Creative Writing: R...,https://joblist.mla.org/job-details/2877/early...,"Austin, TX | English | Full-time | Texas | Cre...",Postdoc
2,English Education - Assistant Professor,https://joblist.mla.org/job-details/2747/engli...,"Fresno, CA | English | English education | Ful...",TT
3,Tenure-track Assistant Professor in Rhetoric a...,https://joblist.mla.org/job-details/2745/tenur...,"Boston College, Chestnut Hill Massachusetts 02...",TT
4,"Assistant Professor, English",https://joblist.mla.org/job-details/2696/assis...,http://employment.govst.edu/postings/5442 | En...,TT
...,...,...,...,...
85,Assistant Professor(s) of Spanish,https://joblist.mla.org/job-details/2719/assis...,"Williamsport, PA | Cultural studies, Language ...",TT
86,Assistant Professor in English (Creative Writing),https://joblist.mla.org/job-details/2836/assis...,"St. Louis, MO | English | Full-time | Missouri...",TT
87,Tenure Track Assistant Professor - Creative Wr...,https://joblist.mla.org/job-details/2694/tenur...,"Davis, California | English | Full-time | Cali...",TT
88,Instructor in Human Rights,https://joblist.mla.org/job-details/2693/instr...,"Chicago, IL | English | Full-time | Illinois",Non-TT


In [289]:
def get_field(title,desc='',link=''):
    """Guess a posting's field from keywords in its title and description.

    `title` and `desc` are lower-cased and joined with a single space; the
    ordered rule table below is then scanned and the label of the first rule
    that matches is returned. A rule matches when any one of its alternatives
    has all of its substrings present in the combined text.

    Parameters
    ----------
    title : str
        Posting title.
    desc : str, optional
        Posting description line.
    link : str, optional
        Accepted for call-site symmetry; not used by the matching.

    Returns
    -------
    str
        The matched field label, or 'Unknown' when no rule matches.
    """
    haystack=f'{title.lower()} {desc.lower()}'

    # (label, alternatives); each alternative is a tuple of substrings that
    # must ALL occur. Order matters: earlier rules take precedence.
    rules=(
        ('Latinx', (('latinx',), ('hispanic',))),
        ('Creative Writing', (('creative',),)),
        ('French', (('french',),)),
        ('Spanish', (('spanish',),)),
        ('Russian', (('russian',),)),
        ('Italian', (('italian',),)),
        ('Chinese', (('chinese',),)),
        ('Arabic', (('arabic',),)),
        ('Critical Race Studies', ((' race ',),)),
        ('Japanese', (('japanese',),)),
        ('Portuguese', (('portuguese',),)),
        ('Rhetoric/Composition', (('rhetoric',), (' writing ',))),
        ('TESOL', (('tesol',), ('english education',))),
        ('African American', (('afro',), ('african',))),
        ('Asian', (('asian',),)),
        ('Poetry', (('poetry',), ('poetics',))),
        ('Anglophone', (('caribbean',),)),
        ('Medieval', (('medieval',),)),
        ('ASL', (('american sign lang',),)),
        ('Anglophone', (('anglophone',), ('postcolonial',), ('global',), ('world',))),
        ('Children\'s Lit', (('children',),)),
        ('Southern', (('southern',),)),
        ('Translation Studies', (('translation',),)),
        ('Media Studies', (('media studies',),)),
        ('Art History', ((' art ', 'history'),)),
        ('Generalist', (('professor of humanities',),)),
        ('C20-C21 American', (('20th/21st century american',),)),
        ('Comparative Literature', (('comparative lit',),)),
    )

    for label,alternatives in rules:
        if any(all(needle in haystack for needle in combo) for combo in alternatives):
            return label
    return 'Unknown'

In [290]:
# Derive a field label for every posting from its title and description,
# then tally how many postings fall into each field.
df['field']=[
    get_field(title,desc,link)
    for title,desc,link in zip(df['title'],df['desc'],df['link'])
]
df['field'].value_counts()

Spanish                   10
French                    10
Rhetoric/Composition      10
Creative Writing           9
Anglophone                 9
Unknown                    6
Latinx                     5
TESOL                      4
Chinese                    3
Russian                    3
Medieval                   2
Arabic                     2
ASL                        2
African American           2
Poetry                     2
Media Studies              1
Translation Studies        1
Asian                      1
Italian                    1
Japanese                   1
Portuguese                 1
Comparative Literature     1
Critical Race Studies      1
C20-C21 American           1
Art History                1
Generalist                 1
Name: field, dtype: int64

In [291]:
# dict(df[df.field=='Unknown'].sample(n=1).iloc[0])

In [292]:
# for i,row in df[df.field=='Unknown'].iterrows():
#     print(dict(row))

In [293]:
# Add a constant `year` column: every row gets the hard-coded value 2020.
# NOTE(review): presumably the year these alerts were received -- update the
# literal when re-running on a different batch of emails.
df['year']=2020

## Save

In [294]:
# Write the annotated postings to a CSV stamped with today's date.
# strftime zero-pads month and day (YYYY-MM-DD) so the files sort by date.
now=datetime.now()
date=now.strftime('%Y-%m-%d')
ofn=f'data.jobcensus.mla.{date}.csv'
df.to_csv(ofn,index=False)