My first concern is finding the "apply" link on all the relevant pages. There's substantial heterogeneity in what the links look like, both visually and in the HTML.

What I'm wondering is if we can come up with a selector that will correctly find the apply button on "enough" of the pages that we can just ignore the rest. Another possibility is that the links themselves are consistent enough that we can just look at all `a`s and do it based on `href` value. Let's have a look.

In [8]:
import pandas as pd
from lxml import html
import requests

In [9]:
# Load the job postings dataset into a DataFrame.
# NOTE(review): the frame displayed further down has an "Unnamed: 0" column,
# which suggests the CSV was written with its index; passing index_col=0
# may be what is wanted here -- confirm.
df = pd.read_csv("dataset.csv")

In [10]:
# Work on a small sample for faster exploration; the full frame stays
# reachable as full_df.
full_df = df
# .copy() makes the sample an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning (writing into a
# slice of another frame).
# NOTE(review): the slice starts at position 1, so the first row of the
# dataset is left out of the sample -- confirm that is intended.
small_df = full_df.iloc[1:100].copy()
df = small_df

First, we need to fetch the actual HTML of the page for each job.

In [112]:
# Fetch the posting page for every job link in the frame, in row order.
# requests applies no timeout by default, so without one a single
# unresponsive server would stall the whole crawl indefinitely.
resps = [requests.get(link, timeout=30) for link in df["link"]]

In [134]:
def has_monster_apply_button(page):
    """Return True if the page has a Monster apply button, as opposed to an
    external apply button.

    `page` is a parsed document exposing `cssselect`. Only the first
    `.btn-apply` element is inspected; it counts as a Monster apply button
    when it carries a non-empty `href` attribute.
    """
    candidates = page.cssselect(".btn-apply")
    if not candidates:
        return False
    first_button = candidates[0]
    return bool(first_button.get("href"))

In [135]:
# Keep the fetched HTML and the path each request actually ended up at
# alongside the posting it belongs to.
page_texts = [r.text for r in resps]
request_paths = [r.request.path_url for r in resps]
df.loc[:, "page"] = page_texts
df.loc[:, "actual_link"] = request_paths

# A posting is usable if:
#  1. it's part of the "v2" system on monster, which has a more standard,
#     reasonable page structure
#  2. it's not expired
#  3. it's of the sort you can apply to on monster, and not an external system
is_v2 = df["actual_link"].str.contains("v2")
is_expired = df["actual_link"].str.contains("Expired")
applies_on_monster = [
    has_monster_apply_button(html.fromstring(p)) for p in df["page"]
]
df.loc[:, "usable"] = is_v2 & ~is_expired & applies_on_monster



In [136]:
# Display the frame, now including the page, actual_link and usable columns.
df

Unnamed: 0,city,applied?,full,addresses,title,company,resumes,colleges,link,location,lastnames,firstnames,type,original_link,zipcodes,actual_link,page,usable
2,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[1, 12, 4, 8]",\n\nSeasonal Holiday Retail Sales Jewelry Cler...,\nMacys\n,"[1, 3, 0, 2]","[1, 0, 2, 3]",http://jobview.monster.com/seasonal-holiday-re...,"\n\n\nWayne\n ,NJ\n\n","[1, 0, 6, 4]","[4, 6, 3, 9]",clerical,http://jobview.monster.com/seasonal-holiday-re...,"[10, 3, 13, 11]",/seasonal-holiday-retail-sales-jewelry-clerica...,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",False
3,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[7, 6, 12, 17]",\n\nLaboratory Clerical Associate\n\n,\nSovereign Medical Services\n,"[5, 2, 1, 4]","[3, 0, 2, 1]",http://jobview.monster.com/laboratory-clerical...,"\n\n\nFair Lawn\n ,NJ\n\n\n","[2, 6, 7, 9]","[6, 3, 7, 5]",clerical,http://jobview.monster.com/laboratory-clerical...,"[14, 10, 0, 1]",/v2/job/View?JobID=174742599,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,False
4,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[0, 15, 11, 14]",\n\nTemporary Clerical professionals\n\n,\nCompany Confidential\n,"[6, 3, 5, 1]","[2, 3, 0, 1]",http://jobview.monster.com/temporary-clerical-...,"\n\n\nBrooklyn\n ,NY\n\n","[3, 8, 9, 4]","[0, 1, 6, 7]",clerical,http://jobview.monster.com/temporary-clerical-...,"[13, 12, 8, 0]",/v2/job/View?JobID=173095192,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
5,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[2, 6, 11, 13]",\n\nAttendance Coordinator -(Clerical) part ti...,"\nPremier Home Health Care Services, Inc\n","[5, 4, 6, 0]","[3, 2, 0, 1]",http://jobview.monster.com/attendance-coordina...,"\n\n\nNew York City\n ,NY\n\n\n","[1, 4, 5, 0]","[1, 2, 0, 6]",clerical,http://jobview.monster.com/attendance-coordina...,"[7, 16, 2, 12]",/v2/job/View?JobID=175551323,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
6,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[10, 13, 9, 1]",\n\nOffice/Bookkeeper Clerk\n\n,\nCiampa Management\n,"[4, 3, 6, 0]","[1, 0, 3, 2]",http://jobview.monster.com/office-bookkeeper-c...,"\n\n\nFlushing\n ,NY\n\n\n","[7, 8, 5, 3]","[0, 9, 8, 7]",clerical,http://jobview.monster.com/office-bookkeeper-c...,"[2, 4, 6, 7]",/v2/job/View?JobID=175927606,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
7,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[4, 17, 14, 13]",\n\nOffice Clerk\n\n,"\nInter American Cosmetics, Inc.\n","[7, 2, 0, 6]","[3, 0, 2, 1]",http://jobview.monster.com/office-clerk-job-lo...,"\n\n\nLodi\n ,NJ\n\n\n","[8, 0, 1, 7]","[3, 6, 9, 0]",clerical,http://jobview.monster.com/office-clerk-job-lo...,"[18, 14, 0, 3]",/v2/job/View?JobID=172842490,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
8,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[17, 7, 9, 8]",\n\nBD-13478 - Accounting Clerk 2 Position in ...,"\nPROGRAMMER RESOURCES INTERNATIONAL, INC\n","[6, 1, 3, 0]","[0, 2, 1, 3]",http://jobview.monster.com/bd-13478-accounting...,"\n\n\nFranklin Lakes\n ,NJ\n\n\n","[1, 4, 3, 7]","[9, 3, 0, 2]",clerical,http://jobview.monster.com/bd-13478-accounting...,"[13, 6, 12, 9]",/v2/job/View?JobID=175740073,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
9,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[2, 9, 17, 11]",\n\nBuyer Clerical\n\n,"\nRainbow USA, Inc\n","[7, 3, 0, 6]","[3, 2, 0, 1]",http://jobview.monster.com/buyer-clerical-job-...,"\n\n\nBrooklyn\n ,NY\n\n\n","[7, 8, 5, 9]","[5, 7, 0, 9]",clerical,http://jobview.monster.com/buyer-clerical-job-...,"[2, 10, 6, 16]",/v2/job/View?JobID=175788974,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
10,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[12, 0, 14, 13]",\n\nAdministrative/Clerical Assistant AA2136-1...,\nThe Cambridge Don\n,"[2, 6, 4, 5]","[1, 0, 2, 3]",http://jobview.monster.com/administrative-cler...,"\n\n\nNew York City\n ,NY\n\n","[8, 6, 3, 0]","[7, 2, 5, 6]",clerical,http://jobview.monster.com/administrative-cler...,"[7, 4, 9, 10]",/v2/job/Expired?JobID=175391943,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,False
11,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[1, 5, 15, 14]",\n\nCourt Clerk\n\n,\nMeridian Legal Search\n,"[0, 2, 3, 6]","[0, 3, 1, 2]",http://jobview.monster.com/court-clerk-job-new...,"\n\n\nNew York City\n ,NY\n\n\n","[9, 0, 6, 4]","[3, 2, 8, 4]",clerical,http://jobview.monster.com/court-clerk-job-new...,"[5, 0, 17, 13]",/v2/job/View?JobID=174845421,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True


Next, let's try to get the application flow working for a single posting.

In [46]:
import requests
from requests import Request, Session
import re

# Browser-like User-Agent string, meant to make our requests less likely to
# be detected as automated. It is set on the Session's headers when the
# session is created further down.
FIREFOX_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'

In [19]:
# The application form lives at the posting's own path with "View" swapped
# for "Apply"; build that URL for one known posting.
base_url = "http://jobview.monster.com"
path = "/v2/job/View?JobID=112321888"
apply_path = path.replace("View", "Apply")
url = base_url + apply_path
url

'http://jobview.monster.com/v2/job/Apply?JobID=112321888'

In [109]:
# Open a session that sends the Firefox User-Agent on its requests, then
# fetch the application form page through it.
s = Session()
s.headers["User-Agent"] = FIREFOX_USER_AGENT
resp = s.get(url)

In [110]:
# Pull out the verification token that monster uses to validate the request.
# It sits in an input named "__RequestVerificationToken" on the form page;
# the first such input is the one used.
page = html.fromstring(resp.text)
token_inputs = page.cssselect('input[name="__RequestVerificationToken"]')
hidden_token_input = token_inputs[0]
verif_token = hidden_token_input.value

In [130]:
# First we need to submit a resume.
submit_url = "http://jobview.monster.com/v2/apply/Upload"
# Preparing the request reads the PDF into the multipart body (the printed
# body below contains the file's bytes), so the handle can be closed as soon
# as preparation is done instead of being left open.
with open('testing.pdf', 'rb') as resume_file:
    files = {"Attachments": ('testing.pdf', resume_file, 'application/pdf', {})}
    req = Request('POST', submit_url, files=files)
    # Session.prepare_request (unlike Request.prepare) applies the session's
    # headers and cookies, so the upload carries the User-Agent set on `s`.
    preped = s.prepare_request(req)
# Show the encoded multipart body for inspection. The call form of print
# works on both Python 2 and Python 3.
print(preped.body)

--759648a209d14d33a1bb02178f00da57
Content-Disposition: form-data; name="Attachments"; filename="testing.pdf"
Content-Type: application/pdf

%PDF-1.3
%���������
4 0 obj
<< /Length 5 0 R /Filter /FlateDecode >>
stream
x]�1�0�w~����+��U��Fr��q"�&����mrý|��];���&�|z�0�<B@�Bi���1+�1o��CΪe,���Uʢ�Z9�OM5b�㓤��H��ŉA+]{���%<��2��8he �W"��F�E������|Ņ���;J
endstream
endobj
5 0 obj
169
endobj
2 0 obj
<< /Type /Page /Parent 3 0 R /Resources 6 0 R /Contents 4 0 R /MediaBox [0 0 612 792]
>>
endobj
6 0 obj
<< /ProcSet [ /PDF /Text ] /ColorSpace << /Cs1 7 0 R >> /Font << /TT1 8 0 R
>> >>
endobj
9 0 obj
<< /Length 10 0 R /N 3 /Alternate /DeviceRGB /Filter /FlateDecode >>
stream
x��wTS��Ͻ7��" %�z	 �;HQ�I�P��&vDF)VdT�G�"cE��b�	�P��QDE�݌k	�5�ޚ��Y�����g�}׺ P���tX�4�X���\���X��ffG�D���=���HƳ��.�d��,�P&s���"7C$ 
E�6<~&��S��2���)2�12�	��"�įl���+�ɘ�&�Y��4���Pޚ%ᣌ�\�%�g�|e�TI� ��(����L 0�_��&�l�2E�� ��9�r��9h� x�g��Ib�טi���f��S�b1+��M

In [112]:
# Send the prepared upload through the session.
resp = s.send(preped)
# raise_for_status() rather than `assert resp.ok`: asserts are stripped when
# Python runs with -O, and the HTTPError names the failing status code.
resp.raise_for_status()
resume_json = resp.json()
print(resume_json)
# The response carries a "success" flag next to the resume reference.
# NOTE(review): assumes a falsy/missing flag means the upload was rejected
# -- confirm against a failing upload.
if not resume_json.get("success"):
    raise RuntimeError("resume upload was not accepted: %r" % (resume_json,))
# Reference to the uploaded resume, needed when submitting the application.
resume_ref = resume_json["rValue"]

{u'title': u'testing.pdf', u'rValue': u'EAAQ3l7skbh_LLx.wARpxf1.JVkQsLP6wj0z.iK2Ou1XRhERILM8NXm1FZpN_fUiFDjcTcJnstf9NlSGX6X_0fHXpA--', u'success': True, u'dateTime': u'11/26/2016'}


Now that we've submitted a resume, we need to take that id and use it in submitting an actual application

In [131]:
# The URL to POST the application to is the same one we GET to view the
# application form.
# NOTE: we will eventually need to add detection and handling for the
# mandatory "diversity candidate" section that appears in some applications.
data = {
    "__RequestVerificationToken": verif_token,
    "Pi.FirstName": "Foo",
    "Pi.LastName": "Bar",
    "Pi.CareerLevelId": "12",
    "Pi.EducationLevelId": "5",
    "Pi.EmailAddress": "monstertesting4@jamesporter.me",
    "Pi.CountryId": "164",
    "Pi.UserEnteredGeoName": "14001, Akron, NY",
    "Pi.UserEnteredGeoNameTranslated": "14001, Akron, NY",
    "Pi.RefGeoId": "7179994",
    "Pi.WorkAuthorizationStatus": "1",
    "Pi.IsInEditMode": "true",
    "Rs.ResumeValue": resume_ref,
    "Rs.SearchableMember": "false",
    "Rs.DiversityMember": "false",
    "rs.TmpResumes[0].FilePath": resume_ref,
    "rs.TmpResumes[0].Title": "testing.pdf",
    "rs.TmpResumes[0].IsPublic": "false",
    "rs.TmpResumes[0].IsDiversityActive": "false",
    "CoverLetter.CoverLetterBody": "",
    "CoverLetter.SaveCoverLetter": "false",
    "CoverLetter.CoverLetterTitle": "",
}

# Passing (None, value) tuples through `files` makes requests encode each
# field as its own multipart/form-data part with no filename.
# .items() replaces the Python 2-only .iteritems() and works on both 2 and 3.
form_data = [(key, (None, val)) for key, val in data.items()]
# Empty "Attachments" part: the resume itself was uploaded separately and is
# referenced through resume_ref above.
form_data.append(("Attachments", ("", "", "application/octet-stream")))
req = Request(
    "POST", url,
    files=form_data
)
# Session.prepare_request (unlike Request.prepare) applies the session's
# headers and cookies to the request. The session holds a
# __RequestVerificationToken cookie next to the form token sent above;
# presumably the server validates the two together -- confirm.
preped = s.prepare_request(req)
# Show the encoded multipart body for inspection. The call form of print
# works on both Python 2 and Python 3.
print(preped.body)

--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="Pi.CareerLevelId"

12
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="CoverLetter.SaveCoverLetter"

false
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="Pi.WorkAuthorizationStatus"

1
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="rs.TmpResumes[0].Title"

testing.pdf
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="Rs.SearchableMember"

false
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="CoverLetter.CoverLetterTitle"


--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="Rs.DiversityMember"

false
--f4bb51f5c46d48c786795f87e0894b4d
Content-Disposition: form-data; name="__RequestVerificationToken"

j7SDYYfF_sosoSsCXPMbIIB_0KUwzwSdKo9ah9q6wvzkumeOcq36weoH9Uw5b5bC9MVyX5wexnIbvwNOZXYJa6-SSFY3686kBfnAaBkc6kOsl0A75NOAs2FM59Y1
--f4bb51f5c46d48c

In [114]:
# Submit the prepared application POST through the session.
resp = s.send(preped)

In [122]:
# Inspect the cookies the session is holding at this point.
s.cookies

<RequestsCookieJar[Cookie(version=0, name='ApplyCtx', value='1=112321888&28=2&29=4509&35=Accounts+Payable+Clerk&48=&49=&59=&50=&51=Springfield%2c+NJ&52=Nauticus+Group&56=&65=121279623&69=1_0_0&72=JSON&73=0&77=False', port=None, port_specified=False, domain='.monster.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='__RequestVerificationToken_L3Yy0', value='tu9ik5eX-ytLuewMsBELCkXJ_o9oc-HhAF8PYbVL5pTpk548wKCOShXFmtM1Ezkqv0YOlPrwtiyHyrFowV2Yh5jUiKsNcWYY-MylDqWZWy9YknHV_QNhKPTKejo1', port=None, port_specified=False, domain='jobview.monster.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False), Cookie(version=0, name='atmResolver', value='|Seeker|jobview|58|164|', port=None,