My first concern is finding the "apply" link on all the relevant pages. There's substantial heterogeneity in what the links look like, both visually and in the HTML.

What I'm wondering is whether we can come up with a selector that correctly finds the apply button on "enough" of the pages that we can just ignore the rest. Another possibility is that the links themselves are consistent enough that we can look at all the `a` elements and pick out the apply link based on its `href` value. Let's have a look.

In [36]:
import pandas as pd
from lxml import html
import requests

In [37]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("dataset.csv")

In [111]:
# Just for faster exploration: work on a small subset, but keep a handle on
# the complete table in `full_df`.
full_df = df
# NOTE(review): the slice starts at position 1, so the very first row is left
# out of the subset -- confirm that this is intended.
# `.copy()` makes the subset an independent frame, so columns assigned to it
# later are not writes into a slice of `full_df` (which is what makes pandas
# emit SettingWithCopyWarning).
small_df = df[1:100].copy()
df = small_df

First, we need to fetch the actual HTML of the page for each job.

In [112]:
# Download every posting page, one response per row and in row order.
# requests applies no timeout unless one is passed, so a single stalled
# server would otherwise block this cell indefinitely; with the timeout a
# stalled request raises requests.exceptions.Timeout instead.
resps = [requests.get(link, timeout=30) for link in df["link"]]

In [134]:
def has_monster_apply_button(page):
    """Tell whether the page's apply button carries a link of its own.

    Only the first element matching the ``.btn-apply`` CSS selector is
    inspected. The result is True when that element has a non-empty
    ``href`` attribute, and False when no element matches or the ``href``
    is missing or empty (as opposed to an external apply button).
    """
    matches = page.cssselect(".btn-apply")
    if not matches:
        return False
    first_match = matches[0]
    href = first_match.get("href")
    return bool(href)

In [135]:
# Keep the raw HTML of every fetched page, plus the path of the request that
# actually produced the response (this differs from the original link
# whenever the site redirected us).
df.loc[:, "page"] = [r.text for r in resps]
df.loc[:, "actual_link"] = [r.request.path_url for r in resps]

# Parse each page and check for an on-site apply button. The result is
# wrapped in a Series on df's own index: combining a boolean Series with a
# plain Python list through `&` is deprecated in newer pandas versions, and
# an explicit index keeps the three conditions aligned row by row.
has_apply_button = pd.Series(
    [has_monster_apply_button(html.fromstring(p)) for p in df["page"]],
    index=df.index,
    dtype=bool,
)

# a posting is usable if:
#  1. it's part of the "v2" system on monster, which has a more standard, reasonable page structure
#  2. it's not expired
#  3. it's of the sort you can apply to on monster, and not an external system
# Both markers are literal substrings, so regex matching is switched off.
df.loc[:, "usable"] = (
    df["actual_link"].str.contains("v2", regex=False)
    & ~df["actual_link"].str.contains("Expired", regex=False)
    & has_apply_button
)



In [136]:
# Display the frame, which now carries the page, actual_link and usable columns.
df

Unnamed: 0,city,applied?,full,addresses,title,company,resumes,colleges,link,location,lastnames,firstnames,type,original_link,zipcodes,actual_link,page,usable
2,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[1, 12, 4, 8]",\n\nSeasonal Holiday Retail Sales Jewelry Cler...,\nMacys\n,"[1, 3, 0, 2]","[1, 0, 2, 3]",http://jobview.monster.com/seasonal-holiday-re...,"\n\n\nWayne\n ,NJ\n\n","[1, 0, 6, 4]","[4, 6, 3, 9]",clerical,http://jobview.monster.com/seasonal-holiday-re...,"[10, 3, 13, 11]",/seasonal-holiday-retail-sales-jewelry-clerica...,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",False
3,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[7, 6, 12, 17]",\n\nLaboratory Clerical Associate\n\n,\nSovereign Medical Services\n,"[5, 2, 1, 4]","[3, 0, 2, 1]",http://jobview.monster.com/laboratory-clerical...,"\n\n\nFair Lawn\n ,NJ\n\n\n","[2, 6, 7, 9]","[6, 3, 7, 5]",clerical,http://jobview.monster.com/laboratory-clerical...,"[14, 10, 0, 1]",/v2/job/View?JobID=174742599,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,False
4,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[0, 15, 11, 14]",\n\nTemporary Clerical professionals\n\n,\nCompany Confidential\n,"[6, 3, 5, 1]","[2, 3, 0, 1]",http://jobview.monster.com/temporary-clerical-...,"\n\n\nBrooklyn\n ,NY\n\n","[3, 8, 9, 4]","[0, 1, 6, 7]",clerical,http://jobview.monster.com/temporary-clerical-...,"[13, 12, 8, 0]",/v2/job/View?JobID=173095192,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
5,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[2, 6, 11, 13]",\n\nAttendance Coordinator -(Clerical) part ti...,"\nPremier Home Health Care Services, Inc\n","[5, 4, 6, 0]","[3, 2, 0, 1]",http://jobview.monster.com/attendance-coordina...,"\n\n\nNew York City\n ,NY\n\n\n","[1, 4, 5, 0]","[1, 2, 0, 6]",clerical,http://jobview.monster.com/attendance-coordina...,"[7, 16, 2, 12]",/v2/job/View?JobID=175551323,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
6,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[10, 13, 9, 1]",\n\nOffice/Bookkeeper Clerk\n\n,\nCiampa Management\n,"[4, 3, 6, 0]","[1, 0, 3, 2]",http://jobview.monster.com/office-bookkeeper-c...,"\n\n\nFlushing\n ,NY\n\n\n","[7, 8, 5, 3]","[0, 9, 8, 7]",clerical,http://jobview.monster.com/office-bookkeeper-c...,"[2, 4, 6, 7]",/v2/job/View?JobID=175927606,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
7,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[4, 17, 14, 13]",\n\nOffice Clerk\n\n,"\nInter American Cosmetics, Inc.\n","[7, 2, 0, 6]","[3, 0, 2, 1]",http://jobview.monster.com/office-clerk-job-lo...,"\n\n\nLodi\n ,NJ\n\n\n","[8, 0, 1, 7]","[3, 6, 9, 0]",clerical,http://jobview.monster.com/office-clerk-job-lo...,"[18, 14, 0, 3]",/v2/job/View?JobID=172842490,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
8,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[17, 7, 9, 8]",\n\nBD-13478 - Accounting Clerk 2 Position in ...,"\nPROGRAMMER RESOURCES INTERNATIONAL, INC\n","[6, 1, 3, 0]","[0, 2, 1, 3]",http://jobview.monster.com/bd-13478-accounting...,"\n\n\nFranklin Lakes\n ,NJ\n\n\n","[1, 4, 3, 7]","[9, 3, 0, 2]",clerical,http://jobview.monster.com/bd-13478-accounting...,"[13, 6, 12, 9]",/v2/job/View?JobID=175740073,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
9,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[2, 9, 17, 11]",\n\nBuyer Clerical\n\n,"\nRainbow USA, Inc\n","[7, 3, 0, 6]","[3, 2, 0, 1]",http://jobview.monster.com/buyer-clerical-job-...,"\n\n\nBrooklyn\n ,NY\n\n\n","[7, 8, 5, 9]","[5, 7, 0, 9]",clerical,http://jobview.monster.com/buyer-clerical-job-...,"[2, 10, 6, 16]",/v2/job/View?JobID=175788974,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
10,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[12, 0, 14, 13]",\n\nAdministrative/Clerical Assistant AA2136-1...,\nThe Cambridge Don\n,"[2, 6, 4, 5]","[1, 0, 2, 3]",http://jobview.monster.com/administrative-cler...,"\n\n\nNew York City\n ,NY\n\n","[8, 6, 3, 0]","[7, 2, 5, 6]",clerical,http://jobview.monster.com/administrative-cler...,"[7, 4, 9, 10]",/v2/job/Expired?JobID=175391943,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,False
11,New York,0,"<article class=""js_result_row"" itemscope="""" it...","[1, 5, 15, 14]",\n\nCourt Clerk\n\n,\nMeridian Legal Search\n,"[0, 2, 3, 6]","[0, 3, 1, 2]",http://jobview.monster.com/court-clerk-job-new...,"\n\n\nNew York City\n ,NY\n\n\n","[9, 0, 6, 4]","[3, 2, 8, 4]",clerical,http://jobview.monster.com/court-clerk-job-new...,"[5, 0, 17, 13]",/v2/job/View?JobID=174845421,<!DOCTYPE html>\r\n\r\n\r\n\r\n<html>\r\n<head...,True
