In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import datetime
import os.path

from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords

In [2]:
# Module-level scraping configuration. These names used to be published from
# inside the class body through a `global` statement; they stay at module scope
# under the same names so any other cell that reads them keeps working.
PAGE = 'https://www.indeed.com/jobs?'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
stopLex = set(stopwords.words('english'))  # English stop words dropped before counting
NEXT_PAGE_VAL = 10  # amount added to the 'start' query parameter per result page


class JobSearch:
    """Scrape job postings from the search page at PAGE and export them as CSV.

    Instance attributes:
        searchCriteria -- dict of query parameters sent with every search
                          request ('q', 'l', 'explvl', 'sort', 'start').
        jobData        -- list of rows, one per posting, in COLUMNS order.
        numPages       -- number of result pages getJobData will request.
    """

    # Column names for the exported frame; the order matches the row built
    # at the end of getJobDetails.
    COLUMNS = ['JobTitle', 'Company', 'Location', 'Python', 'SQL', 'Masters',
               'Experience', 'Keywords', 'JobLink']

    def __init__(self, q, l, explvl, sort, numPages = 1):
        """Store the search parameters.

        q, l, explvl and sort are passed through unchanged as query
        parameters of the same names; numPages is how many result pages
        getJobData will request.
        """
        # Both containers are created per instance. As class attributes they
        # were shared by every JobSearch object, so a second search would
        # overwrite the first one's criteria and append to its rows.
        self.searchCriteria = {'q': q, 'l': l, 'explvl': explvl, 'sort': sort, 'start': 0}
        self.jobData = []
        self.numPages = numPages

    def _asciiText(self, chunk, default):
        """Return the ASCII-only text of a parsed tag, or `default` if the tag is None."""
        if chunk is None:
            return default
        # Drop non-ASCII characters, then decode again so the result is text
        # (not bytes) on both Python 2 and Python 3.
        return chunk.text.encode('ascii', 'ignore').decode('ascii')

    def getJobDetails(self,link):
        """Download one posting and condense it into a single row.

        Returns a list laid out as COLUMNS:
        [title, company, location, count of 'python', count of 'sql',
        count of 'masters', experience 4-grams, three most frequent other
        tokens, link]. Title, company and location fall back to 'NA' when
        their tag is absent. Returns None when the response body is empty.
        """
        skillDict = {'python':0, 'sql':0, 'masters':0}
        keyWords = {}
        exp = ('year', 'years')
        nos = set(['1','2','3','4','5','6','7','8','9','10',
                   'one','two','three','four','five','six','seven','eight','nine','ten'])

        response = requests.get(link, headers = HEADERS)
        html = response.content
        if not html:
            return None
        soup = BeautifulSoup(html, 'html.parser')

        title = self._asciiText(soup.find('b', {'class':'jobtitle'}), 'NA')
        comp = self._asciiText(soup.find('span', {'class':'company'}), 'NA')
        loc = self._asciiText(soup.find('span', {'class':'location'}), 'NA')

        # A posting without a summary span used to raise AttributeError here;
        # it is now treated as having an empty description.
        text = self._asciiText(soup.find('span', {'class':'summary'}), '')
        text = text.lower().strip()
        text = re.sub('[^a-z0-9]', ' ', text)  # keep only lowercase letters and digits

        unfiltered_tokens = word_tokenize(text)
        tokens = [token for token in unfiltered_tokens if token not in stopLex and token != ' ']

        # Keep 4-grams that end in "year"/"years" and contain a number (digit
        # or word) in one of the three preceding positions.
        experience = [gram for gram in ngrams(tokens, 4)
                      if gram[3] in exp and (gram[0] in nos or gram[1] in nos or gram[2] in nos)]

        # Tracked skills are counted separately; every other token is a
        # candidate keyword.
        for word in tokens:
            if word in skillDict:
                skillDict[word] += 1
            else:
                keyWords[word] = keyWords.get(word, 0) + 1

        top3keys = [i[0] for i in sorted(keyWords.items(), key = lambda x: x[1], reverse = True)[0:3]]
        return [title, comp, loc, skillDict['python'], skillDict['sql'], skillDict['masters'],
                experience, top3keys, link]

    def getJobData(self):
        """Request numPages result pages and append one row per posting to jobData.

        After each page whose body is non-empty, searchCriteria['start'] is
        advanced by NEXT_PAGE_VAL; a page with an empty body is reported and
        skipped without advancing.
        """
        for i in range(self.numPages):
            response = requests.get(PAGE, headers = HEADERS, params = self.searchCriteria)
            html = response.content

            if i == 0:
                where = 'main page... %s' % response.url
            else:
                where = 'page %d...%s' % (i, response.url)

            if not html:
                print('Failure in getting ' + where)
                continue
            print('Success in getting ' + where)
            self.searchCriteria['start'] += NEXT_PAGE_VAL

            soup = BeautifulSoup(html, 'html.parser')
            jobs = soup.findAll('a', {'data-tn-element':'jobTitle'})
            print('Number of jobs parsed:  %d' % len(jobs))

            for job in jobs:
                href = job.get('href')
                if not href:
                    # An anchor without href cannot be followed (and used to
                    # raise TypeError on string concatenation).
                    continue
                details = self.getJobDetails("http://www.indeed.com" + href)
                if details is not None:
                    # Skip postings whose page came back empty so jobData only
                    # ever holds complete rows.
                    self.jobData.append(details)

    def makeJobFile(self,outputLoc):
        """Write jobData to '<outputLoc><Month><day><location>.csv'.

        outputLoc is used as a plain string prefix, so it must end with a
        path separator when it names a directory. If the target file already
        exists the user is asked for confirmation before it is replaced.
        """
        # Passing the column names to the constructor also works when no rows
        # were collected; assigning df.columns afterwards raised in that case.
        df = pd.DataFrame(self.jobData, columns = self.COLUMNS)
        print(df.shape)

        now = datetime.datetime.now()
        filename = now.strftime("%B") + str(now.day) + self.searchCriteria['l']
        output = outputLoc + filename + '.csv'

        if os.path.exists(output):
            try:
                ask = raw_input  # Python 2
            except NameError:
                ask = input      # Python 3
            choice = ask('File already exists..Do you want to replace[y/n]')
            if choice != 'y':
                print('File not overwritten')
                return

        # index=False (the boolean): the former string 'False' is truthy, so
        # the row index was still being written to the file.
        df.to_csv(output, index = False)
        print('Output file written to  %s' % output)

    def printData(self):
        """Print the current search criteria and the requested page count."""
        print('The Search criteria is \n%s' % (self.searchCriteria,))
        print('The user requested for {s} page{plural_s} '.format(s=self.numPages,plural_s=('' if self.numPages == 1 else 's')))
                      
# Prefix for the exported CSV. makeJobFile concatenates the file name directly
# onto this string, so the trailing separator is required.
output_loc = 'C:\\Users\\AnalyticsGirl\\'

# Entry-level "Data Analyst" postings in New Jersey, sorted by date, 3 result pages.
j1 = JobSearch(q='Data Analyst', l='NewJersey', explvl='entry_level', sort='date', numPages=3)
j1.printData()              # echo the criteria before any request is made
j1.getJobData()             # fetch the result pages and each linked posting
j1.makeJobFile(output_loc)  # export the collected rows to CSV

The Search criteria is 
{'q': 'Data Analyst', 'sort': 'date', 'l': 'NewJersey', 'start': 0, 'explvl': 'entry_level'}
The user requested for 3 pages 
Success in getting main page... https://www.indeed.com/jobs?q=Data+Analyst&sort=date&l=NewJersey&start=0&explvl=entry_level
Number of jobs parsed:  13
Success in getting page 1...https://www.indeed.com/jobs?q=Data+Analyst&sort=date&l=NewJersey&start=10&explvl=entry_level
Number of jobs parsed:  15
Success in getting page 2...https://www.indeed.com/jobs?q=Data+Analyst&sort=date&l=NewJersey&start=20&explvl=entry_level
Number of jobs parsed:  15
(43, 9)
File already exists..Do you want to replace[y/n]y
Output file written to  C:\Users\AnalyticsGirl\April12NewJersey.csv
