In [348]:
import functools
import itertools
import os
import re
import string
import sys
import time
import urllib.request as ur
from multiprocessing.pool import ThreadPool
from urllib.error import HTTPError

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xlsxwriter
from bs4 import BeautifulSoup as bts
# Base URL prepended to the relative job links found on result pages.
indeed = "https://www.indeed.com"
# Pipeline stage name -> 1-based position, used by `monitor` for the "[n of N]" label.
fName = {'scrape':1,'addToExcel':2,'readFromExcel':3,'extractFromHome':4,'extractPages':5,'processKeywords':6}

def monitor(f):
    """Decorate a pipeline stage so a console spinner is shown while it runs.

    The wrapped function is executed on a worker thread; the calling thread
    redraws a one-line status ("[time]: Running <name> [n of N] <spinner>")
    until the worker finishes.

    Args:
        f: the callable to wrap. Its ``__name__`` is looked up in ``fName`` to
            obtain the stage number; names not listed there are shown as ``?``.

    Returns:
        A wrapper that forwards all positional and keyword arguments to ``f``
        and returns whatever ``f`` returns.

    Raises:
        Any exception raised by ``f`` is re-raised in the calling thread.
    """
    @functools.wraps(f)
    def f_(*args, **kwargs):
        countx = fName.get(f.__name__, '?')
        spinner = itertools.cycle("|/~\\")
        # One task is submitted, so one worker suffices; the context manager
        # tears the pool down even if the stage fails.
        with ThreadPool(processes=1) as pool:
            t = pool.apply_async(f, args, kwargs)
            try:
                while not t.ready():
                    sys.stdout.write("\r[%s]: Running %s [%s of %s] %s" % (time.ctime(), f.__name__, countx, len(fName), next(spinner)))
                    sys.stdout.flush()
                    # wait() returns as soon as the task finishes, so a fast
                    # stage is not held up for the full half second.
                    t.wait(.5)
            finally:
                sys.stdout.write("\n")
            # get() hands back the stage's return value and re-raises its
            # exception, if any, instead of silently discarding both.
            return t.get()
    return f_


@monitor
def scrape(value=None):
    """Download Indeed search result pages and collect the job-card elements.

    Each profile is paired with the location at the same index, and every
    pair is queried once per page for ``itr`` pages.

    Args:
        value: unused; accepted so the stage can be invoked with one
            positional argument like the other stages.

    Returns:
        list: every ``div`` with class ``jobsearch-SerpJobCard`` found on the
        downloaded pages, in request order.

    Raises:
        urllib.error.URLError / HTTPError: if a request fails (not caught here).
    """
    itr = 1  # number of result pages to fetch per (profile, location) query
    profiles = ['business+analyst','data+scientist','data+analyst','quantitative+analyst','quantitative+researcher','product+analyst','business+analyst','business+intelligence+analyst','data+engineer','statistical+analyst']
    locations = ['New+York','Mountain+View','Los+Angeles','San+Jose','San+Francisco','San+Diego','Sacramento','Miami','Atlanta','Chicago']
    contents = []
    for page in range(itr):
        # The offset depends only on the page index, so every query starts at
        # 0 and advances by 10 per page; it no longer accumulates across
        # unrelated profiles or penalises a profile that appears twice.
        start = page * 10
        for profile, location in zip(profiles, locations):
            url = indeed + '/jobs?q=' + profile + '&l=' + location + '&start=' + str(start)
            # Parse inside the `with` so the response is fully read before
            # the connection is closed.
            with ur.urlopen(url) as indeed_home:
                soup_home = bts(indeed_home, "html.parser")
            contents.extend(soup_home.find_all('div', attrs={'class': 'jobsearch-SerpJobCard'}))
    return contents

In [349]:
@monitor
def addToExcel(contents):
    """Write the scraped items to the first column of ``Rawdata.xlsx``.

    Cell (0, 0) holds the header ``Data``; each item is stored below it as
    ``str(item)``, one per row.

    Args:
        contents: iterable of scraped items; each is converted with ``str``.

    Returns:
        None
    """
    workbook = xlsxwriter.Workbook('Rawdata.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        # Always emit the header, even for an empty scrape, so the sheet
        # still exposes a 'Data' column to whoever reads it back.
        worksheet.write(0, 0, 'Data')
        for row, item in enumerate(contents, start=1):
            worksheet.write(row, 0, str(item))
    finally:
        # Release the file even if a write fails part-way through.
        workbook.close()
    return

In [350]:
@monitor
def readFromExcel(value=None):
    """Load the ``Data`` column of ``Rawdata.xlsx`` as a Python list.

    Args:
        value: unused; accepted so the stage can be invoked with one
            positional argument like the other stages.

    Returns:
        list: the values of the ``Data`` column of the first sheet.

    Raises:
        KeyError: if the sheet has no ``Data`` column.
    """
    # `sheet_name` is the current keyword; the old `sheetname` spelling is
    # rejected by recent pandas releases. 0 selects the first sheet.
    df = pd.read_excel('Rawdata.xlsx', sheet_name=0)
    return df['Data'].tolist()

In [351]:
# Placeholder stored in the CSV whenever a field cannot be found on a job card.
_NOT_AVAILABLE = 'Not Available'


def _text_or_na(parent, tag_name, css_class):
    """Return the stripped text of the first ``tag_name`` with ``css_class`` under ``parent``, or the placeholder."""
    if parent is None:
        return _NOT_AVAILABLE
    node = parent.find(tag_name, attrs={'class': css_class})
    if node is None:
        return _NOT_AVAILABLE
    return node.get_text().strip()


def _parse_job_card(markup):
    """Convert one job-card HTML string into a row in the CSV column order."""
    card = bts(markup, "html.parser")

    first_div = card.div
    div_id = first_div.get('id', _NOT_AVAILABLE) if first_div is not None else _NOT_AVAILABLE

    # The title lives in a div.title or an h2.title, whichever is present;
    # its first anchor carries both the title text and the relative job link.
    title_box = card.find('div', attrs={'class': 'title'})
    if title_box is None:
        title_box = card.find('h2', attrs={'class': 'title'})
    anchor = title_box.find('a') if title_box is not None else None
    if anchor is not None:
        title = anchor.get('title', _NOT_AVAILABLE)
        href = anchor.get('href')
        link_url = indeed + href if href else _NOT_AVAILABLE
    else:
        title = _NOT_AVAILABLE
        link_url = _NOT_AVAILABLE

    if card.find('div', attrs={'class': 'jobCardReqContainer'}) is not None:
        # Every requirement is prefixed with its own '#', giving "#a#b"
        # rather than stacking all the markers at the front.
        requirements = ''.join(
            '#' + node.get_text().strip()
            for node in card.find_all('div', attrs={'class': 'jobCardReqList'})
        )
    else:
        requirements = _NOT_AVAILABLE

    company_box = card.find('div', attrs={'class': 'sjcl'})
    company = _text_or_na(company_box, 'span', 'company')
    # The location is looked up as a span first, then as a div.
    location = _text_or_na(company_box, 'span', 'location')
    if location == _NOT_AVAILABLE:
        location = _text_or_na(company_box, 'div', 'location')
    review = _text_or_na(company_box, 'span', 'slNoUnderline')

    salary = _text_or_na(card.find('div', attrs={'class': 'salarySnippet'}), 'span', 'salary')
    summary_description = _text_or_na(card, 'div', 'summary')
    post_date = _text_or_na(card.find('div', attrs={'class': 'result-link-bar-container'}), 'span', 'date')

    return [div_id, title, company, location, summary_description, requirements, salary, review, post_date, str(link_url)]


@monitor
def extractFromHome(contents):
    """Parse job-card HTML strings and save the extracted fields to ``records_home.csv``.

    Any field that cannot be located on a card is stored as ``Not Available``
    instead of aborting the run. After the CSV is written, ``Rawdata.xlsx``
    is deleted if it exists.

    Args:
        contents: iterable of job-card HTML markup strings.

    Returns:
        None
    """
    cols = ['Div_Id','Title','Company','Location','Summary Description','Requirements','Salary','Reviews','Date Posted','URL']
    lst = [_parse_job_card(content) for content in contents]
    dfa = pd.DataFrame(lst, columns=cols)
    dfa.to_csv('records_home.csv',index=False)
    # The cards were passed in as an argument, so a missing intermediate
    # workbook is not an error at this point.
    if os.path.exists('Rawdata.xlsx'):
        os.remove('Rawdata.xlsx')
    return

In [352]:
@monitor
def extractPages(value=None):
    """Fetch each job's detail page and add its text as a ``Description`` column.

    Reads ``records_home.csv``, downloads every value of the ``URL`` column
    that looks like an HTTP(S) address, extracts the text of the
    ``div.jobsearch-JobComponent-description`` element, and rewrites the CSV
    with the new column. Rows whose page cannot be fetched or has no such
    element get ``Page Content Not Available``.

    Args:
        value: unused; accepted so the stage can be invoked with one
            positional argument like the other stages.

    Returns:
        None
    """
    result = pd.read_csv("records_home.csv")
    descriptions = []
    for url in result['URL']:
        page_content = None
        # Values read back from CSV are never None: a missing URL arrives as
        # NaN or as a placeholder string, neither of which urlopen accepts.
        if isinstance(url, str) and url.startswith('http'):
            try:
                with ur.urlopen(url) as page:
                    soup = bts(page, "html.parser")
                page_content = soup.find('div', attrs={'class': 'jobsearch-JobComponent-description'})
            # `HTTPError` is the name imported at the top of the file; the
            # bare module name `urllib` is never bound there.
            except HTTPError as err:
                print(err.code)
        if page_content is not None:
            descriptions.append(page_content.get_text().strip())
        else:
            descriptions.append('Page Content Not Available')

    result['Description'] = descriptions
    result.to_csv('records_home.csv',index=False)
    return

In [353]:
# Skill label written to the CSV -> spellings counted under that label.
# Longer spellings are listed first so the alternation consumes
# "ruby on rails" or "database management" as a single mention instead of
# also matching the shorter word inside it.
_SKILL_ALIASES = [
    ('Python', ['python']),
    ('Sql', ['sql']),
    ('Java', ['java']),
    ('R', ['r']),
    ('Ruby', ['ruby on rails', 'ruby']),
    ('Matlab', ['matlab']),
    ('SAS', ['sas']),
    ('Hadoop', ['hadoop']),
    ('Hive', ['hive']),
    ('Spark', ['spark']),
    ('Pig', ['pig']),
    ('HBase', ['hbase']),
    ('Tableau', ['tableau']),
    ('Spotfire', ['spotfire']),
    ('Alteryx', ['alteryx']),
    ('Excel', ['excel']),
    ('MapReduce', ['mapreduce']),
    ('DBMS', ['database management', 'database', 'dbms']),
]

# Compiled once: whole-word, case-insensitive matchers, one per skill label.
_SKILL_PATTERNS = [
    (label, re.compile(r"\b(?:" + "|".join(re.escape(alias) for alias in aliases) + r")\b", re.I))
    for label, aliases in _SKILL_ALIASES
]


def _count_skills(text):
    """Return ``{label: mentions}`` for every skill that occurs in ``text``; non-string input yields ``{}``."""
    if not isinstance(text, str):
        # An empty description comes back from the CSV as NaN, not as a string.
        return {}
    counts = {}
    for label, pattern in _SKILL_PATTERNS:
        hits = len(pattern.findall(text))
        if hits:
            counts[label] = hits
    return counts


@monitor
def processKeywords(value=None):
    """Count skill keywords in each job description and store them in a ``Skills`` column.

    Reads ``records_home.csv``, scans the ``Description`` column for the
    skills listed in ``_SKILL_ALIASES`` (case-insensitive, whole words), and
    rewrites the CSV with one dict of ``{skill: count}`` per row. Alternate
    spellings are folded into one label regardless of letter case, and each
    textual mention is counted once.

    Args:
        value: unused; accepted so the stage can be invoked with one
            positional argument like the other stages.

    Returns:
        None
    """
    csv_data = pd.read_csv("records_home.csv")
    csv_data['Skills'] = [_count_skills(text) for text in csv_data['Description']]
    csv_data.to_csv('records_home.csv',index=False)
    return

In [354]:
def main():
    """Run the six pipeline stages in order and announce completion.

    The result of ``scrape`` is handed to ``addToExcel`` and the result of
    ``readFromExcel`` is handed to ``extractFromHome``; the remaining stages
    are called with an empty-string placeholder argument.
    """
    placeholder = ''
    scraped = scrape(placeholder)
    addToExcel(scraped)
    extractFromHome(readFromExcel(placeholder))
    extractPages(placeholder)
    processKeywords(placeholder)
    print('CSV File is Ready!')


# Execute the pipeline when this code runs as the top-level module.
if __name__ == "__main__":
    main()

[Wed Apr 15 16:38:57 2020]: Running scrape [1 of 6] ~
[Wed Apr 15 16:38:57 2020]: Running addToExcel [2 of 6] |
[Wed Apr 15 16:38:58 2020]: Running readFromExcel [3 of 6] |
[Wed Apr 15 16:38:58 2020]: Running extractFromHome [4 of 6] |
[Wed Apr 15 16:41:10 2020]: Running extractPages [5 of 6] \
[Wed Apr 15 16:41:11 2020]: Running processKeywords [6 of 6] /
CSV File is Ready!
