In [1]:
from bs4 import BeautifulSoup # For HTML parsing
from urllib.request import urlopen # Website connections
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [2]:
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = urlopen(website).read() # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(site) # Get the html from the site
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this
    
        
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
        
        
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
        
        
    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
       
        
    text = re.sub("[^a-zA-Z.+3]"," ", str(text))  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++
        
       
    text = text.lower().split()  # Go to lower case and split them apart
        
        
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
        
        
        
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                            # or not on the website)
        
    return text

In [6]:
sample = text_cleaner('http://www.indeed.com/viewjob?jk=5505e59f8e5a32a4&q=%22data+scientist%22&tk=19ftfgsmj19ti0l3&from=web&advn=1855944161169178&sjdu=QwrRXKrqZ3CNX5W-O9jEvWC1RT2wMYkGnZrqGdrncbKqQ7uwTLXzT1_ME9WQ4M-7om7mrHAlvyJT8cA_14IV5w&pub=pub-indeed')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [3]:
final_job = 'data+scientist' # searching for data scientist exact fit("data scientist" on Indeed search)

final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

final_site = ''.join(final_site_list) # Merge the html address together into one string

print (final_site)
base_url = 'http://www.indeed.com'

http://www.indeed.com/jobs?q="data+scientist"


In [6]:
html = urlopen(final_site).read()
soup = BeautifulSoup(html, "lxml")

In [7]:
num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found

In [8]:
num_jobs_area

b'Jobs 1 to 10 of 3,368'

In [9]:
job_numbers = re.findall('\d+', str(num_jobs_area)) # Extract the total jobs found from the search result

In [10]:
if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
else:
    total_num_jobs = int(job_numbers[2]) 

In [11]:
total_num_jobs

3368

In [12]:
num_pages = total_num_jobs/10

In [13]:
job_descriptions1 = []

In [None]:
for i in range(1,50): # Loop through all of our search result pages
    print ('Getting page', i)
    start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
    current_page = ''.join([final_site, '&start=', start_num])
    # Now that we can view the correct 10 job returns, start collecting the text samples from each
        
    html_page = urlopen(current_page).read() # Get the page
        
    page_obj = BeautifulSoup(html_page, "lxml") # Locate all of the job links
    job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
        
    job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a') if link.get('href') is not None] # Get the URLS for the jobs
        
    job_URLS = list(filter(lambda x:'clk' in x, job_URLS)) # Now get just the job related URLS
    for j in range(0,len(job_URLS)):
        final_description = text_cleaner(job_URLS[j])
        if final_description: # So that we only append when the website was accessed correctly
            job_descriptions1.append(final_description)
            #sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
        #print ('Done with collecting the job postings!')  

Getting page 1


0

In [34]:
folder="C:\\Users\\Sriram\\Desktop\\"
text=""
for i in range(1,7):
    f = open(folder+"indeed-"+str(i)+".csv")
    for row in f:
        text+=row.replace("\n"," ")

In [36]:
len(text.split(" "))

231313

In [3]:
import nltk
raw = f.read()

tokens = nltk.word_tokenize(raw)

In [4]:
bgs = nltk.bigrams(tokens)

In [5]:
fdist = nltk.FreqDist(bgs)

In [8]:
for k,v in fdist.items():
    print (k,v)

In [9]:
raw

''

In [10]:
f

<_io.TextIOWrapper name='C:\\Users\\Sriram\\Desktop\\indeed-1.csv' mode='r' encoding='cp1252'>

In [38]:
import nltk
from nltk import bigrams
from nltk import trigrams

In [39]:
# split the texts into tokens
tokens = nltk.word_tokenize(text)
tokens = [token.lower() for token in tokens if len(token) > 1] #same as unigrams
bi_tokens = bigrams(tokens)
tri_tokens = trigrams(tokens)

In [41]:
import operator


unigramsDict = {}
for token in tokens:
  if token not in unigramsDict:
    unigramsDict[token] = 1
  else:
    unigramsDict[token] += 1
bigramsDict = {}
for token in bi_tokens:
  if token not in bigramsDict:
    bigramsDict[token] = 1
  else:
    bigramsDict[token] += 1
trigramsDict = {}
for token in tri_tokens:
  if token not in trigramsDict:
    trigramsDict[token] = 1
  else:
    trigramsDict[token] += 1
sorted_unigrams = sorted(unigramsDict.items(), key=operator.itemgetter(1),reverse=True)
sorted_bigrams = sorted(bigramsDict.items(), key=operator.itemgetter(1),reverse=True)
sorted_trigrams = sorted(trigramsDict.items(), key=operator.itemgetter(1),reverse=True)

In [46]:
for token in bi_tokens:
    print (token)

In [51]:
uniFile=open(folder+"unigrams.csv","w")
for word in sorted_unigrams:
    uniFile.write(word[0]+","+str(word[1])+"\n")

uniFile.close()