# Coding Exercise 12

In [15]:
import os
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

## Task 1
The following code creates a single pandas dataframe of all the data scraped from May 22, 2022. The rows with missing titles and/or job descriptions are dropped. It also uses 'scacy' to tokenize the job descriptions and reports the number of unique tokens as well as the distribution of token tags.

In [16]:
#using code from take home #1
dfs=[]
headers=['URL','Job Title','Company','Company URL','Company Location','Job Description']
for filename in os.scandir('indeed_scraped_data/job_info_data'):
    if filename.is_file() and '5222022' in filename.path:
        if '.csv' in filename.path:
            data=pd.read_csv(filename.path,delimiter=',')
        elif '.json' in filename.path:
            use=[]
            with open(filename.path) as file:
                data=json.load(file)
                for i in data:
                    for item in data[i]:
                        if(len(item)==6):#only include those with all 6 fields
                            use.append(item)
                data=pd.DataFrame(use)
        data.columns=headers #make headers match
        dfs.append(data)
df=pd.concat(dfs) #combine into a single dataframe
df=df[df["Job Title"].notnull()]
df=df[df["Job Description"].notnull()].reset_index(drop=True)
print(df)

                                                    URL  \
0     https://www.indeed.com/rc/clk?jk=069416ccc58dc...   
1     https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...   
2     https://www.indeed.com/rc/clk?jk=263b35a70c9a8...   
3     https://www.indeed.com/rc/clk?jk=e15dbd7ec34e6...   
4     https://www.indeed.com/rc/clk?jk=d0e4f6dc1721c...   
...                                                 ...   
4798  https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...   
4799  https://www.indeed.com/rc/clk?jk=083fd9632b6d3...   
4800  https://www.indeed.com/rc/clk?jk=812033258b28a...   
4801  https://www.indeed.com/rc/clk?jk=a9c6193cec392...   
4802  https://www.indeed.com/rc/clk?jk=6f4a281dca3ad...   

                                              Job Title  \
0               Maintenance Controller (A&P) Technician   
1                                       Program Manager   
2              Senior Hydrogeologist, Boulder, Colorado   
3                         Contract Administration Mgr I

In [20]:
##loop through each job and grab the job description to tokenize them
unique=set()
tag={}
for i in df.index:
    info=df.iloc[i]['Job Description']
    for token in nlp(info):
        unique.add(token.text)
        tag[token.text]=token.pos_
        
print('Number of unique tokens:',len(unique))

token_df=pd.DataFrame(columns=['Token','Tag'])
token_df['Token']=tag.keys()
token_df['Tag']=tag.values()
print('The distribution of tags among them:')
print(token_df['Tag'].value_counts())

Number of unique tokens: 39150
The distribution of tags among them:
PROPN    15204
NOUN     11208
VERB      4613
ADJ       3267
NUM       3206
ADV        819
X          198
ADP        142
PRON       112
PUNCT       83
SPACE       77
AUX         65
SCONJ       48
DET         45
INTJ        22
CCONJ       18
PART        14
SYM          9
Name: Tag, dtype: int64


## Task 2
The following code constructs a token/term frequency dictionary and saves the results in a json file. It also reports the top 10 most common tokens.

In [28]:
freq={}
for i in df.index:
    info=df.iloc[i]['Job Description']
    for token in nlp(info):
        if str(token) in freq:
            freq[str(token)]+=1
        else:
            freq[str(token)]=1

In [32]:
import json
with open('exercise12.json','w') as outfile:
    json.dump(freq,outfile)

df=pd.DataFrame(columns=['Token','Frequency'])
df['Token']=freq.keys()
df['Frequency']=freq.values()
df=df.sort_values('Frequency',ascending=False).reset_index(drop=True)
print(df[:10])

  Token  Frequency
0     ,     254381
1   and     240686
2    \n     227694
3     .     172173
4    to     119440
5   the     100408
6    of      97891
7    in      60798
8  with      50655
9     a      49613
