In [1]:
import pandas as pd
import re

In [2]:
# Combine the new and cumulative data sets
cumulative_job_posting_df = pd.read_csv("../Data/Job_posting_data_cumulative.csv")
new_job_posting_df = pd.read_csv("../Data/Job_posting_data_current.csv")

total_job_posting_df = pd.concat([cumulative_job_posting_df, new_job_posting_df])

# Show the new cumulative data set
print(total_job_posting_df)

                                             Job Title  \
0                           route sales representative   
1                                ultrasound technician   
2                     optometric ophthalmic technician   
3                                           controller   
4                                      account manager   
..                                                 ...   
850                                  Warehouse Manager   
851                                    Delivery Driver   
852  Executive Assistant/Administrative Specialist ...   
853                       NUTRITIONIST-HEALTH EDUCATOR   
854        Access Control Officer (Part-Time) - Savina   

                              Company Name  \
0                  frito lay north america   
1                    proximity diagnostics   
2                 imperial beach optometry   
3                                 filtrous   
4                            aya corporate   
..                         

In [3]:
# Drop rows with blank "job title" or "company name"

total_job_posting_df.dropna(subset=['Job Title', 'Company Name'], inplace=True)

In [4]:
# Format the "job title" and "company name" columns
total_job_posting_df['Job Title'] = total_job_posting_df['Job Title'].str.replace('/', ' ')
total_job_posting_df['Job Title'] = total_job_posting_df['Job Title'].str.replace('-', ' ')
total_job_posting_df['Job Title'] = total_job_posting_df['Job Title'].str.lower()

total_job_posting_df['Company Name'] = total_job_posting_df['Company Name'].str.replace('/', ' ')
total_job_posting_df['Company Name'] = total_job_posting_df['Company Name'].str.replace('-', ' ')
total_job_posting_df['Company Name'] = total_job_posting_df['Company Name'].str.lower()
print(total_job_posting_df)

                                             Job Title  \
0                           route sales representative   
1                                ultrasound technician   
2                     optometric ophthalmic technician   
3                                           controller   
4                                      account manager   
..                                                 ...   
850                                  warehouse manager   
851                                    delivery driver   
852  executive assistant administrative specialist ...   
853                       nutritionist health educator   
854        access control officer (part time)   savina   

                              Company Name  \
0                  frito lay north america   
1                    proximity diagnostics   
2                 imperial beach optometry   
3                                 filtrous   
4                            aya corporate   
..                         

# Here, we remove the duplicate rows, (ignoring the "Date Posted" b/c the same job can be posted on multiple days)


In [5]:
#Find duplicate job postings
total_job_posting_df.loc[total_job_posting_df.duplicated(subset=['Job Title', 'Company Name', 'Location', 'Remote', 'Salary'])]

Unnamed: 0,Job Title,Company Name,Location,Remote,Salary,Full Time,Part Time,Date Posted
1592,cashier,the cheesecake factory,"San Diego, CA 92108",False,,False,False,2022-01-20
1982,utility and maintenance specialist i ii syste...,city of encinitas,"Encinitas, CA 92024",False,"$5,157 - $7,596 a month",False,False,2022-01-20
2023,meter reader,american water,"Imperial Beach, CA",False,,False,False,2022-01-20
2049,joiner,aerotek,"San Diego, CA",False,$16 - $22 an hour,False,False,2022-01-20
2292,inspection tenchnician,la solar group inc,"San Diego, CA",False,$17 - $24 an hour,False,False,2022-01-20
...,...,...,...,...,...,...,...,...
840,valet attendant $$$$$$$$$$$$ tips,ace parking management sara,"Coronado, CA\n+8 locations",False,From $15 an hour,False,True,2022-01-28
841,speech language pathology assistant (slpa),speech improvement center,"Chula Vista, CA\n+2 locations",False,"$40,000 - $60,000 a year",True,False,2022-01-28
843,diagnostic imaging technical lead virtual hiri...,inova health system,United States,False,,True,False,2022-01-28
847,customer service rep self storage mgr,public storage,"National City, CA\n+7 locations",False,$16 an hour,True,False,2022-01-28


In [6]:
# Remove duplicate job postings
total_job_posting_df.drop_duplicates(subset=['Job Title', 'Company Name', 'Location', 'Remote', 'Salary', 'Full Time', 'Part Time'],inplace=True, keep='last')
print(str(len(total_job_posting_df)) + " unique job postings")

8046 unique job postings


In [7]:
# Save the new cumulative data set
total_job_posting_df.to_csv('../Data/Job_posting_data_cumulative.csv', index=False)


In [8]:
job_salaries = total_job_posting_df['Salary']
print(job_salaries)

0                   $1,179 a week
1          $1,800 - $3,200 a week
2               $10 - $18 an hour
3      $100,000 - $150,000 a year
4      $100,000 - $175,000 a year
                  ...            
850                           NaN
851             $18 - $20 an hour
852                           NaN
853                           NaN
854                   $19 an hour
Name: Salary, Length: 8046, dtype: object


In [9]:
total_job_posting_df['Salary Type'] = total_job_posting_df['Salary'].str.split().str[-1]
display(total_job_posting_df['Salary Type'].unique())

array(['week', 'hour', 'year', 'month', 'day', nan, 'mile', 'session'],
      dtype=object)

In [10]:
# First, remove commas from Salary
total_job_posting_df['Salary'] = total_job_posting_df['Salary'].str.replace(',','')
print(total_job_posting_df)

                                             Job Title  \
0                           route sales representative   
1                                ultrasound technician   
2                     optometric ophthalmic technician   
3                                           controller   
4                                      account manager   
..                                                 ...   
850                                  warehouse manager   
851                                    delivery driver   
852  executive assistant administrative specialist ...   
853                       nutritionist health educator   
854        access control officer (part time)   savina   

                              Company Name  \
0                  frito lay north america   
1                    proximity diagnostics   
2                 imperial beach optometry   
3                                 filtrous   
4                            aya corporate   
..                         

In [11]:
def get_salary_raw_number(row):
    # See if the salary data is a string
    if isinstance(row, str):
        # Find the word "Estimated"
        estimated = row.find("Estimated")
        if estimated != -1:
            # Find the raw number for salary $##.#K
            raw_number = re.search('(\d+(\.\d{1,2})?)K', row).group()
            raw_number = raw_number[:-1]
            raw_number = 1000 * float(raw_number)
        else:
            raw_number = re.search('(\d+)', row).group()
            raw_number = float(raw_number)
    else:
        return None
    return raw_number

In [12]:
# Get salary number to compute yearly salary

total_job_posting_df['Salary (Raw Number)'] = total_job_posting_df['Salary'].apply(lambda x: get_salary_raw_number(x))
print(total_job_posting_df)

                                             Job Title  \
0                           route sales representative   
1                                ultrasound technician   
2                     optometric ophthalmic technician   
3                                           controller   
4                                      account manager   
..                                                 ...   
850                                  warehouse manager   
851                                    delivery driver   
852  executive assistant administrative specialist ...   
853                       nutritionist health educator   
854        access control officer (part time)   savina   

                              Company Name  \
0                  frito lay north america   
1                    proximity diagnostics   
2                 imperial beach optometry   
3                                 filtrous   
4                            aya corporate   
..                         

In [13]:
def compute_yearly_salary(x):
    yearly_salary = None
    if x['Salary Type'] == "hour":
        yearly_salary = 2087 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "day":
        yearly_salary = 261 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "week":
        yearly_salary = 52 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "month":
        yearly_salary = 12 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "year":
        yearly_salary = int(x['Salary (Raw Number)'])
    else:
        yearly_salary = None
    return yearly_salary
        

In [14]:
total_job_posting_df['Annual Salary'] = total_job_posting_df.apply(compute_yearly_salary, axis=1)

# TODO: Format the Annual Salary column to currency
#job_posting_df['Annual Salary'] = job_posting_df['Annual Salary'].format("$*:")
display(total_job_posting_df)
                                                      

Unnamed: 0,Job Title,Company Name,Location,Remote,Salary,Full Time,Part Time,Date Posted,Salary Type,Salary (Raw Number),Annual Salary
0,route sales representative,frito lay north america,"San Diego, CA 92123 \n(Kearny Mesa area)\n+1 l...",False,$1179 a week,True,False,1/7/2022,week,1179.0,61308.0
1,ultrasound technician,proximity diagnostics,"San Diego, CA",False,$1800 - $3200 a week,False,True,1/7/2022,week,1800.0,93600.0
2,optometric ophthalmic technician,imperial beach optometry,"Imperial Beach, CA 91932\n+1 location",False,$10 - $18 an hour,True,False,1/7/2022,hour,10.0,20870.0
3,controller,filtrous,"Poway, CA 92064",False,$100000 - $150000 a year,True,False,1/7/2022,year,100000.0,100000.0
4,account manager,aya corporate,"San Diego, CA\n+18 locations",False,$100000 - $175000 a year,False,False,1/7/2022,year,100000.0,100000.0
...,...,...,...,...,...,...,...,...,...,...,...
850,warehouse manager,behind the scenes catering and events,"San Diego, CA",False,,True,False,2022-01-28,,,
851,delivery driver,le parfait paris,"San Diego, CA 92120",False,$18 - $20 an hour,False,True,2022-01-28,hour,18.0,37566.0
852,executive assistant administrative specialist ...,"woongjin, inc","San Diego, CA",False,,True,False,2022-01-28,,,
853,nutritionist health educator,family health centers of san diego,"San Diego, CA 92102\n(Mt. Hope area)\n+1 location",False,,False,True,2022-01-28,,,


In [15]:
# Drop the unneccessary column
total_job_posting_df.pop('Salary (Raw Number)')

0        1179.0
1        1800.0
2          10.0
3      100000.0
4      100000.0
         ...   
850         NaN
851        18.0
852         NaN
853         NaN
854        19.0
Name: Salary (Raw Number), Length: 8046, dtype: float64

In [16]:
# Save the .csv file
total_job_posting_df.to_csv('../Data/Job_Posting_Data_With_Annual_Salary.csv', index=False)