In [1]:
import pandas as pd

In [2]:
# Show current data set
job_posting_df = pd.read_csv("../Data/Job_posting_data_combined_date_and_relevancy_made_in_Jan_7.csv")

print(job_posting_df)

                                              Job Title  \
0                            Route Sales Representative   
1                                 Ultrasound Technician   
2                      Optometric/Ophthalmic Technician   
3                                            Controller   
4                                       Account Manager   
...                                                 ...   
1900                        Technical Support Scientist   
1901                Delivery Driver - Pharmacy Services   
1902                      Environmental Services Worker   
1903  Reimbursement Specialist I - Reimbursement Ser...   
1904                       Testing Services Coordinator   

                            Company Name  \
0                Frito-Lay North America   
1                  Proximity Diagnostics   
2               Imperial Beach Optometry   
3                               Filtrous   
4                          Aya Corporate   
...                        

# Here, we remove the duplicate rows, (ignoring the "Date Posted" b/c the same job can be posted on multiple days)


In [3]:
#Find duplicate job postings
job_posting_df.loc[job_posting_df.duplicated(subset=['Job Title', 'Company Name', 'Location', 'Remote', 'Salary'])]

Unnamed: 0,Job Title,Company Name,Location,Remote,Salary,Full Time,Part Time,Date Posted
15,Forensic Toxicology Laboratory Manager,County of San Diego,"San Diego, CA \n(Core-Columbia area)",False,"$120,000 - $130,000 a year",True,False,1/7/2022
70,Diaper Bank Support Specialist - Temporary,Jewish Family Service of San Diego,"San Diego, CA 92123 \n(Kearny Mesa area)",False,$16 - $17 an hour,False,False,1/7/2022
71,Diaper Bank Support Specialist - Temporary,Jewish Family Service of San Diego,"San Diego, CA 92123 \n(Kearny Mesa area)",False,$16 - $17 an hour,False,False,1/7/2022
91,Traffic Control Technician,West Coast Traffic Control,"San Diego, CA\n+1 location",False,$16 - $30 an hour,True,False,1/7/2022
106,Data Center Solutions Architect,CyberCoders,"San Diego, CA 92123\n+2 locations",False,"$160,000 - $215,000 a year",True,False,1/7/2022
...,...,...,...,...,...,...,...,...
1865,Specimen Handler / Accessioner Contractor,Helix,"San Diego, CA",False,,False,False,1/7/2022
1866,Specimen Handler / Accessioner Contractor,Helix,"San Diego, CA 92113 \n(Logan Heights area)",False,,False,False,1/7/2022
1869,Sterile Processing Tech,Allcare Nursing Services,"San Diego, CA",False,,False,False,12/8/2021
1870,Program Management Specialist,"INDUS Technology, Inc.","San Diego, CA 92110 \n(Old Town area)",False,,True,False,1/7/2022


In [4]:
job_posting_df.drop_duplicates(subset=['Job Title', 'Company Name', 'Location', 'Remote', 'Salary', 'Full Time', 'Part Time'],inplace=True, keep='first')
print(str(len(job_posting_df)) + " unique job postings")

1575 unique job postings


In [5]:
job_salaries = job_posting_df['Salary']
print(job_salaries)

0                    $1,179 a week
1           $1,800 - $3,200 a week
2                $10 - $18 an hour
3       $100,000 - $150,000 a year
4       $100,000 - $175,000 a year
                   ...            
1900                           NaN
1901                           NaN
1902                           NaN
1903                           NaN
1904                           NaN
Name: Salary, Length: 1575, dtype: object


In [6]:
job_posting_df['Salary Type'] = job_posting_df['Salary'].str.split().str[-1]
display(job_posting_df['Salary Type'].unique())

array(['week', 'hour', 'year', 'month', 'day', nan], dtype=object)

In [7]:
# First, remove commas from Salary
job_posting_df['Salary'] = job_posting_df['Salary'].str.replace(',','')
print(job_posting_df)

                                              Job Title  \
0                            Route Sales Representative   
1                                 Ultrasound Technician   
2                      Optometric/Ophthalmic Technician   
3                                            Controller   
4                                       Account Manager   
...                                                 ...   
1900                        Technical Support Scientist   
1901                Delivery Driver - Pharmacy Services   
1902                      Environmental Services Worker   
1903  Reimbursement Specialist I - Reimbursement Ser...   
1904                       Testing Services Coordinator   

                            Company Name  \
0                Frito-Lay North America   
1                  Proximity Diagnostics   
2               Imperial Beach Optometry   
3                               Filtrous   
4                          Aya Corporate   
...                        

In [8]:
# Get salary number to compute yearly salary

job_posting_df['Salary (Raw Number)'] = job_posting_df['Salary'].str.extract('(\d+)')
print(job_posting_df)

                                              Job Title  \
0                            Route Sales Representative   
1                                 Ultrasound Technician   
2                      Optometric/Ophthalmic Technician   
3                                            Controller   
4                                       Account Manager   
...                                                 ...   
1900                        Technical Support Scientist   
1901                Delivery Driver - Pharmacy Services   
1902                      Environmental Services Worker   
1903  Reimbursement Specialist I - Reimbursement Ser...   
1904                       Testing Services Coordinator   

                            Company Name  \
0                Frito-Lay North America   
1                  Proximity Diagnostics   
2               Imperial Beach Optometry   
3                               Filtrous   
4                          Aya Corporate   
...                        

In [9]:
def compute_yearly_salary(x):
    yearly_salary = None
    if x['Salary Type'] == "hour":
        yearly_salary = 2087 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "day":
        yearly_salary = 261 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "week":
        yearly_salary = 52 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "month":
        yearly_salary = 12 * int(x['Salary (Raw Number)'])
    elif x['Salary Type'] == "year":
        yearly_salary = int(x['Salary (Raw Number)'])
    else:
        yearly_salary = None
    return yearly_salary
        

In [10]:
job_posting_df['Annual Salary'] = job_posting_df.apply(compute_yearly_salary, axis=1)

# TODO: Format the Annual Salary column to currency
#job_posting_df['Annual Salary'] = job_posting_df['Annual Salary'].format("$*:")
display(job_posting_df)
                                                      

Unnamed: 0,Job Title,Company Name,Location,Remote,Salary,Full Time,Part Time,Date Posted,Salary Type,Salary (Raw Number),Annual Salary
0,Route Sales Representative,Frito-Lay North America,"San Diego, CA 92123 \n(Kearny Mesa area)\n+1 l...",False,$1179 a week,True,False,1/7/2022,week,1179,61308.0
1,Ultrasound Technician,Proximity Diagnostics,"San Diego, CA",False,$1800 - $3200 a week,False,True,1/7/2022,week,1800,93600.0
2,Optometric/Ophthalmic Technician,Imperial Beach Optometry,"Imperial Beach, CA 91932\n+1 location",False,$10 - $18 an hour,True,False,1/7/2022,hour,10,20870.0
3,Controller,Filtrous,"Poway, CA 92064",False,$100000 - $150000 a year,True,False,1/7/2022,year,100000,100000.0
4,Account Manager,Aya Corporate,"San Diego, CA\n+18 locations",False,$100000 - $175000 a year,False,False,1/7/2022,year,100000,100000.0
...,...,...,...,...,...,...,...,...,...,...,...
1900,Technical Support Scientist,bioMérieux,"San Diego, CA",False,,True,False,1/4/2022,,,
1901,Delivery Driver - Pharmacy Services,Cardinal Health,"San Diego, CA 92108",False,,True,False,12/30/2021,,,
1902,Environmental Services Worker,Sharp Healthcare,"San Diego, CA",False,,False,False,12/29/2021,,,
1903,Reimbursement Specialist I - Reimbursement Ser...,Rady Children's Hospital-San Diego,"San Diego, CA\n+4 locations",False,,True,False,12/31/2021,,,


In [11]:
# Drop the unneccessary column
job_posting_df.pop('Salary (Raw Number)')

0         1179
1         1800
2           10
3       100000
4       100000
         ...  
1900       NaN
1901       NaN
1902       NaN
1903       NaN
1904       NaN
Name: Salary (Raw Number), Length: 1575, dtype: object

In [12]:
# Save the .csv file
job_posting_df.to_csv('../Data/Job_Posting_Data_With_Annual_Salary.csv', index=False)