In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Load the job-postings CSV that the following cells clean.
df = pd.read_csv(filepath_or_buffer='DSresults_May.csv')

In [6]:
#1.Remove duplicates
# Report the row count on both sides of the de-duplication so the effect is visible.
print('before removing duplicates: ', df.shape[0])
df = df.drop_duplicates(keep='first')
print('after removing duplicates: ', df.shape[0])

before removing duplicates:  270
after removing duplicates:  224


In [7]:
# 2.Only include data scientist title  

# Keep only rows whose JobTitle matches the pattern below.
# NOTE: str.contains treats the pattern as a regular expression, so each '.' in
# 'Sr.' / 'Jr.' matches any character and the bare '-' keeps every hyphenated title.
print('before removing items in JobTitle: ', len(df))
# na=False: a missing JobTitle counts as "no match". Without it the mask holds NaN
# for such rows and boolean indexing refuses a mask that contains NaN.
df = df[df['JobTitle'].str.contains('Data|Scientist|Sr.|Jr.|-', na=False)]
print('after removing items in JobTitle: ', len(df))

before removing items in JobTitle:  224
after removing items in JobTitle:  218


In [8]:
# 3. Get levels from job title, job summary. 
#    Entry level: 0-3 years, Mid level: 3-5 years, Senior level: above 5 years
def get_level(x):
    """Classify a posting's seniority from its title, falling back to its summary.

    Parameters
    ----------
    x : mapping-like row
        Must provide 'JobTitle' and 'Summary'.

    Returns
    -------
    str
        'Senior', 'Junior', 'Entry Level' or 'Mid Level'.

    The title wins when it names a level explicitly. Otherwise every
    "<number> years" / "<number>+ years" mention in the summary is summed and
    bucketed: <= 3 -> 'Entry Level', <= 5 -> 'Mid Level', above -> 'Senior'.
    A summary with no such mention (or a missing summary) is 'Entry Level'.
    """
    title = x['JobTitle']
    if 'Senior' in title or 'Sr' in title:
        return 'Senior'
    if 'Junior' in title or 'Jr' in title:
        return 'Junior'
    if 'Entry Level' in title:
        return 'Entry Level'

    summary = x['Summary']
    if not isinstance(summary, str):
        # Missing summary (e.g. NaN): no experience requirement can be read from it.
        return 'Entry Level'

    # \d+ (not \d) so multi-digit requirements such as "10+ years" are read as 10
    # rather than as their last digit.
    nums = re.findall(r'(\d+)\+?\syears', summary)
    if not nums:
        return 'Entry Level'

    total_years = sum(int(n) for n in nums)
    if total_years <= 3:
        return 'Entry Level'
    if total_years <= 5:
        return 'Mid Level'
    return 'Senior'
        
# Label every posting with its seniority level, one row at a time.
df['Level'] = df.apply(lambda posting: get_level(posting), axis=1)

In [9]:
# 4.Find out remote availability for each job

def get_availability(x):
    """Return 'Yes' when the row's Location mentions 'Remote' or 'remote', else 'No'."""
    place = x['Location']
    mentions_remote = any(tag in place for tag in ('Remote', 'remote'))
    return 'Yes' if mentions_remote else 'No'
    
# Flag each posting as remote-friendly ('Yes') or not ('No').
df['Remote Availability'] = df.apply(lambda posting: get_availability(posting), axis=1)

In [10]:
# 5. Get state name from jobs. If there's no state name, it is a remote job. 

def get_location(x):
    """Return the two-letter state code found in a posting's Location.

    Parameters
    ----------
    x : mapping-like row
        Must provide 'Location'.

    Returns
    -------
    str
        The first standalone two-capital-letter token in the location
        (e.g. 'TX'), 'CA' / 'AZ' when the state is spelled out as
        'California' / 'Arizona', otherwise 'Remote'.
    """
    location = x['Location']
    if not isinstance(location, str):
        # Missing location (e.g. NaN): nothing to parse, so use the no-state label.
        return 'Remote'

    # Word boundaries make sure only a standalone two-letter token counts, so the
    # leading letters of longer all-caps words ("DFW", "USA") are not taken as a state.
    state = re.search(r'\b[A-Z]{2}\b', location)
    if state:
        return state.group(0)
    if 'California' in location:
        return 'CA'
    if 'Arizona' in location:
        return 'AZ'
    return 'Remote'

# Attach the state code (or 'Remote') to every posting, then preview the frame.
df['location'] = df.apply(lambda posting: get_location(posting), axis=1)
df.head(n=5)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Salary,Summary,Level,Remote Availability,location
0,Data Scientist I,iQor,Remote,EmployerActive 2 days ago,5/30/2022,"$70,000 a year",Present information using data visualization t...,Entry Level,Yes,Remote
1,Data Scientist 2,The Coca-Cola Company,"Plano, TX 75024+1 location",Posted2 days ago,5/30/2022,Not Available,You will play a critical role in the growing D...,Entry Level,No,TX
2,Junior Data Scientist,Talentheed Inc,Remote,EmployerActive 3 days ago,5/30/2022,"$57,145 - $119,045 a year",Present information using data visualization t...,Junior,Yes,Remote
3,Data Scientist - (Remote),Roche,"Remote in Indianapolis, IN+1 location",Posted18 days ago,5/30/2022,"$90,000 - $135,000 a year","As a Data Scientist, you will position data sc...",Entry Level,Yes,IN
4,Data Scientist,PROLIM Corporation,"Redmond, WA 98052 (Rose Hill area)",EmployerActive 2 days ago,5/30/2022,From $90 an hour,Minimum of 3 years of experience using a varie...,Entry Level,No,WA


In [11]:
# 6. Get post date: ExtractDate – PostDays = Post date
# For example: 5/31/2022 is extract date, the job has been posted for 30 days, the post date will be 5/1/2022
import re
def function(x):
    """Convert a PostDate label into the number of days since posting.

    The three labels listed below count as 0 days. Any other label is
    reduced to its digits and returned as an int.
    """
    same_day_labels = ('PostedToday', 'PostedJust posted', 'Hiring ongoing')
    if x in same_day_labels:
        return 0
    digits_only = re.sub("\D","",x)
    return int(digits_only)
# First pass: Pose_date temporarily holds the posting age in days.
df['Pose_date'] = df['PostDate'].apply(function)
def func1(x, y):
    """Return the date `x` days before `y`, where `y` is a '%m/%d/%Y' date string."""
    extracted_on = pd.to_datetime(y, format='%m/%d/%Y')
    posting_age = pd.Timedelta(days=x)
    return extracted_on - posting_age
# Second pass: replace the posting age with the actual date (ExtractDate minus age).
df['Pose_date'] = df.apply(
    lambda row: func1(row['Pose_date'], row['ExtractDate']),
    axis=1,
)
import datetime
# Store the posting date as an MM/DD/YYYY string.
df['Pose_date'] = df['Pose_date'].apply(
    lambda stamp: datetime.datetime.strftime(stamp, "%m/%d/%Y")
)
df.head(5)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Salary,Summary,Level,Remote Availability,location,Pose_date
0,Data Scientist I,iQor,Remote,EmployerActive 2 days ago,5/30/2022,"$70,000 a year",Present information using data visualization t...,Entry Level,Yes,Remote,05/28/2022
1,Data Scientist 2,The Coca-Cola Company,"Plano, TX 75024+1 location",Posted2 days ago,5/30/2022,Not Available,You will play a critical role in the growing D...,Entry Level,No,TX,05/28/2022
2,Junior Data Scientist,Talentheed Inc,Remote,EmployerActive 3 days ago,5/30/2022,"$57,145 - $119,045 a year",Present information using data visualization t...,Junior,Yes,Remote,05/27/2022
3,Data Scientist - (Remote),Roche,"Remote in Indianapolis, IN+1 location",Posted18 days ago,5/30/2022,"$90,000 - $135,000 a year","As a Data Scientist, you will position data sc...",Entry Level,Yes,IN,05/12/2022
4,Data Scientist,PROLIM Corporation,"Redmond, WA 98052 (Rose Hill area)",EmployerActive 2 days ago,5/30/2022,From $90 an hour,Minimum of 3 years of experience using a varie...,Entry Level,No,WA,05/28/2022


In [12]:
# 7. Create a new column that classifies each salary as hourly, annual, or monthly pay.
def func2(x):
    """Classify a salary string by its pay period.

    Parameters
    ----------
    x : str
        Salary text such as '$70,000 a year' or 'From $90 an hour'.

    Returns
    -------
    str or None
        'Hourly Pay', 'Annualy Pay' or 'Monthly Pay'; None when no pay period
        is mentioned or the value is not a string.
    """
    if not isinstance(x, str):
        # Missing salary (e.g. NaN) carries no pay period.
        return None
    text = x.lower()  # match "Hour" / "Year" regardless of capitalisation
    if 'hour' in text:
        return 'Hourly Pay'
    # 'annual' covers "annual" and "annually"; the previous 'annualy' test could
    # never match the correct spelling. The returned label keeps its original
    # spelling because it is stored as data.
    if 'annual' in text or 'year' in text:
        return 'Annualy Pay'
    if 'month' in text:
        return 'Monthly Pay'
    return None
# Tag each salary with its pay period, then show how many rows fall in each group
# (rows without a recognised pay period are counted too).
df['TypeOfSalary'] = df['Salary'].apply(lambda salary: func2(salary))
df['TypeOfSalary'].value_counts(dropna=False)

NaN            124
Annualy Pay     70
Hourly Pay      21
Monthly Pay      3
Name: TypeOfSalary, dtype: int64

In [13]:
# 7.b. Extract the salary range from the string and create three columns: minimum pay, maximum pay, median pay.

def func3(x, y):
    """Turn a salary string into annualised (minimum, maximum, median) pay.

    Parameters
    ----------
    x : str
        Salary text, e.g. '$57,145 - $119,045 a year' or 'From $90 an hour'.
    y : str or None
        Pay period label: 'Hourly Pay' (multiplied by 40*52), 'Monthly Pay'
        (multiplied by 12); anything else is taken as already annual.

    Returns
    -------
    tuple
        (minimum, maximum, median). Entries that cannot be derived are the
        string 'Not Available':
        * 'Not Available...' input -> the input text in all three slots
        * 'A - B' range            -> (A, B, midpoint)
        * 'up to A' (any case)     -> only the maximum is filled
        * a single amount          -> only the median is filled
        * no amount at all         -> all three 'Not Available'
    """
    missing = 'Not Available'
    if not isinstance(x, str):
        # Missing salary (e.g. NaN): nothing to parse.
        return missing, missing, missing

    if y == 'Hourly Pay':
        index = 40 * 52  # hours per week * weeks per year
    elif y == 'Monthly Pay':
        index = 12
    else:
        index = 1

    x = x.replace(',', '')
    if 'Not Ava' in x:
        return x, x, x

    def annualize(amount):
        # Scale to a yearly figure; stay an int whenever the result is whole.
        value = round(float(amount) * index, 2)
        return int(value) if value.is_integer() else value

    # Read amounts with their decimal part. Stripping every non-digit would turn
    # "$35.50" into 3550 and inflate the pay a hundredfold.
    amounts = re.findall(r'\d+(?:\.\d+)?', x)
    if not amounts:
        return missing, missing, missing

    if '-' in x and len(amounts) >= 2:
        low, high = amounts[0], amounts[1]
        midpoint = index * (float(low) + float(high)) / 2
        return annualize(low), annualize(high), midpoint
    if 'up to' in x.lower():
        # Case-insensitive so "Up to $120,000 a year" is recognised as a ceiling.
        return missing, annualize(amounts[0]), missing
    return missing, missing, annualize(amounts[0])
# Split every salary into its annualised minimum / maximum / median and preview the frame.
df[['Minimum_Pay','Maximum_Pay', 'Median_Pay']] = df.apply(
    lambda row: func3(row['Salary'], row['TypeOfSalary']),
    axis=1,
    result_type="expand",
)
df.head(5)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Salary,Summary,Level,Remote Availability,location,Pose_date,TypeOfSalary,Minimum_Pay,Maximum_Pay,Median_Pay
0,Data Scientist I,iQor,Remote,EmployerActive 2 days ago,5/30/2022,"$70,000 a year",Present information using data visualization t...,Entry Level,Yes,Remote,05/28/2022,Annualy Pay,Not Available,Not Available,70000
1,Data Scientist 2,The Coca-Cola Company,"Plano, TX 75024+1 location",Posted2 days ago,5/30/2022,Not Available,You will play a critical role in the growing D...,Entry Level,No,TX,05/28/2022,,Not Available,Not Available,Not Available
2,Junior Data Scientist,Talentheed Inc,Remote,EmployerActive 3 days ago,5/30/2022,"$57,145 - $119,045 a year",Present information using data visualization t...,Junior,Yes,Remote,05/27/2022,Annualy Pay,57145,119045,88095
3,Data Scientist - (Remote),Roche,"Remote in Indianapolis, IN+1 location",Posted18 days ago,5/30/2022,"$90,000 - $135,000 a year","As a Data Scientist, you will position data sc...",Entry Level,Yes,IN,05/12/2022,Annualy Pay,90000,135000,112500
4,Data Scientist,PROLIM Corporation,"Redmond, WA 98052 (Rose Hill area)",EmployerActive 2 days ago,5/30/2022,From $90 an hour,Minimum of 3 years of experience using a varie...,Entry Level,No,WA,05/28/2022,Hourly Pay,Not Available,Not Available,187200


In [14]:
# 8. Clean the job summary: strip punctuation and other special characters from each word.

def func5(x):
    """Strip unwanted characters from each space-separated word of `x`.

    Only ASCII digits, ASCII letters and characters in the \\u4e00-\\u9fa5 range
    survive inside a word; the words are then re-joined with single spaces.
    """
    unwanted = u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])"
    return ' '.join(re.sub(unwanted, "", word) for word in x.split(' '))
# Overwrite Summary with its cleaned text and preview the frame.
df['Summary'] = df['Summary'].apply(lambda text: func5(text))
df.head(n=5)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Salary,Summary,Level,Remote Availability,location,Pose_date,TypeOfSalary,Minimum_Pay,Maximum_Pay,Median_Pay
0,Data Scientist I,iQor,Remote,EmployerActive 2 days ago,5/30/2022,"$70,000 a year",Present information using data visualization t...,Entry Level,Yes,Remote,05/28/2022,Annualy Pay,Not Available,Not Available,70000
1,Data Scientist 2,The Coca-Cola Company,"Plano, TX 75024+1 location",Posted2 days ago,5/30/2022,Not Available,You will play a critical role in the growing D...,Entry Level,No,TX,05/28/2022,,Not Available,Not Available,Not Available
2,Junior Data Scientist,Talentheed Inc,Remote,EmployerActive 3 days ago,5/30/2022,"$57,145 - $119,045 a year",Present information using data visualization t...,Junior,Yes,Remote,05/27/2022,Annualy Pay,57145,119045,88095
3,Data Scientist - (Remote),Roche,"Remote in Indianapolis, IN+1 location",Posted18 days ago,5/30/2022,"$90,000 - $135,000 a year",As a Data Scientist you will position data sci...,Entry Level,Yes,IN,05/12/2022,Annualy Pay,90000,135000,112500
4,Data Scientist,PROLIM Corporation,"Redmond, WA 98052 (Rose Hill area)",EmployerActive 2 days ago,5/30/2022,From $90 an hour,Minimum of 3 years of experience using a varie...,Entry Level,No,WA,05/28/2022,Hourly Pay,Not Available,Not Available,187200


In [15]:
# Write the cleaned frame to disk, leaving out the DataFrame index.
df.to_csv(path_or_buf='DS_May_Cleaned.csv', index=False)