Importing pandas and setting the option to see all the rows

In [204]:
import numpy as np
from sklearn.impute import KNNImputer
import pandas as pd
pd.set_option('display.max_rows', None)

In [205]:
df = pd.read_csv('data_scientist_jobs.csv')
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,3.8,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD),-1
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,3.6,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,Lending,Finance,$100 to $500 million (USD),-1
2,Digital Pathology Scientist,-1,Digital Pathology Scientist *\nDescription*\nM...,3.5,Mosaic Laboratories\n3.5,"Lake Forest, CA",-1,1 to 50 Employees,-1,Company - Private,-1,-1,$1 to $5 million (USD),-1
3,Semantic Data Modeler and Engineer,-1,Semantic Data Modeler and Engineer\n\nTucson E...,2.8,"Tucson Embedded Systems, Inc.\n2.8",Alabama,-1,51 to 200 Employees,1997,Company - Private,Enterprise Software & Network Solutions,Information Technology,$10 to $25 million (USD),-1
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,4.3,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,Food & Beverage Manufacturing,Manufacturing,$1 to $2 billion (USD),-1


## Removing the rows that don't have a salary estimate

In [206]:
df = df[df['Salary Estimate'] != '-1']
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,3.8,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD),-1
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,3.6,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,Lending,Finance,$100 to $500 million (USD),-1
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,4.3,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,Food & Beverage Manufacturing,Manufacturing,$1 to $2 billion (USD),-1
6,Manufacturing Scientist,$41K-$91K\n(Glassdoor est.),Biomerica Corporation focusing on the research...,2.5,Biomerica\n2.5,"Irvine, CA",-1,1 to 50 Employees,-1,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$5 to $10 million (USD),-1
8,HCM Data Analyst,$55K-$94K\n(Glassdoor est.),Any qualified individual with a disability who...,3.2,Austal USA\n3.2,"Mobile, AL",-1,1001 to 5000 Employees,1988,Company - Public,Industrial Manufacturing,Manufacturing,$1 to $2 billion (USD),-1


## Removing the term "Glassdoor estimate" from the salary estimate

In [207]:
salary = df['Salary Estimate'].apply(lambda x: x.split('(')[0])
salary.head()

0      $54K-$92K\n
1    $101K-$163K\n
4      $31K-$72K\n
6      $41K-$91K\n
8      $55K-$94K\n
Name: Salary Estimate, dtype: object

## Removing the dollar signs and K

In [208]:
minus_kd = salary.apply(lambda x: x.replace('K', '').replace('$',''))

## Adding one hot encoding for hourly rates and employer provided salary. Also getting the salary ranges by removing all kinds of text

In [209]:
df['hourly'] = df['Salary Estimate'].apply(lambda x: 1 if 'per hour' in x.lower() else 0)
df['employer_provided'] = df['Salary Estimate'].apply(lambda x: 1 if 'employer provided salary:' in x.lower() else 0)
range_ = minus_kd.apply(lambda x: x.lower().replace('per hour', '').replace('employer provided salary:', ''))
print(range_)

0        54-92\n
1      101-163\n
4        31-72\n
6        41-91\n
8        55-94\n
9       77-123\n
10      46-105\n
11       55-70\n
12     117-134\n
13       34-62\n
14     155-247\n
15      68-110\n
17       70-95\n
18      73-124\n
19     141-225\n
20       43-72\n
21      67-117\n
22      86-142\n
25       42-79\n
26      99-110\n
27       48-79\n
28       39-69\n
29     111-176\n
30      75-125\n
31      80-134\n
33       58-95\n
35     125-198\n
37       12-92\n
38      99-182\n
39      63-104\n
40     111-176\n
42       34-64\n
43      81-134\n
45      93-149\n
46       53-91\n
47       48-98\n
48      92-151\n
49      68-135\n
51       38-75\n
53       33-59\n
54      91-150\n
55       55-98\n
56       57-97\n
57       29-53\n
59       24-51\n
60     145-163\n
61      81-169\n
62      73-124\n
63     111-143\n
65      20-28 \n
67      59-123\n
68       48-83\n
69       46-94\n
70       48-84\n
71      85-139\n
73      80-134\n
75      80-111\n
77       56-97\n
78     123-196

Adding min, max and average salary columns from the salary ranges extracted above

In [210]:
df['min_salary'] = range_.apply(lambda x: int(x.split('-')[0]))
df['max_salary'] = range_.apply(lambda x: int(x.split('-')[1]))
df['avg_salary'] = (df.min_salary + df.max_salary)/2
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,hourly,employer_provided,min_salary,max_salary,avg_salary
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,3.8,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD),-1,0,0,54,92,73.0
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,3.6,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,Lending,Finance,$100 to $500 million (USD),-1,0,0,101,163,132.0
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,4.3,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,Food & Beverage Manufacturing,Manufacturing,$1 to $2 billion (USD),-1,0,0,31,72,51.5
6,Manufacturing Scientist,$41K-$91K\n(Glassdoor est.),Biomerica Corporation focusing on the research...,2.5,Biomerica\n2.5,"Irvine, CA",-1,1 to 50 Employees,-1,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$5 to $10 million (USD),-1,0,0,41,91,66.0
8,HCM Data Analyst,$55K-$94K\n(Glassdoor est.),Any qualified individual with a disability who...,3.2,Austal USA\n3.2,"Mobile, AL",-1,1001 to 5000 Employees,1988,Company - Public,Industrial Manufacturing,Manufacturing,$1 to $2 billion (USD),-1,0,0,55,94,74.5


Adding a column just for company name while separating the name from the rating 

In [211]:
df['company_txt'] = df.apply(lambda x: x['Company Name'] if x['Rating'] <0 else x['Company Name'][:-4], axis = 1)
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,hourly,employer_provided,min_salary,max_salary,avg_salary,company_txt
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,3.8,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD),-1,0,0,54,92,73.0,Blessing Hospital
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,3.6,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,Lending,Finance,$100 to $500 million (USD),-1,0,0,101,163,132.0,Elevate Credit
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,4.3,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,Food & Beverage Manufacturing,Manufacturing,$1 to $2 billion (USD),-1,0,0,31,72,51.5,Chobani
6,Manufacturing Scientist,$41K-$91K\n(Glassdoor est.),Biomerica Corporation focusing on the research...,2.5,Biomerica\n2.5,"Irvine, CA",-1,1 to 50 Employees,-1,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$5 to $10 million (USD),-1,0,0,41,91,66.0,Biomerica
8,HCM Data Analyst,$55K-$94K\n(Glassdoor est.),Any qualified individual with a disability who...,3.2,Austal USA\n3.2,"Mobile, AL",-1,1001 to 5000 Employees,1988,Company - Public,Industrial Manufacturing,Manufacturing,$1 to $2 billion (USD),-1,0,0,55,94,74.5,Austal USA


Adding a seprate column just for the state of the location of the job

In [212]:
df['job_state'] = df['Location'].apply(lambda x: x.split(',')[1])
#print(df.job_state)

## Adding a column with the age of the company

In [213]:
df['age'] = df.Founded.apply(lambda x: x if x <1 else 2020 - x)
df.age.value_counts()

-1      49
 8      26
 24     25
 16     24
 62     24
 9      23
 15     22
 26     22
 7      20
 10     18
 21     18
 6      16
 13     16
 169    13
 11     13
 12     13
 14     13
 51     13
 37     13
 44     11
 20     11
 23     11
 22     11
 18     11
 107    11
 17     11
 27     10
 43      9
 60      8
 33      7
 31      7
 39      7
 41      7
 48      7
 81      7
 84      6
 25      6
 53      6
 28      6
 32      6
 19      6
 4       6
 3       6
 133     6
 102     6
 109     5
 47      5
 77      5
 74      5
 68      5
 120     5
 46      4
 29      4
 49      4
 55      4
 35      4
 52      4
 5       4
 106     4
 97      4
 69      4
 78      4
 75      4
 214     3
 36      3
 136     3
 94      3
 115     3
 114     3
 158     3
 121     3
 50      3
 54      3
 67      3
 119     3
 64      3
 58      3
 112     3
 126     2
 167     2
 118     2
 145     2
 30      2
 122     2
 45      2
 116     2
 73      2
 105     2
 1       2
 85      2
 82      2

## Extracting skills from the job descriptions and adding one hot encoding for each type of skill. 1st one is for python

In [214]:
df['python_'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)
df.python_.value_counts()

1    493
0    277
Name: python_, dtype: int64

OHE for R

In [215]:
df['R_'] = df['Job Description'].apply(lambda x: 1 if 'r studio' in x.lower() or 'r-studio' in x.lower() else 0)
df.R_.value_counts()

0    763
1      7
Name: R_, dtype: int64

OHE for spark

In [216]:
df['spark_'] = df['Job Description'].apply(lambda x: 1 if 'spark' in x.lower() else 0)
df.spark_.value_counts()

0    599
1    171
Name: spark_, dtype: int64

OHE for aws

In [217]:
df['aws_'] = df['Job Description'].apply(lambda x: 1 if 'aws' in x.lower() else 0)
df.aws_.value_counts()

0    576
1    194
Name: aws_, dtype: int64

OHE for excel

In [218]:
df['excel_'] = df['Job Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)
df.excel_.value_counts()

1    390
0    380
Name: excel_, dtype: int64

## OHE for the type of ownership. Private is coded as 1 while rest is 0

In [219]:
df['private'] = df['Type of ownership'].apply(lambda x: 1 if 'private' in x.lower() else 0)
df.private.value_counts()

0    392
1    378
Name: private, dtype: int64

## Filling in data for Rating by np.nan for knn-imputation

In [220]:
df['Rating'] = df['Rating'].apply(lambda x: np.nan if x==-1 else x)

## Doing the same for age

In [221]:
df['age'] = df['age'].apply(lambda x: np.nan if x==-1 else x)

## Converting the size column to numreic data by getting the average size of the companies

In [222]:
df.Size.value_counts()

10000+ Employees           183
1001 to 5000 Employees     174
201 to 500 Employees       115
51 to 200 Employees        113
501 to 1000 Employees       99
5001 to 10000 Employees     60
1 to 50 Employees           19
Unknown                      4
-1                           3
Name: Size, dtype: int64

In [223]:
df['Size'] = df['Size'].apply(lambda x: -1 if x == "Unknown" else x)
df['Size'] = df['Size'].apply(lambda x: -1 if x== "-1" else x)
df.Size.value_counts()

10000+ Employees           183
1001 to 5000 Employees     174
201 to 500 Employees       115
51 to 200 Employees        113
501 to 1000 Employees       99
5001 to 10000 Employees     60
1 to 50 Employees           19
-1                           7
Name: Size, dtype: int64

In [224]:
size = df['Size']
size_max = []
size_min = []

for s in size:
    if s == -1:
        size_max.append(s)
        size_min.append(s)
    else:
        s = str(s)
        if s == "10000+ Employees":
            size_max.append(10000)
            size_min.append(10000)
        else:
            #print(s)
            size_min.append(float(s.split(" ")[0]))
            size_max.append(float(s.split(" ")[2]))

avg_size = []
for i in range(len(size_min)):
    avg_size.append((size_min[i] + size_max[i])/2)
    
#print(avg_size)
df['avg_size'] = avg_size

In [225]:
df['avg_size'] = df['avg_size'].apply(lambda x: np.nan if x==-1 else x)

In [226]:
df['Revenue'] = df['Revenue'].apply(lambda x: -1 if x[:7] == "Unknown" else x)
#print((df['Size'][0].split('E')[0]).split(' ')[2])

## Doing the same thing with revenues

In [227]:
df['Revenue'] = df['Revenue'].apply(lambda x: -1 if x == "-1" else x)
df.Revenue.value_counts()

-1                                  197
$10+ billion (USD)                  140
$100 to $500 million (USD)          110
$1 to $2 billion (USD)               53
$2 to $5 billion (USD)               50
$25 to $50 million (USD)             42
$10 to $25 million (USD)             40
$50 to $100 million (USD)            37
$500 million to $1 billion (USD)     37
$5 to $10 billion (USD)              23
Less than $1 million (USD)           18
$5 to $10 million (USD)              15
$1 to $5 million (USD)                8
Name: Revenue, dtype: int64

In [230]:
revenue = df['Revenue']
revenue_min = []
revenue_max = []
#revenue_max = df['Revenue'].apply(lambda x: x if x ==(-1 or "$10+ billion (USD)") else x.split(" ")[2][1:])
for r in revenue:
    if r == -1:
        revenue_min.append(r)
        revenue_max.append(r)
    elif r == "$10+ billion (USD)":
        revenue_min.append(10000000000)
        revenue_max.append(10000000000)
    elif r == "$500 million to $1 billion (USD)":
        revenue_min.append(500000000)
        revenue_max.append(1000000000)
    elif r == "Less than $1 million (USD)":
        revenue_min.append(900000)
        revenue_max.append(900000)
    else:
        fact = r.split(" ")[3][0]
        if fact == "m":
            factor = 1000000
        else:
            factor = 1000000000
        revenue_min.append(float(r.split(" ")[0][1:])*factor)
        revenue_max.append(float(r.split(" ")[2][1:])*factor)

avg_revenue = []
for i in range(len(revenue_min)):
    avg_revenue.append((revenue_min[i] + revenue_max[i])/2)
    
#df['avg_revenue'] = avg_revenue
print(len(avg_revenue))

770


In [232]:
df['avg_revenue'] = avg_revenue
df['avg_revenue'] = df['avg_revenue'].apply(lambda x: np.nan if x==-1 else x)

In [233]:
imputer = KNNImputer(n_neighbors=2)
continous_df = df[['avg_revenue', 'avg_size', 'age', 'Rating']]
continous_df = pd.DataFrame(imputer.fit_transform(continous_df), columns = continous_df.columns)

# Modifying the data frame with all the imputed columns

In [234]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'age', 'python_', 'R_', 'spark_', 'aws_',
       'excel_', 'private', 'avg_size', 'avg_revenue'],
      dtype='object')

In [235]:
df.drop(['avg_revenue', 'avg_size', 'age', 'Rating'], axis = 1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,...,max_salary,avg_salary,company_txt,job_state,python_,R_,spark_,aws_,excel_,private
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,Health Care Services & Hospitals,...,92,73.0,Blessing Hospital,IL,1,0,0,1,1,0
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,Lending,...,163,132.0,Elevate Credit,TX,1,0,0,0,0,0
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,Food & Beverage Manufacturing,...,72,51.5,Chobani,ID,0,0,0,1,1,1
6,Manufacturing Scientist,$41K-$91K\n(Glassdoor est.),Biomerica Corporation focusing on the research...,Biomerica\n2.5,"Irvine, CA",-1,1 to 50 Employees,-1,Company - Public,Biotech & Pharmaceuticals,...,91,66.0,Biomerica,CA,0,0,0,0,1,0
8,HCM Data Analyst,$55K-$94K\n(Glassdoor est.),Any qualified individual with a disability who...,Austal USA\n3.2,"Mobile, AL",-1,1001 to 5000 Employees,1988,Company - Public,Industrial Manufacturing,...,94,74.5,Austal USA,AL,0,0,0,0,1,0
9,"Sr. Director, Office of Data Science",$77K-$123K\n(Glassdoor est.),Advance your career at Liberty Mutual Insuranc...,Liberty Mutual Insurance\n3.5,"Warrenville, IL",-1,10000+ Employees,1912,Company - Private,Insurance Carriers,...,123,100.0,Liberty Mutual Insurance,IL,0,0,0,0,1,1
10,Scientist,$46K-$105K\n(Glassdoor est.),Why Join Revenue Analytics?\nBecause you have ...,Revenue Analytics\n4.1,"Atlanta, GA",-1,51 to 200 Employees,2005,Company - Private,Enterprise Software & Network Solutions,...,105,75.5,Revenue Analytics,GA,1,0,0,0,0,1
11,Data Analyst,$55K-$70K\n(Glassdoor est.),PerBlue is looking for an aspiring Data Analys...,PerBlue\n4.3,"Madison, WI",-1,51 to 200 Employees,2010,Company - Private,Video Games,...,70,62.5,PerBlue,WI,1,0,0,0,0,1
12,Senior Genomic Data Scientist,$117K-$134K\n(Glassdoor est.),GeneDx is seeking a Senior Genomic Data Scient...,GeneDx\n2.8,"Gaithersburg, MD",-1,201 to 500 Employees,-1,Company - Public,Biotech & Pharmaceuticals,...,134,125.5,GeneDx,MD,1,0,0,1,0,0
13,Data Analyst,$34K-$62K\n(Glassdoor est.),Breckenridge Grand Vacations (BGV) is seeking ...,Breckenridge Grand Vacations\n4.3,"Breckenridge, CO",-1,501 to 1000 Employees,1984,Company - Private,"Hotels, Motels, & Resorts",...,62,48.0,Breckenridge Grand Vacations,CO,1,0,0,0,1,1


In [236]:
final_df = pd.concat([df, continous_df], axis=1, join='inner')

In [237]:
final_df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,spark_,aws_,excel_,private,avg_size,avg_revenue,avg_revenue.1,avg_size.1,age,Rating.1
0,DATA SCIENTIST,$54K-$92K\n(Glassdoor est.),JOB SUMMARY\n\nThe position will focus on buil...,3.8,Blessing Hospital\n3.8,"Quincy, IL",-1,1001 to 5000 Employees,1875,Nonprofit Organization,...,0,1,1,0,3000.5,300000000.0,300000000.0,3000.5,145.0,3.8
1,Sr. Data Scientist - Model Governance,$101K-$163K\n(Glassdoor est.),General Summary:\n\nThe position plays a criti...,3.6,Elevate Credit\n3.6,"Addison, TX",-1,501 to 1000 Employees,2014,Company - Public,...,0,0,0,0,750.5,300000000.0,300000000.0,750.5,6.0,3.6
4,R&D Scientist,$31K-$72K\n(Glassdoor est.),Our Research and Development department is on ...,4.3,Chobani\n4.3,"Twin Falls, ID",-1,1001 to 5000 Employees,2005,Company - Private,...,0,1,1,1,3000.5,1500000000.0,1500000000.0,3000.5,32.0,3.2
6,Manufacturing Scientist,$41K-$91K\n(Glassdoor est.),Biomerica Corporation focusing on the research...,2.5,Biomerica\n2.5,"Irvine, CA",-1,1 to 50 Employees,-1,Company - Public,...,0,0,1,0,25.5,7500000.0,37500000.0,125.5,15.0,4.1
8,HCM Data Analyst,$55K-$94K\n(Glassdoor est.),Any qualified individual with a disability who...,3.2,Austal USA\n3.2,"Mobile, AL",-1,1001 to 5000 Employees,1988,Company - Public,...,0,0,1,0,3000.5,1500000000.0,17500000.0,350.5,22.0,2.8


## Exporting the data frame as a csv for further exploratory data analysis

In [238]:
df.to_csv('salary_data_cleaned.csv',index = False)