# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('glassdoor_jobs.csv')

In [3]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx"
1,Business Systems Analyst,$39K-$77K (Glassdoor est.),Client is seeking Business Systems analysts to...,4.5,Peterson Technology Partners\n4.5,"Northbrook, IL","Park Ridge, IL",201 to 500 employees,1997,Contract,Computer Hardware & Software,Information Technology,$25 to $50 million (USD),-1
2,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet"
3,Strategy & Analytics Analyst,$39K-$77K (Glassdoor est.),"Job Summary\nResponsible for researching, anal...",2.4,CSX\n2.4,"Jacksonville, FL","Jacksonville, FL",10000+ employees,1978,Company - Public,Transportation Management,Transportation & Logistics,$10+ billion (USD),"Union Pacific, BNSF Railway"
4,Business Systems Analyst,$39K-$77K (Glassdoor est.),MBC is a fast-growing firm that is seeking Bus...,3.7,Morgan Borszcz Consulting\n3.7,"Arlington, VA","Arlington, VA",51 to 200 employees,2003,Company - Private,Consulting,Business Services,$10 to $25 million (USD),-1


In [4]:
#Remove Salary Estimate = -1
df =df[ df['Salary Estimate'] != '-1']

### To do
- Salary Parsing
- Company name text only
- Location
- Age of company
- job desc parsing

## 1. Parsing Salary

In [5]:
salary = df['Salary Estimate'].apply(lambda x: x.split('(')[0])

#remove K and $ sign
salary = salary.apply(lambda x: x.replace('K','').replace('$',''))
salary.unique()

array(['39-77 ', '70-85', '76-90 ', '38-72 ', '32-64 ', '41-80 ',
       '34-67 ', '49-91 ', '46-87 ', '59-78 ', '51-94 ', '55-74 ',
       '62-77 ', '53-95 ', '61-63 ', '47-88 ', '61-99 ', '74-85 ',
       '48-89 ', '64-122 ', '42-68 ', '53-106 ', '46-84 ', '80-102 ',
       '50-78 ', '72-128 ', '67-68 ', '69-114 ', '61-103 '], dtype=object)

In [6]:
df['min_salary'] = salary.apply(lambda x: int(x.split('-')[0]))
df['max_salary'] = salary.apply(lambda x: int(x.split('-')[1]))
df['avg_salary'] = (df['min_salary']+df['max_salary'])/2
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx",39,77,58.0


## 2. Company Name

In [8]:
df['company_txt'] = df.apply(lambda x: x['Company Name'] if x['Rating'] <0 else x['Company Name'][:-4],axis=1)
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx",39,77,58.0,Healthesystems


## 3. Location (State)

In [9]:
df['job_state'] = df['Location'].apply(lambda x: x.split(','))
df['job_state'] = df['job_state'].apply(lambda x: x[1] if len(x)==2 else x[0])
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx",39,77,58.0,Healthesystems,United States


## 4. Is it in the headquarter?

In [10]:
df['is_hq'] = df.apply(lambda x: 1 if x['Headquarters'] == x['Location'] else 0,axis=1)
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state,is_hq
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx",39,77,58.0,Healthesystems,United States,0


## 5. Age of company

In [11]:
df['Age'] = df['Founded'].apply(lambda x: x if x < 1 else 2020-x)
df.head(2)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state,is_hq,Age
0,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,...,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx",39,77,58.0,Healthesystems,United States,0,18
1,Business Systems Analyst,$39K-$77K (Glassdoor est.),Client is seeking Business Systems analysts to...,4.5,Peterson Technology Partners\n4.5,"Northbrook, IL","Park Ridge, IL",201 to 500 employees,1997,Contract,...,Information Technology,$25 to $50 million (USD),-1,39,77,58.0,Peterson Technology Partners,IL,0,23


## 6. Job Desc

In [12]:
#Excel
df['excel'] = df['Job Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)

#SQL
df['SQL'] = df['Job Description'].apply(lambda x: 1 if 'SQL' in x else 0)

#Python
df['python'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)

#Tableau
df['tableau'] = df['Job Description'].apply(lambda x: 1 if 'tableau' in x.lower() else 0)


In [13]:
df['excel'].value_counts()

1    676
0    324
Name: excel, dtype: int64

## Save the data

In [14]:
df.to_csv('salary_data_cleaned.csv',index=False)