# Data Cleaning

In [43]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
df = pd.read_csv('glassdoor_jobs_1.csv')

In [45]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet"
1,Business Analyst,$39K-$77K (Glassdoor est.),"Arteriors is a team of smart, imaginative (and...",3.9,Arteriors Home\n3.9,"Carrollton, TX","Carrollton, TX",51 to 200 employees,1987,Company - Private,Consumer Products Manufacturing,Manufacturing,Unknown / Non-Applicable,-1
2,Strategy & Analytics Analyst,$39K-$77K (Glassdoor est.),"Job Summary\nResponsible for researching, anal...",2.4,CSX\n2.4,"Jacksonville, FL","Jacksonville, FL",10000+ employees,1978,Company - Public,Transportation Management,Transportation & Logistics,$10+ billion (USD),"Union Pacific, BNSF Railway"
3,Business Systems Analyst,$39K-$77K (Glassdoor est.),***This position is eligible for 100% work fro...,3.2,Healthesystems\n3.2,United States,"Tampa, FL",201 to 500 employees,2002,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),"PMSI, myMatrixx"
4,IT Business Analyst,$39K-$77K (Glassdoor est.),IT Business Analyst\nCorporate Headquarters\n1...,3.1,Uline\n3.1,"Pleasant Prairie, WI","Pleasant Prairie, WI",5001 to 10000 employees,1980,Company - Private,Wholesale,Business Services,Unknown / Non-Applicable,"The Home Depot, Foxconn, Amazon"


In [46]:
#Remove Salary Estimate = -1
df =df[ df['Salary Estimate'] != '-1']

### To do
- Salary Parsing
- Company name text only
- Location
- Age of company
- job desc parsing

## 1. Parsing Salary

In [47]:
salary = df['Salary Estimate'].apply(lambda x: x.split('(')[0])

#remove K and $ sign
salary = salary.apply(lambda x: x.replace('K','').replace('$',''))
salary.unique()

array(['39-77 ', '70-85', '39-78 ', '32-64 ', '76-90 ', '37-38 ',
       '34-67 ', '41-80 ', '50-65 ', '48-93 ', '53-95 ', '51-94 ',
       '43-82 '], dtype=object)

In [48]:
df['min_salary'] = salary.apply(lambda x: int(x.split('-')[0]))
df['max_salary'] = salary.apply(lambda x: int(x.split('-')[1]))
df['avg_salary'] = (df['min_salary']+df['max_salary'])/2
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet",39,77,58.0


## 2. Company Name

In [49]:
df['company_txt'] = df.apply(lambda x: x['Company Name'] if x['Rating'] <0 else x['Company Name'][:-4],axis=1)
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet",39,77,58.0,Pushpay


## 3. Location (State)

In [50]:
df['job_state'] = df['Location'].apply(lambda x: x.split(','))
df['job_state'] = df['job_state'].apply(lambda x: x[1] if len(x)==2 else x[0])
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet",39,77,58.0,Pushpay,WA


## 4. Is it in the headquarter?

In [51]:
df['is_hq'] = df.apply(lambda x: 1 if x['Headquarters'] == x['Location'] else 0,axis=1)
df.head(1)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state,is_hq
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,Computer Hardware & Software,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet",39,77,58.0,Pushpay,WA,1


## 5. Age of company

In [52]:
df['Age'] = df['Founded'].apply(lambda x: x if x < 1 else 2020-x)
df.head(2)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,company_txt,job_state,is_hq,Age
0,Business Analyst,$39K-$77K (Glassdoor est.),Are you ready to be part of a mission that is ...,4.0,Pushpay\n4.0,"Redmond, WA","Redmond, WA",201 to 500 employees,2011,Company - Public,...,Information Technology,Unknown / Non-Applicable,"Subsplash, Ministry Brands, Smartsheet",39,77,58.0,Pushpay,WA,1,9
1,Business Analyst,$39K-$77K (Glassdoor est.),"Arteriors is a team of smart, imaginative (and...",3.9,Arteriors Home\n3.9,"Carrollton, TX","Carrollton, TX",51 to 200 employees,1987,Company - Private,...,Manufacturing,Unknown / Non-Applicable,-1,39,77,58.0,Arteriors Home,TX,1,33


## 6. Job Desc

In [57]:
#Excel
df['excel'] = df['Job Description'].apply(lambda x: 1 if 'excel' in x.lower() else 0)

#SQL
df['SQL'] = df['Job Description'].apply(lambda x: 1 if 'SQL' in x else 0)

#Python
df['python'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)

#Tableau
df['tableau'] = df['Job Description'].apply(lambda x: 1 if 'tableau' in x.lower() else 0)


In [58]:
df['excel'].value_counts()

1    269
0    131
Name: excel, dtype: int64

## Save the data

In [60]:
df.to_csv('salary_data_cleaned.csv',index=False)