In [51]:
import pandas as pd

In [52]:
# Read the raw salary CSV and discard every row containing a missing value
# in one chained expression, then preview the first rows.
salary_df = pd.read_csv('./data/Salary Dataset.csv').dropna()
salary_df.head()

Unnamed: 0,Company Name,Job Title,Salaries Reported,Location,Salary
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"


### Check data types
The `Salary` column is stored as text (`object` dtype), so it must be cleaned before it can be converted to float.

In [53]:
# Display the dtype of every column to see which ones still need converting.
salary_df.dtypes

Company Name          object
Job Title             object
Salaries Reported    float64
Location              object
Salary                object
dtype: object

### Best practice: replace white space in column names with underscores

In [54]:
# Build the new labels first (every space becomes an underscore), then
# install them on the frame.
underscored_columns = salary_df.columns.str.replace(' ', '_')
salary_df.columns = underscored_columns

In [55]:
# Preview the first rows to confirm the column labels now use underscores.
salary_df.head()

Unnamed: 0,Company_Name,Job_Title,Salaries_Reported,Location,Salary
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"


### Remove characters, check pay frequency, make float, convert to USD

In [56]:
# Fixed conversion factor: an INR amount multiplied by this gives USD.
# The value is hard-coded here, not fetched live.
exchange_rate = 0.013 # INR to USD rate

In [57]:
def to_numeric(x):
    """Extract the number embedded in the string ``x`` and return it as a float.

    Every character that is neither a decimal digit nor a ``.`` is discarded
    (currency symbols, letters, spaces, thousands separators), and the
    remaining characters are parsed as one number.

    Unlike a digits-only filter, the decimal point is preserved, so
    ``'1234.50'`` yields ``1234.5`` rather than ``123450.0``. Dots left over
    at either end (e.g. from a prefix such as ``'Rs. '``) are stripped before
    parsing.

    Parameters
    ----------
    x : str
        Text containing a single numeric amount.

    Returns
    -------
    float
        The parsed amount.

    Raises
    ------
    ValueError
        If ``x`` contains no digits, or the remaining characters do not form
        a valid number (for example more than one decimal point).
    """
    # isdecimal() (not isdigit()) keeps only characters float() can parse;
    # isdigit() also lets through characters such as superscript digits.
    kept = ''.join(ch for ch in x if ch.isdecimal() or ch == '.')
    # Stray dots at the ends come from surrounding text, not from the number.
    kept = kept.strip('.')
    try:
        return float(kept)
    except ValueError:
        raise ValueError(f'no numeric value found in {x!r}') from None

In [58]:
# Factor that scales one pay period up to a full year, keyed by the
# frequency suffix found after the '/' in the raw salary text.
periods_per_year = {
    'yr': 1,     # already annual, no conversion needed
    'mo': 12,    # assume working 12 months a year
    'hr': 2048,  # hours per year assumed for a M-F 9-5 job
                 # NOTE(review): 40 h x 52 wk is 2080 - confirm 2048 is intended
}


def _salary_to_usd_per_year(raw):
    """Turn one raw salary string into a yearly amount in USD.

    The first character is dropped as the currency symbol, commas are
    removed, and the remainder is split on '/' into the amount and the pay
    frequency. The amount is made numeric with ``to_numeric``, scaled to a
    year using ``periods_per_year`` and multiplied by ``exchange_rate``.

    A frequency that is not in ``periods_per_year`` is reported with a
    printed message and its amount is converted without any scaling.
    """
    sal, freq = raw[1:].replace(',', '').split('/')
    sal = to_numeric(sal)  # ensure that the salary value is numeric
    if freq in periods_per_year:
        sal = sal * periods_per_year[freq]
    else:
        # Check to see if any frequencies were missed
        print(f'freq {freq} not in condition statement')
    return sal * exchange_rate  # convert INR to USD


salaries = salary_df['Salary']
salaries_conv = [_salary_to_usd_per_year(raw) for raw in salaries]

salary_df['Salary_USD_Yr'] = salaries_conv

In [59]:
# Preview the first rows to check the new Salary_USD_Yr column.
salary_df.head()

Unnamed: 0,Company_Name,Job_Title,Salaries_Reported,Location,Salary,Salary_USD_Yr
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr",8431.449
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr",15495.35
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr",10879.362
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr",8704.514
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr",12273.43
