In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests


## 1. level.fyi salary data

### Retrieve level.fyi salary data and save it as csv

In [2]:
# retrieve level.fyi salary data and save it as csv

# data = requests.get('https://www.levels.fyi/js/salaryData.json').json()

# df = pd.DataFrame(data)

# df.to_csv('../Data/level_fyi_salary.csv', index=False)

# df.head()

### Load level.fyi salary data and take a quick look

In [3]:
# load salary data
df = pd.read_csv('../Data/level_fyi_salary.csv')
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
0,6/7/2017 11:33:27,Oracle,L3,Product Manager,127.0,"Redwood City, CA",1.5,1.5,,107.0,20.0,10.0,,,7392,807.0,1
1,6/10/2017 17:11:29,eBay,SE 2,Software Engineer,100.0,"San Francisco, CA",5.0,3.0,,,,,,,7419,807.0,2
2,6/11/2017 14:53:57,Amazon,L7,Product Manager,310.0,"Seattle, WA",8.0,0.0,,155.0,,,,,11527,819.0,3
3,6/14/2017 21:22:25,Microsoft,64,Software Engineering Manager,200.0,"Redmond, WA",9.0,9.0,,169000.0,100000.0,30000.0,,,11521,819.0,5
4,6/16/2017 10:44:01,Amazon,L5,Software Engineer,173.0,"Vancouver, BC, Canada",11.0,1.0,,120000.0,0.0,53000.0,,,1320,0.0,6


In [4]:
df.shape

(62642, 17)

#### Work with missing data

In [5]:
df.isnull().sum()  # some columns have a lot of missing data

timestamp                      0
company                        5
level                        115
title                          0
totalyearlycompensation        0
location                       0
yearsofexperience              0
yearsatcompany                 0
tag                          854
basesalary                  2304
stockgrantvalue             2684
bonus                       3988
gender                     19540
otherdetails               22505
cityid                         0
dmaid                          2
rowNumber                      0
dtype: int64

In [6]:
df.isnull().mean() 

timestamp                  0.000000
company                    0.000080
level                      0.001836
title                      0.000000
totalyearlycompensation    0.000000
location                   0.000000
yearsofexperience          0.000000
yearsatcompany             0.000000
tag                        0.013633
basesalary                 0.036780
stockgrantvalue            0.042847
bonus                      0.063663
gender                     0.311931
otherdetails               0.359264
cityid                     0.000000
dmaid                      0.000032
rowNumber                  0.000000
dtype: float64

In [7]:
# explore how to deal with missing data

df1 = df.copy()
print(f'original data size: {df.shape}')

df1.replace({'':np.nan}, inplace=True)
print(f"data size after dropping all na's: {df1.dropna().shape}")
      
df2 = df.copy()
df2 = df2.drop(columns=['gender','otherdetails'])

df2.replace({'':np.nan}, inplace=True)
print(f"data size after dropping 2 columns and then dropping all na's: {df2.dropna().shape}")

original data size: (62642, 17)
data size after dropping all na's: (29980, 17)
data size after dropping 2 columns and then dropping all na's: (57910, 15)


In [8]:
# so the decision is to drop 2 columns (gender,otherdetails), and then drop na's
df.drop(columns=['gender','otherdetails'], inplace=True)
df.replace({'':np.nan}, inplace=True)
df.dropna(inplace=True)
df.shape

(57910, 15)

In [9]:
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,cityid,dmaid,rowNumber
725,6/3/2018 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,128.0,13.0,19.2,7472,807.0,791
726,6/3/2018 14:54:39,IBM,Staff Engineer,Software Engineer,120.0,"Cambridge, EN, United Kingdom",7.0,2.0,Distributed Systems (Back-End),115000.0,0.0,5000.0,4878,0.0,792
731,6/4/2018 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,114000.0,37500.0,11400.0,11527,819.0,799
732,6/5/2018 0:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),165.0,100000.0,32000.0,7351,807.0,800
733,6/5/2018 1:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),145000.0,81500.0,13500.0,7419,807.0,801


In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                62642 non-null  object 
 1   company                  62637 non-null  object 
 2   level                    62527 non-null  object 
 3   title                    62642 non-null  object 
 4   totalyearlycompensation  62642 non-null  float64
 5   location                 62642 non-null  object 
 6   yearsofexperience        62642 non-null  float64
 7   yearsatcompany           62642 non-null  float64
 8   tag                      61788 non-null  object 
 9   basesalary               60338 non-null  float64
 10  stockgrantvalue          59958 non-null  float64
 11  bonus                    58654 non-null  float64
 12  cityid                   62642 non-null  int64  
 13  dmaid                    62640 non-null  float64
 14  rowNumber             

In [11]:
df.duplicated().sum()

0

#### Explore on taking a subst of salary data

##### Take a subset of companies

In [12]:
'''
print(len(df['company'].unique()))  # 1871 companies in total

df['company'].value_counts(normalize=True).head(50)
'''

"\nprint(len(df['company'].unique()))  # 1871 companies in total\n\ndf['company'].value_counts(normalize=True).head(50)\n"

In [13]:
# df['company'].value_counts(normalize=True).tail(20)

In [14]:
'''
company_df = pd.DataFrame(df['company'].value_counts(normalize=True))
company_df.rename(columns={'company':'prop'}, inplace=True)
company_df['prop_cum'] = company_df['prop'].cumsum()
company_df
'''

"\ncompany_df = pd.DataFrame(df['company'].value_counts(normalize=True))\ncompany_df.rename(columns={'company':'prop'}, inplace=True)\ncompany_df['prop_cum'] = company_df['prop'].cumsum()\ncompany_df\n"

In [15]:
# company_df.to_csv('../Data/company_list.csv')

In [16]:
'''
print(company_df.loc[company_df['prop_cum'] <=.60, :].shape)
company_df.loc[company_df['prop_cum'] <=.60, :]  # the top 33 companies count for 60% of the salary records

df.shape[0]*0.60  # the top 32 companies count for 60% of the salary records, which is 34,746 rows

print(company_df.loc[company_df['prop_cum'] <=.80, :].shape)  # the top 164 companies count for 80% of the records, which is 46,328 rows
df.shape[0]*0.80
'''

"\nprint(company_df.loc[company_df['prop_cum'] <=.60, :].shape)\ncompany_df.loc[company_df['prop_cum'] <=.60, :]  # the top 33 companies count for 60% of the salary records\n\ndf.shape[0]*0.60  # the top 32 companies count for 60% of the salary records, which is 34,746 rows\n\nprint(company_df.loc[company_df['prop_cum'] <=.80, :].shape)  # the top 164 companies count for 80% of the records, which is 46,328 rows\ndf.shape[0]*0.80\n"

##### Take a subset of years

In [17]:
'''
# convert timestamp from string to datetime column
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['timestamp']

df['timestamp'].sort_values()  # 2018-06-03 13:58:20 to 2021-08-17 08:28:57

df['timestamp'].dt.year.value_counts().sort_index(ascending=False)

df['timestamp'].dt.year.value_counts(normalize=True).sort_index(ascending=False)
'''

"\n# convert timestamp from string to datetime column\ndf['timestamp'] = pd.to_datetime(df['timestamp'])\ndf['timestamp']\n\ndf['timestamp'].sort_values()  # 2018-06-03 13:58:20 to 2021-08-17 08:28:57\n\ndf['timestamp'].dt.year.value_counts().sort_index(ascending=False)\n\ndf['timestamp'].dt.year.value_counts(normalize=True).sort_index(ascending=False)\n"

In [21]:
# create year and month
df['timestamp'] = pd.to_datetime(df['timestamp'])

df['year'] = df['timestamp'].copy().dt.year
df['month'] = df['timestamp'].copy().dt.month
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,cityid,dmaid,rowNumber,year,month
725,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,128.0,13.0,19.2,7472,807.0,791,2018,6
726,2018-06-03 14:54:39,IBM,Staff Engineer,Software Engineer,120.0,"Cambridge, EN, United Kingdom",7.0,2.0,Distributed Systems (Back-End),115000.0,0.0,5000.0,4878,0.0,792,2018,6
731,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,114000.0,37500.0,11400.0,11527,819.0,799,2018,6
732,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),165.0,100000.0,32000.0,7351,807.0,800,2018,6
733,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),145000.0,81500.0,13500.0,7419,807.0,801,2018,6


In [22]:
pd.DataFrame(df['year'].value_counts()).sort_index(ascending=False)  # salary data are from 2018 to 2021
# pd.DataFrame(df[['year','month']].value_counts(normalize=True)).sort_index(ascending=False)  # salary data are from 2018 to 2021

Unnamed: 0,year
2021,23342
2020,23859
2019,8435
2018,2274


In [29]:
# create state
df['state'] = df['location'].apply(lambda x: x.split(',')[-1].strip())
df[['location','state']]

Unnamed: 0,location,state
725,"Sunnyvale, CA",CA
726,"Cambridge, EN, United Kingdom",United Kingdom
731,"Seattle, WA",WA
732,"Palo Alto, CA",CA
733,"San Francisco, CA",CA
...,...,...
62637,"Seattle, WA",WA
62638,"Durham, NC",NC
62639,"San Jose, CA",CA
62640,"New York, NY",NY


In [32]:
df['state'].value_counts().head(60)  # there are foreign countries in location

CA                      20846
WA                      10914
NY                       4420
India                    2768
TX                       2584
Canada                   1703
MA                       1612
United Kingdom           1138
VA                        878
IL                        849
OR                        600
DC                        564
Germany                   561
GA                        558
CO                        547
NC                        483
PA                        462
NJ                        449
Singapore                 369
AZ                        368
Taiwan                    302
Ireland                   295
FL                        284
Israel                    279
MN                        268
Australia                 240
Russia                    239
Netherlands               238
UT                        219
MI                        210
OH                        190
MO                        181
Switzerland               176
WI        

In [28]:
'Cambridge, EN, United Kingdom'.split(',')[-1].strip()

'United Kingdom'

In [33]:
df.to_csv('../Data/salary_noNA.csv', index=False)

## 2. Alpha Advantage: Company Overview
- Symbol
- Sector
- Industry
- Address

In [34]:
url = 'https://www.alphavantage.co/query?function=OVERVIEW&symbol=IBM&apikey=demo'
r = requests.get(url)
data = r.json()

print(data)

{'Symbol': 'IBM', 'AssetType': 'Common Stock', 'Name': 'International Business Machines Corporation', 'Description': 'International Business Machines Corporation (IBM) is an American multinational technology company headquartered in Armonk, New York, with operations in over 170 countries. The company began in 1911, founded in Endicott, New York, as the Computing-Tabulating-Recording Company (CTR) and was renamed International Business Machines in 1924. IBM is incorporated in New York. IBM produces and sells computer hardware, middleware and software, and provides hosting and consulting services in areas ranging from mainframe computers to nanotechnology. IBM is also a major research organization, holding the record for most annual U.S. patents generated by a business (as of 2020) for 28 consecutive years. Inventions by IBM include the automated teller machine (ATM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, th

In [35]:
# key = 'your_personal_key' # put your key here

def get_company_alpha_advantage(companies, function):
    mylist = []
    for company in companies:
        base = 'https://www.alphavantage.co/query?'
        result = requests.get(url = base,
                             params = {
                                 'function':function,
                                 'symbol':company,
                                 'apikey':key
                             })
        data = result.json()
        mylist.append(data)
    return mylist

In [36]:
companies = ['IBM','AMZN','AApl']
function = 'OVERVIEW'

data = get_company_alpha_advantage(companies, function)

In [37]:
overview = pd.DataFrame(data)
overview

Unnamed: 0,Symbol,AssetType,Name,Description,CIK,Exchange,Currency,Country,Sector,Industry,...,EVToRevenue,EVToEBITDA,Beta,52WeekHigh,52WeekLow,50DayMovingAverage,200DayMovingAverage,SharesOutstanding,DividendDate,ExDividendDate
0,IBM,Common Stock,International Business Machines Corporation,International Business Machines Corporation (I...,51143,NYSE,USD,USA,TECHNOLOGY,COMPUTER & OFFICE EQUIPMENT,...,2.178,12.51,1.104,142.43,106.73,122.51,131.54,896320000,2021-12-10,2021-11-09
1,AMZN,Common Stock,Amazon.com Inc,"Amazon.com, Inc. is an American multinational ...",1018724,NASDAQ,USD,USA,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,...,3.89,27.48,1.126,3773.08,2881.0,3458.85,3380.84,507148000,,
2,AAPL,Common Stock,Apple Inc,Apple Inc. is an American multinational techno...,320193,NASDAQ,USD,USA,TECHNOLOGY,ELECTRONIC COMPUTERS,...,8.22,24.41,1.203,182.13,115.67,160.29,143.13,16406400000,2021-11-11,2021-11-05


In [38]:
# https://stackoverflow.com/questions/19726029/how-can-i-make-pandas-dataframe-column-headers-all-lowercase

overview = overview[['Symbol','Sector','Industry','Address']]
overview.columns = map(str.lower, overview.columns)  
overview

Unnamed: 0,symbol,sector,industry,address
0,IBM,TECHNOLOGY,COMPUTER & OFFICE EQUIPMENT,"1 NEW ORCHARD ROAD, ARMONK, NY, US"
1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"
2,AAPL,TECHNOLOGY,ELECTRONIC COMPUTERS,"ONE INFINITE LOOP, CUPERTINO, CA, US"


In [39]:
overview.to_csv('../Data/overview.csv', index=False)

### 3. Alpha Advantage: Inflation (annual)

In [40]:
# key = 'your_personal_key' # put your key here

url = 'https://www.alphavantage.co/query?function=INFLATION&apikey='+key

r = requests.get(url)
data = r.json()

inflation = pd.DataFrame(data['data'])
inflation

Unnamed: 0,date,value
0,2020-01-01,1.23358439630637
1,2019-01-01,1.81221007526015
2,2018-01-01,2.44258329692818
3,2017-01-01,2.13011000365963
4,2016-01-01,1.26158320570537
...,...,...
56,1964-01-01,1.27891156462583
57,1963-01-01,1.2396694214876
58,1962-01-01,1.19877334820185
59,1961-01-01,1.07072414764723


In [41]:
inflation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    61 non-null     object
 1   value   61 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


In [42]:
inflation['date'] = pd.to_datetime(inflation['date'])

inflation['value'] = inflation['value'].astype(float)

inflation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    61 non-null     datetime64[ns]
 1   value   61 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 1.1 KB


In [43]:
inflation['year'] = inflation['date'].copy().dt.year

inflation['month'] = inflation['date'].copy().dt.month

inflation.head()

Unnamed: 0,date,value,year,month
0,2020-01-01,1.233584,2020,1
1,2019-01-01,1.81221,2019,1
2,2018-01-01,2.442583,2018,1
3,2017-01-01,2.13011,2017,1
4,2016-01-01,1.261583,2016,1


In [44]:
inflation.to_csv('../Data/inflation.csv', index=False)

In [35]:
test = pd.read_csv('../Data/inflation.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    61 non-null     object 
 1   value   61 non-null     float64
 2   year    61 non-null     int64  
 3   month   61 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 2.0+ KB


### 4. Alpha Advantage: Unemployment rate (monthly)

In [46]:
# key = 'your_personal_key' # put your key here

url = 'https://www.alphavantage.co/query?function=UNEMPLOYMENT&apikey='+key
r = requests.get(url)
data = r.json()

unemp = pd.DataFrame(data['data'])
unemp

Unnamed: 0,date,value
0,2021-11-01,4.2
1,2021-10-01,4.6
2,2021-09-01,4.8
3,2021-08-01,5.2
4,2021-07-01,5.4
...,...,...
882,1948-05-01,3.5
883,1948-04-01,3.9
884,1948-03-01,4.0
885,1948-02-01,3.8


In [47]:
unemp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    887 non-null    object
 1   value   887 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB


In [48]:
unemp['date'] = pd.to_datetime(unemp['date'])

unemp['value'] = unemp['value'].astype(float)

unemp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    887 non-null    datetime64[ns]
 1   value   887 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 14.0 KB


In [49]:
unemp['year'] = unemp['date'].copy().dt.year

In [50]:
unemp['month'] = unemp['date'].copy().dt.month

In [38]:
unemp.rename(columns={'value':'unemp_rate'}, inplace=True)

In [39]:
unemp.head()

Unnamed: 0,date,unemp_rate,year,month
0,2021-11-01,4.2,2021,11
1,2021-10-01,4.6,2021,10
2,2021-09-01,4.8,2021,9
3,2021-08-01,5.2,2021,8
4,2021-07-01,5.4,2021,7


In [40]:
unemp.to_csv('../Data/unemployment.csv', index=False)

### 5. Data World: Employment / unemployment counts monthly by state 1/1976-9/2020

Employment & Unemployment by State and Month - dataset by vizwiz

https://data.world/vizwiz/employment-unemployment-by-state-and-month/workspace/file?filename=BLS+Monthly+Unemployment+Rate.xlsx

In [None]:
employ = 

In [29]:
# Load mapping of state names to state abbreviation

import json

with open('https://gist.githubusercontent.com/AnnieW2014/d5c59b029307cce0e18cf7bf2dd0f93b/raw/6591f2de775a139fa6f31b4e3d991e12133be9d6/us_state_abbrev.py') as f:
    data = f.read()
    
print("Data type before reconstruction : ", type(data))
      




FileNotFoundError: [Errno 2] No such file or directory: 'https://gist.githubusercontent.com/AnnieW2014/d5c59b029307cce0e18cf7bf2dd0f93b/raw/6591f2de775a139fa6f31b4e3d991e12133be9d6/us_state_abbrev.py'

In [None]:
# reconstructing the data as a dictionary
js = json.loads(data)
  
print("Data type after reconstruction : ", type(js))
print(js)

In [None]:
state_mapping = pd.read_csv('https://gist.githubusercontent.com/AnnieW2014/d5c59b029307cce0e18cf7bf2dd0f93b/raw/6591f2de775a139fa6f31b4e3d991e12133be9d6/us_state_abbrev.py')
state_mapping

### 5. Merge all datasets together

#### Load salary data, create year and month from timestamp, and state from location 

In [25]:
# load salary data
salary = pd.read_csv('../Data/salary_noNA.csv')
print(salary.shape)

salary.head()

(57910, 15)


Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,cityid,dmaid,rowNumber
0,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,128.0,13.0,19.2,7472,807.0,791
1,2018-06-03 14:54:39,IBM,Staff Engineer,Software Engineer,120.0,"Cambridge, EN, United Kingdom",7.0,2.0,Distributed Systems (Back-End),115000.0,0.0,5000.0,4878,0.0,792
2,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,114000.0,37500.0,11400.0,11527,819.0,799
3,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),165.0,100000.0,32000.0,7351,807.0,800
4,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),145000.0,81500.0,13500.0,7419,807.0,801


In [26]:
salary['timestamp'] = pd.to_datetime(salary['timestamp'])
salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57910 entries, 0 to 57909
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                57910 non-null  datetime64[ns]
 1   company                  57910 non-null  object        
 2   level                    57910 non-null  object        
 3   title                    57910 non-null  object        
 4   totalyearlycompensation  57910 non-null  float64       
 5   location                 57910 non-null  object        
 6   yearsofexperience        57910 non-null  float64       
 7   yearsatcompany           57910 non-null  float64       
 8   tag                      57910 non-null  object        
 9   basesalary               57910 non-null  float64       
 10  stockgrantvalue          57910 non-null  float64       
 11  bonus                    57910 non-null  float64       
 12  cityid                   57910 n

In [27]:
'''
# create salary_year and salary_month
salary['year'] = salary['timestamp'].copy().dt.year
salary['month'] = salary['timestamp'].copy().dt.month
salary.head()
'''

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,cityid,dmaid,rowNumber,year,month
0,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,128.0,13.0,19.2,7472,807.0,791,2018,6
1,2018-06-03 14:54:39,IBM,Staff Engineer,Software Engineer,120.0,"Cambridge, EN, United Kingdom",7.0,2.0,Distributed Systems (Back-End),115000.0,0.0,5000.0,4878,0.0,792,2018,6
2,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,114000.0,37500.0,11400.0,11527,819.0,799,2018,6
3,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),165.0,100000.0,32000.0,7351,807.0,800,2018,6
4,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),145000.0,81500.0,13500.0,7419,807.0,801,2018,6


SyntaxError: invalid syntax (<ipython-input-33-1255f8bca4b9>, line 2)

In [60]:
# load inflation (annual)
inflation = pd.read_csv('Data/inflation.csv')
inflation.head()

Unnamed: 0,date,value,year,month
0,2020-01-01,1.233584,2020,1
1,2019-01-01,1.81221,2019,1
2,2018-01-01,2.442583,2018,1
3,2017-01-01,2.13011,2017,1
4,2016-01-01,1.261583,2016,1


In [61]:
# load unemployment rate (monthly)
unemp = pd.read_csv('Data/unemployment.csv')
unemp.head()

Unnamed: 0,date,value,year,month
0,2021-11-01,4.2,2021,11
1,2021-10-01,4.6,2021,10
2,2021-09-01,4.8,2021,9
3,2021-08-01,5.2,2021,8
4,2021-07-01,5.4,2021,7


In [62]:
# merge salary and company_list_AW
final = pd.merge(left=salary, right=company[['company','symbol']], on='company',
                how='inner')
final.loc[final['company']=='Amazon', :].head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,cityid,dmaid,rowNumber,salary_year,salary_month,symbol
19551,2018-06-08 06:40:35,Amazon,L5,Software Engineer,210.0,"Boston, MA",12.0,4.0,Distributed Systems (Back-End),143.0,180.0,30.0,8816,506.0,847,2018,6,AMZN
19552,2018-06-09 21:24:59,Amazon,L6,Software Engineering Manager,319.0,"Seattle, WA",6.0,1.0,DevOps,140000.0,380000.0,280000.0,11527,819.0,866,2018,6,AMZN
19553,2018-06-10 20:23:37,Amazon,L4,Software Engineer,135.0,"Seattle, WA",2.0,0.0,API Development (Back-End),105.0,5.0,25.0,11527,819.0,872,2018,6,AMZN
19554,2018-06-13 18:59:55,Amazon,L5,Software Engineer,202.0,"Seattle, WA",2.5,0.0,Distributed Systems (Back-End),140000.0,150000.0,110000.0,11527,819.0,904,2018,6,AMZN
19555,2018-06-15 22:48:07,Amazon,L5,Software Engineer,160.0,"Vancouver, BC, Canada",10.0,1.0,Full Stack,130.0,0.0,30.0,1320,0.0,929,2018,6,AMZN


In [63]:
# merge in company overview
final = pd.merge(left=final, right=overview, on='symbol',
                how='left')
final.loc[final['company']=='Amazon', :].head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,bonus,cityid,dmaid,rowNumber,salary_year,salary_month,symbol,sector,industry,address
19551,2018-06-08 06:40:35,Amazon,L5,Software Engineer,210.0,"Boston, MA",12.0,4.0,Distributed Systems (Back-End),143.0,...,30.0,8816,506.0,847,2018,6,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"
19552,2018-06-09 21:24:59,Amazon,L6,Software Engineering Manager,319.0,"Seattle, WA",6.0,1.0,DevOps,140000.0,...,280000.0,11527,819.0,866,2018,6,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"
19553,2018-06-10 20:23:37,Amazon,L4,Software Engineer,135.0,"Seattle, WA",2.0,0.0,API Development (Back-End),105.0,...,25.0,11527,819.0,872,2018,6,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"
19554,2018-06-13 18:59:55,Amazon,L5,Software Engineer,202.0,"Seattle, WA",2.5,0.0,Distributed Systems (Back-End),140000.0,...,110000.0,11527,819.0,904,2018,6,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"
19555,2018-06-15 22:48:07,Amazon,L5,Software Engineer,160.0,"Vancouver, BC, Canada",10.0,1.0,Full Stack,130.0,...,30.0,1320,0.0,929,2018,6,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US"


In [64]:
# merge in inflation - need to clean up!!!
test = pd.merge(left=final, right=inflation,
                left_on=['salary_year','salary_month'], right_on=['year','month'],
                how='left')
test.loc[(test['company']=='Amazon') & (test['year']==2020), :].head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,salary_year,salary_month,symbol,sector,industry,address,date,value,year,month
20634,2020-01-01 11:44:06,Amazon,L5,Software Engineer,244.0,"Seattle, WA",6.0,6.0,API Development (Back-End),129.0,...,2020,1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US",2020-01-01,1.233584,2020.0,1.0
20635,2020-01-01 17:30:42,Amazon,L7,Data Scientist,500.0,"Seattle, WA",9.0,5.0,Networking,180.0,...,2020,1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US",2020-01-01,1.233584,2020.0,1.0
20636,2020-01-01 18:03:18,Amazon,L7,Product Manager,320.0,"Seattle, WA",10.0,5.0,ML / AI,150.0,...,2020,1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US",2020-01-01,1.233584,2020.0,1.0
20637,2020-01-01 18:47:43,Amazon,Senior SDE,Software Engineer,271.0,"Seattle, WA",8.0,4.0,Full Stack,160.0,...,2020,1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US",2020-01-01,1.233584,2020.0,1.0
20638,2020-01-01 22:59:12,Amazon,L4,Software Engineer,144.0,"Seattle, WA",4.0,0.0,Distributed Systems (Back-End),110.0,...,2020,1,AMZN,TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,"410 TERRY AVENUE NORTH, SEATTLE, WA, US",2020-01-01,1.233584,2020.0,1.0
