## Data Scraping

In [40]:
import urllib
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt
import string as st
import seaborn as sns

In [41]:
def top_2000_companies():
    
    r = urllib.request.urlopen('https://h1bdata.info/topcompanies.php') 
    soup = BeautifulSoup(r)

    data2 = soup.find_all('tr')    
    labels = []
    #get headings
    for h in data2[0].find_all('th'):
        labels.append(h.get_text().strip().lower())   
    final = []
    #get data
    for data in data2[1:]:
        data_list = []
        for d in data.find_all('td'):
            d_str = d.get_text().replace(',','')
            
            if d_str.isnumeric():
                data_list.append(int(d_str))
            else:
                data_list.append(d_str)                      
        final.append(data_list)
    
    df = pd.DataFrame(final, columns = labels) 
    df =df.drop(columns='latest filings')
    df = df.rename(columns={"#": "index", "company name": "company","# of h-1b filings": "total_filings","average salary":"avg_salary"})
    df = df.set_index('index')
    return df

In [42]:
df_companies = top_2000_companies()
df_companies = df_companies.dropna(how='any')
df_companies.to_csv('data/top2kcompanynames.csv')
df_companies

Unnamed: 0_level_0,company,total_filings,avg_salary
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,INFOSYS LIMITED,117594.0,$83498
2,TATA CONSULTANCY SERVICES LIMITED,91518.0,$71575
3,COGNIZANT TECHNOLOGY SOLUTIONS US CORP,80977.0,$87624
4,DELOITTE CONSULTING LLP,58320.0,$93334
5,CAPGEMINI AMERICA INC,51256.0,$85797
...,...,...,...
1996,RISK MANAGEMENT SOLUTIONS INC,275.0,$116647
1997,INCANDESCENT TECHNOLOGIES INC,275.0,$85588
1998,DATAQUAD INC,275.0,$89346
1999,SUNRISE INFOTEK CORP,274.0,$79106


In [43]:
def load_data(company,year):
    company = company.replace(' ','+')
    #print('https://h1bdata.info/index.php?em='+company+'&job=&city=&year='+str(year))
    #print(company)
    r = urllib.request.urlopen('https://h1bdata.info/index.php?em='+company+'&job=&city=&year='+str(year)) 
    soup = BeautifulSoup(r)

    data2 = soup.find_all('tr')    
#     labels = []
#     for h in data2[0].find_all('th'):
#         labels.append(h.get_text().strip().lower())

    final = []
    for data in data2[1:]:
        data_list = []
        for d in data.find_all('td'):
            d_str = d.get_text().replace(',','')

            if d_str.isnumeric():
                data_list.append(int(d_str))
            else:
                data_list.append(d_str)                      
        final.append(data_list)
    
    df = pd.DataFrame(final)    
    return df

In [34]:
#data extraction - don't run everytime
step =100
for i in range(0,2000,step):
    frames = [load_data(company,year) for company in df_companies['company'].dropna().to_list()[i:i+step] for year in range(2012,2022) ]  
    result = pd.concat(frames)
    result.to_csv(f"data/{i}-{i+step}.csv")
    print("done "+str(i+step)+" companies")

done 500 companies


## Data Loading

In [44]:
import glob
all_files = glob.glob('/Users/sukantoguha/Downloads/h1banalysis/data/scraped_data_in_parts/*.csv')
result = pd.concat((pd.read_csv(f,index_col=0) for f in all_files))

In [46]:
result.columns = ['employer','job_title','base_salary','location','submit_date','start_date']
result['submit_date'] = pd.to_datetime(result['submit_date'])
result['start_date'] = pd.to_datetime(result['start_date'])
#result['state'] = result['location'].str.split().str[-1] 
#result['year'] = result['submit_date'].dt.year

In [53]:
result.dropna(how='any')
result

Unnamed: 0,employer,job_title,base_salary,location,submit_date,start_date
0,MCAFEE LLC,APPLICATION DEVELOPER,105500.0,SANTA CLARA CA,2017-11-30,2017-12-06
1,MCAFEE LLC,APPLICATION DEVELOPER,107806.0,PLANO TX,2017-08-09,2018-02-05
2,MCAFEE LLC,DATA ANALYST,78957.0,PLANO TX,2017-03-10,2017-09-07
3,MCAFEE LLC,DATA ANALYST,90776.0,SANTA CLARA CA,2017-03-09,2017-09-08
4,MCAFEE LLC,DATA SCIENTIST,123071.0,SANTA CLARA CA,2017-03-20,2017-09-09
...,...,...,...,...,...,...
40868,WIPRO LIMITED,VLSI LEAD-L1,110802.0,SUNNYVALE CA,2017-07-28,2017-07-28
40869,WIPRO LIMITED,VLSI LEAD-L1,110802.0,SUNNYVALE CA,2018-03-09,2018-08-01
40870,WIPRO LIMITED,VLSI LEAD-L1,110802.0,SUNNYVALE CA,2017-08-23,2017-08-23
40871,WIPRO LIMITED,VLSI LEAD-L1,110802.0,SUNNYVALE CA,2018-02-13,2018-02-13


In [58]:
result.groupby('employer').count().sort_values(by='location',ascending=False).head(10)

Unnamed: 0_level_0,job_title,base_salary,location,submit_date,start_date
employer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DELOITTE CONSULTING LLP,221693,221693,221693,221693,221693
INFOSYS LIMITED,127370,127370,127370,127370,127370
ERNST & YOUNG US LLP,92492,92540,92540,92540,92540
TATA CONSULTANCY SERVICES LIMITED,91453,91455,91455,91455,91455
COGNIZANT TECHNOLOGY SOLUTIONS US CORP,80789,80789,80789,80789,80789
WIPRO LIMITED,79089,79091,79091,79091,79091
LARSEN & TOUBRO INFOTECH LIMITED,69260,69260,69260,69260,69260
IBM INDIA PRIVATE LIMITED,67201,67207,67207,67207,67207
TECH MAHINDRA (AMERICAS)INC,52714,52716,52716,52716,52716
CAPGEMINI AMERICA INC,49776,49805,49805,49805,49805


In [57]:
result[result.employer == 'SALESFORCECOM INC'].base_salary.mean()

128723.99064940403

In [None]:
#result = pd.read_csv("data/1-10.csv")

In [None]:
#sns.scatterplot(data= result.groupby('year').count().reset_index(),x='year',y='employer')