### Import Libraries

In [None]:
# Import libraries, switch plotly's offline mode on for the notebook, and set
# pandas display limits: 40 characters per column, 100 columns, 100 rows.
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly
import getpass 
import pandas as pd
import os

# The plotly API key is prompted for with getpass instead of being written in the cell.
plotly.tools.set_credentials_file(username='jonamjar', api_key=getpass.getpass())
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

### Get Data

In [None]:
# Import the data and display the size of the CSV file on disk
h1b_csv_path = "../input/H-1B_Disclosure_Data_FY17.csv"

data_h1 = pd.read_csv(h1b_csv_path, index_col=0, low_memory=False)
csv_size_mb = os.path.getsize(h1b_csv_path) / 1000000
print("%s MB" % csv_size_mb)

### Data Exploration

In [None]:
# Display the number of columns followed by every column name
feature_names = data_h1.columns

print("Number of features: %d\n" % len(feature_names))
print("##### All features ######")
for feature_name in feature_names:
    print(feature_name)

In [None]:
# Share (in percent) of each visa class present in the data set
visa_class_share = data_h1["VISA_CLASS"].value_counts(normalize=True)
visa_class_share * 100

In [None]:
# Keep only the H-1B rows, then show the ten states with the largest share (in percent)
is_h1b = data_h1["VISA_CLASS"] == "H-1B"
data_h1 = data_h1[is_h1b]

state_share = data_h1["EMPLOYER_STATE"].value_counts(normalize=True).to_frame()
state_share.head(10) * 100

In [None]:
# Most popular job title in each state, with its share of that state's applications

# Number of rows per (state, job title) pair.
most_popular_job_title = (
    data_h1.groupby(["EMPLOYER_STATE", "JOB_TITLE"])
    .size()
    .reset_index(name="COUNT")
)

# Row label of the largest COUNT within each state. idxmax returns the first
# occurrence of the maximum, so ties are resolved the same way on every run.
top_title_rows = most_popular_job_title.groupby("EMPLOYER_STATE")["COUNT"].idxmax()
res = most_popular_job_title.loc[top_title_rows].reset_index(drop=True)

# Total rows per state. The columns are named explicitly because the labels that
# value_counts().reset_index() produces differ between pandas versions.
tot = (
    data_h1["EMPLOYER_STATE"]
    .value_counts()
    .rename_axis("EMPLOYER_STATE")
    .reset_index(name="TOTAL")
)

res = pd.merge(res, tot, on="EMPLOYER_STATE", how="left")
res["PERCENTAGE"] = res["COUNT"] * 100 / res["TOTAL"]

In [None]:
# Preview the first five rows of the per-state result
res.head(n=5)

In [None]:
# Companies in a particular state Sponsoring

def H1Bcompanies_in_state(state_two_letter, top=None, data=None):
    """Count rows per employer within one state.

    Parameters
    ----------
    state_two_letter : str
        Value compared for equality against the ``EMPLOYER_STATE`` column.
    top : int, optional
        Number of employers to return. ``None`` (default) returns all of them.
    data : pandas.DataFrame, optional
        Frame with ``EMPLOYER_STATE`` and ``EMPLOYER_NAME`` columns. When omitted,
        the notebook-level ``data_h1`` is used as it exists at call time.

    Returns
    -------
    pandas.Series
        Row counts indexed by employer name, in ``value_counts`` order
        (largest count first).
    """
    if data is None:
        data = data_h1
    in_state = data["EMPLOYER_STATE"] == state_two_letter
    employer_counts = data.loc[in_state, "EMPLOYER_NAME"].value_counts()
    # Identity comparison is the correct check for the None sentinel.
    if top is None:
        return employer_counts
    return employer_counts.head(top)

In [None]:
# Top 10 companies in Utah sponsoring H-1B
H1Bcompanies_in_state(state_two_letter="UT", top=10)

In [None]:
# Top 10 companies in Georgia sponsoring H-1B
H1Bcompanies_in_state(state_two_letter="GA", top=10)

In [None]:
# Top 10 companies in California ("CA") sponsoring H-1B
H1Bcompanies_in_state(state_two_letter="CA", top=10)

In [None]:
# Share (in percent) of each PW_WAGE_LEVEL value
wage_level_share = data_h1["PW_WAGE_LEVEL"].value_counts(normalize=True)
wage_level_share * 100

In [None]:
# Top 10 H1B job titles in USA

# Build every column from one value_counts() result. Selecting
# reset_index()["JOB_TITLE"] yields the percentages only on pandas < 2.0; from 2.0
# on that label holds the job titles themselves, which would corrupt "perc".
job_title_counts = data_h1["JOB_TITLE"].value_counts()
title = pd.DataFrame({
    "title": job_title_counts.index,
    "count": job_title_counts.values,
    "perc": job_title_counts.values / job_title_counts.sum() * 100,
})
title.head(10)

In [None]:
# Where do Data Science and related titles stand here?
ds_keywords = ["DATA SCIE", "MACHINE LEAR"]
ds_pattern = "|".join(ds_keywords)
is_ds_title = title["title"].str.contains(ds_pattern)
ds_title = title[is_ds_title]

# NOTE(review): the second figure is .count() of the matching rows of `title`,
# not the sum of their "count" column; confirm that is the intended number.
ds_perc_total = ds_title["perc"].sum()
ds_rows_matched = ds_title["count"].count()
print("Percentage & Count of Data Science and Related Titles : %f, %d"
      % (ds_perc_total, ds_rows_matched))

In [None]:
# Relative frequency of each "PW_UNIT_OF_PAY" value
data_h1["PW_UNIT_OF_PAY"].value_counts(normalize=True).to_frame()

In [None]:
# Salaries are NOT converted to a common unit: the "ACTUAL_SALARY" computation
# below is disabled, and only rows paid per year are kept instead.

# unitpay_to_num = {"Year":1, "Hour": 2080, "Month": 12, "Bi-Weekly": 24}
# data_h1["MULTIPLIER"] = data_h1["PW_UNIT_OF_PAY"].map(unitpay_to_num)
# data_h1["ACTUAL_SALARY"] = data_h1["WAGE_RATE_OF_PAY_FROM"] * data_h1["MULTIPLIER"]

# Retain only the rows whose PW_UNIT_OF_PAY is "Year"
is_yearly = data_h1["PW_UNIT_OF_PAY"] == "Year"
data_h1 = data_h1[is_yearly]

In [None]:
# What are associate data scientists being paid in the US?
ass_ds_columns = [
    "WAGE_RATE_OF_PAY_FROM",
    "PW_UNIT_OF_PAY",
    "EMPLOYER_NAME",
    "EMPLOYER_CITY",
    "EMPLOYER_STATE",
]
is_assoc_ds = data_h1["JOB_TITLE"] == "ASSOCIATE DATA SCIENTIST"
ass_ds = data_h1.loc[is_assoc_ds, ass_ds_columns].reset_index(drop=True)

In [None]:
# Associate Data Scientist rows ordered from the highest to the lowest wage
viz = ass_ds.sort_values("WAGE_RATE_OF_PAY_FROM", ascending=False)

In [None]:
# Cast the wage column to float, then average it per employer state
wage_column = "WAGE_RATE_OF_PAY_FROM"
viz[wage_column] = viz[wage_column].astype(float)
final_viz = viz.groupby("EMPLOYER_STATE")[wage_column].mean().reset_index()

In [None]:
# Fill missing job titles with "not available".
# assign() returns a new frame, so the column is not written into a frame that may
# itself be a filtered slice (which can raise pandas' SettingWithCopyWarning).
data_h1 = data_h1.assign(JOB_TITLE=data_h1["JOB_TITLE"].fillna("not available"))

### Only Data Science related jobs

In [None]:
# Function to get data for visualization

def dataviz_job_salary_dist(data, job_title_list, salary_column):
    """Select rows whose job title matches any pattern and average a column per state.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``JOB_TITLE`` and ``EMPLOYER_STATE`` columns plus ``salary_column``.
    job_title_list : list of str
        Patterns handed to ``Series.str.contains`` (regular expressions by default).
        A row is kept when its ``JOB_TITLE`` matches at least one of them.
    salary_column : str
        Name of the column to average.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The matching rows of ``data`` (each row at most once, original order), and
        a frame with ``EMPLOYER_STATE`` and the mean of ``salary_column`` per state.
    """
    job_titles = data["JOB_TITLE"]
    # OR the per-pattern masks together: rows stay stacked in one frame and a row
    # matching several patterns is not duplicated. na=False treats missing titles
    # as non-matching instead of breaking the boolean mask.
    matches_any = pd.Series(False, index=data.index)
    for job_title in job_title_list:
        matches_any = matches_any | job_titles.str.contains(job_title, na=False)
    result_df = data[matches_any]
    state_means = result_df.groupby("EMPLOYER_STATE")[salary_column].mean().reset_index()
    return (result_df, state_means)

In [None]:
# Getting data for the map visualization; the frame passed in is restricted to
# rows where NEW_EMPLOYMENT == 1
new_employment_rows = data_h1[data_h1["NEW_EMPLOYMENT"] == 1]
result_df, all_final_viz = dataviz_job_salary_dist(
    data=new_employment_rows,
    job_title_list=["DATA SCIENTIST"],
    salary_column="WAGE_RATE_OF_PAY_FROM",
)

#### PLOTLY

In [None]:
# Work on a copy so that all_final_viz keeps its original (numeric) columns;
# aliasing it and casting every column to str would rewrite it in place.
df = all_final_viz.copy()

# Light-to-dark purple colour scale given as [position, colour] pairs.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

# Hover text: the state value followed by the wage value rendered as a string.
df['text'] = (df['EMPLOYER_STATE'].astype(str) + '<br>' + 'Salary '
              + df['WAGE_RATE_OF_PAY_FROM'].astype(str))

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = df['EMPLOYER_STATE'].astype(str),
        z = df['WAGE_RATE_OF_PAY_FROM'].astype(float),
        locationmode = 'USA-states',
        text = df['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            # z holds the wage column as-is (no scaling to millions is applied),
            # so the colour bar must not be labelled "Millions USD".
            title = "Average wage (USD)")
        ) ]

layout = dict(
        title = 'H1 B Heat Map',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )

fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )

In [None]:
# One person details

def lca_single_indv(dataset, employer_name_like, title_like):
    """Return the transposed rows for one employer pattern and one exact job title.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``EMPLOYER_NAME`` and ``JOB_TITLE`` columns.
    employer_name_like : str
        Pattern handed to ``Series.str.contains`` on ``EMPLOYER_NAME``
        (a regular expression by default).
    title_like : str
        Job title compared for exact equality against ``JOB_TITLE``.

    Returns
    -------
    pandas.DataFrame
        The matching rows transposed: one column per matching row, one row per
        column of ``dataset``.
    """
    # na=False: rows without an employer name count as non-matching instead of
    # putting NaN into the boolean mask.
    employer_match = dataset["EMPLOYER_NAME"].str.contains(employer_name_like, na=False)
    title_match = dataset["JOB_TITLE"] == title_like
    return dataset[employer_match & title_match].T

In [None]:
# Case-status breakdown (in percent) for rows where CHANGE_PREVIOUS_EMPLOYMENT is not 0
changed_employment = data_h1["CHANGE_PREVIOUS_EMPLOYMENT"] != 0
data_h1.loc[changed_employment, "CASE_STATUS"].value_counts(normalize=True) * 100

In [None]:
# Average Salary of a company

def avg_salary_company(employer_name, column, data=None):
    """Mean of ``column`` over the rows of one employer.

    Parameters
    ----------
    employer_name : str
        Value compared for exact equality against ``EMPLOYER_NAME``.
    column : str
        Name of the column to average.
    data : pandas.DataFrame, optional
        Frame with an ``EMPLOYER_NAME`` column. When omitted, the notebook-level
        ``data_h1`` is used as it exists at call time.

    Returns
    -------
    float
        The mean of ``column``; NaN when no row matches ``employer_name``.
    """
    if data is None:
        data = data_h1
    return data.loc[data["EMPLOYER_NAME"] == employer_name, column].mean()
    
# Average WAGE_RATE_OF_PAY_FROM for one example employer
avg_salary_company(employer_name="OVERSTOCK.COM, INC.", column="WAGE_RATE_OF_PAY_FROM")