<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/02_sh_build_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

In [4]:
#Load the per-column preprocessing notes ('$'-separated file, first column used as the row index).
#on_bad_lines='warn' skips malformed rows and reports them; it replaces error_bad_lines=False, which pandas 2.0 removed.
observations_df=pd.read_csv('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/reports/preprocessing_steps_observations.csv',sep='$',index_col=0,on_bad_lines='warn')

In [5]:
#only pull columns that are required: index labels of observations_df whose preprocess_comment is
#missing, 'Feature engineering' or 'Target feature'
#np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed
keep_comments=[np.nan,'Feature engineering','Target feature']
required_features=list(observations_df[observations_df.preprocess_comment.isin(keep_comments)].index)

In [6]:
#Read the disclosure workbook, loading only the columns listed in required_features
lca_disclosure_url='https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2020_Q2.xlsx'
LCA_df=pd.read_excel(lca_disclosure_url,usecols=required_features)

In [7]:
#Drop rows where CASE_STATUS not in ('Certified','Denied')
rows_to_drop=LCA_df[~LCA_df['CASE_STATUS'].isin(['Certified','Denied'])].index
LCA_df.drop(index=rows_to_drop,inplace=True)
#Renumber the remaining rows; the previous row labels are moved into a column by reset_index
LCA_df.reset_index(inplace=True)

### Feature engineering

In [8]:
#feature engineering steps
def date_diff(date1,date2):
  """Return date1 minus date2.

  The operands are subtracted as-is; no type validation or handling of
  empty inputs is performed.
  """
  #TODO error handling -
    #validate input datatype is datetime
    #handle empty inputs
  elapsed=date1-date2
  return elapsed

#### PROCESSING_DAYS and VALIDITY_DAYS

In [9]:
#PROCESSING_DAYS = DECISION_DATE - RECEIVED_DATE, VALIDITY_DAYS = END_DATE - BEGIN_DATE
#NOTE(review): date_diff returns the raw subtraction, so for datetime columns these hold time deltas rather than day counts - confirm
LCA_df['PROCESSING_DAYS']=date_diff(LCA_df['DECISION_DATE'],LCA_df['RECEIVED_DATE'])
LCA_df['VALIDITY_DAYS']=date_diff(LCA_df['END_DATE'],LCA_df['BEGIN_DATE'])

#### SOC_Codes

In [10]:
#Split SOC_CODE once at the first '-' : part 0 -> SOC_CD2, part 1 is split again below
soc_dash_parts=LCA_df['SOC_CODE'].str.split(pat='-',n=1,expand=True)
#Split the remainder at the first '.' : part 0 -> SOC_CD4, part 1 -> SOC_CD_ONET
soc_dot_parts=soc_dash_parts[1].str.split(pat='.',n=1,expand=True)
LCA_df['SOC_CD2']=soc_dash_parts[0]
LCA_df['SOC_CD4']=soc_dot_parts[0]
LCA_df['SOC_CD_ONET']=soc_dot_parts[1]

#### USA_YN

In [11]:
def is_USA(country):
  """Return 'Y' when country is exactly 'UNITED STATES OF AMERICA', otherwise 'N'."""
  return 'Y' if country=='UNITED STATES OF AMERICA' else 'N'

In [12]:
#Flag each row 'Y'/'N' by passing its EMPLOYER_COUNTRY value through is_USA
LCA_df['USA_YN']=LCA_df['EMPLOYER_COUNTRY'].apply(is_USA)

#### Employer_Worksite_YN

In [13]:
#'Y' by default; 'N' wherever the employer and worksite postal codes compare as not equal
postal_codes_differ=LCA_df['EMPLOYER_POSTAL_CODE'].ne(LCA_df['WORKSITE_POSTAL_CODE'])
LCA_df['EMPLOYER_WORKSITE_YN']='Y'
LCA_df.loc[postal_codes_differ,'EMPLOYER_WORKSITE_YN']='N'

#### OES_YN

In [14]:
#'Y' by default; rows that have a non-missing PW_OTHER_SOURCE are flagged 'N'
LCA_df['OES_YN']='Y'
#Select rows with a boolean mask through .loc: this is correct for any row index,
#whereas passing index labels to .iloc only works while the labels equal the row positions
LCA_df.loc[~LCA_df.PW_OTHER_SOURCE.isna(),'OES_YN']='N'

#### SURVEY_YEAR

In [15]:
#reference https://www.interviewqs.com/ddi-code-snippets/extract-month-year-pandas
#Take the text before the first '-' in PW_OES_YEAR, parse it as a date and keep its yearly period as SURVEY_YEAR
oes_period_start=LCA_df['PW_OES_YEAR'].str.split(pat='-',n=1,expand=True)[0]
LCA_df['SURVEY_YEAR']=pd.to_datetime(oes_period_start).dt.to_period('Y')
#Collect PW_OTHER_YEAR for the rows where OES_YN==N, named SURVEY_YEAR so that update() targets that column
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
PW_other_year=LCA_df.loc[LCA_df['OES_YN']=='N','PW_OTHER_YEAR'].rename("SURVEY_YEAR")
#NOTE(review): SURVEY_YEAR holds yearly periods while PW_OTHER_YEAR is copied in unconverted - confirm the resulting column type is what downstream steps expect
LCA_df.update(PW_other_year)

#### WAGE_ABOVE_PW_HR

In [16]:
#initialize with WAGE_RATE_OF_PAY_FROM; rows whose WAGE_UNIT_OF_PAY is neither 'Year' nor 'Month' keep this value as-is
LCA_df['WAGE_PER_HR']=LCA_df['WAGE_RATE_OF_PAY_FROM']
#Rows are selected with boolean masks through .loc, which is correct for any row index
#(passing index labels to .iloc only works while the labels equal the row positions)
#compute for Year - divisor 2067 is presumably the hours counted per year, TODO confirm
paid_per_year=LCA_df['WAGE_UNIT_OF_PAY']=='Year'
LCA_df.loc[paid_per_year,'WAGE_PER_HR']=LCA_df.loc[paid_per_year,'WAGE_RATE_OF_PAY_FROM']/2067
#compute for Month - divisor 172 is presumably the hours counted per month, TODO confirm
paid_per_month=LCA_df['WAGE_UNIT_OF_PAY']=='Month'
LCA_df.loc[paid_per_month,'WAGE_PER_HR']=LCA_df.loc[paid_per_month,'WAGE_RATE_OF_PAY_FROM']/172

In [17]:
#initialize with PREVAILING_WAGE; rows whose PW_UNIT_OF_PAY is neither 'Year' nor 'Month' keep this value as-is
LCA_df['PW_WAGE_PER_HR']=LCA_df['PREVAILING_WAGE']
#Rows are selected with boolean masks through .loc, which is correct for any row index
#(passing index labels to .iloc only works while the labels equal the row positions)
#compute for Year - divisor 2067 is presumably the hours counted per year, TODO confirm
pw_per_year=LCA_df['PW_UNIT_OF_PAY']=='Year'
LCA_df.loc[pw_per_year,'PW_WAGE_PER_HR']=LCA_df.loc[pw_per_year,'PREVAILING_WAGE']/2067
#compute for Month - divisor 172 is presumably the hours counted per month, TODO confirm
pw_per_month=LCA_df['PW_UNIT_OF_PAY']=='Month'
LCA_df.loc[pw_per_month,'PW_WAGE_PER_HR']=LCA_df.loc[pw_per_month,'PREVAILING_WAGE']/172

In [18]:
#Difference between the WAGE_PER_HR and PW_WAGE_PER_HR columns (positive when WAGE_PER_HR is the larger)
LCA_df['WAGE_ABOVE_PW_HR']=LCA_df['WAGE_PER_HR']-LCA_df['PW_WAGE_PER_HR']

#### Drop columns after feature engineering

In [None]:
#Source columns consumed by the feature engineering cells, plus intermediate and helper columns
drop_columns=[
  'DECISION_DATE','RECEIVED_DATE','END_DATE','BEGIN_DATE',
  'SOC_CODE',
  'EMPLOYER_COUNTRY',
  'EMPLOYER_POSTAL_CODE','WORKSITE_POSTAL_CODE',
  'PW_OTHER_SOURCE','PW_OES_YEAR','PW_OTHER_YEAR',
  'WAGE_RATE_OF_PAY_FROM','WAGE_UNIT_OF_PAY','WAGE_PER_HR',
  'PW_WAGE_PER_HR','PREVAILING_WAGE','PW_UNIT_OF_PAY',
  'index',
]
LCA_df.drop(columns=drop_columns,inplace=True)

In [26]:
#embed categorical features


In [None]:
#scale numerical features
#Show the observations_df rows whose index labels are the current LCA_df column names
observations_df.loc[list(LCA_df.columns)]

In [None]:
#separate target column and convert to binary


In [None]:
#Preview the first five rows of the engineered frame
LCA_df.head(n=5)

In [62]:
#Label every column that has no preprocess_action recorded as 'Use feature'
no_action_rows=observations_df['preprocess_action'].isna()
observations_df.loc[no_action_rows,'preprocess_comment']='Use feature'

In [63]:
#Display the rows of observations_df that have no preprocess_action recorded
observations_df[observations_df['preprocess_action'].isna()]

Unnamed: 0,Dtype,percent_missing,cardinality,preprocess_action,preprocess_comment,new_feature_name,new_feature_logic,Categorical class,embedding
VISA_CLASS,object,0.0,99.997455,,Use feature,,,Categorical,Standardized random
SOC_TITLE,object,0.0,99.596623,,Use feature,,,Categorical,Standardized random
FULL_TIME_POSITION,object,0.0,99.998728,,Use feature,,,Binary,Standardized random
TOTAL_WORKER_POSITIONS,int64,0.0,99.972642,,Use feature,,,,
NEW_EMPLOYMENT,int64,0.0,99.973914,,Use feature,,,,
CONTINUED_EMPLOYMENT,int64,0.0,99.988548,,Use feature,,,,
CHANGE_PREVIOUS_EMPLOYMENT,int64,0.0,99.992365,,Use feature,,,,
NEW_CONCURRENT_EMPLOYMENT,int64,0.0,99.996183,,Use feature,,,,
CHANGE_EMPLOYER,int64,0.0,99.991093,,Use feature,,,,
AMENDED_PETITION,int64,0.0,99.992365,,Use feature,,,,
