<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/02_sh_build_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
import numpy as np
import pandas as pd

In [182]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [116]:
#Load the preprocessing observations table ('$'-separated, first column used as the index).
#on_bad_lines='warn' replaces error_bad_lines=False: that keyword was deprecated in pandas 1.3 and
#removed in pandas 2.0. Malformed rows are still skipped with a warning instead of raising.
observations_df=pd.read_csv(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/reports/preprocessing_steps_observations.csv',
    sep='$',
    index_col=0,
    on_bad_lines='warn',
)

In [117]:
#only pull columns that are required:
#rows of observations_df whose preprocess_comment is missing or one of the listed keep-labels
#np.nan is used instead of the np.NaN alias, which no longer exists in NumPy 2.0
required_features=list(
    observations_df[
        observations_df.preprocess_comment.isin([np.nan,'Feature engineering','Target feature','Use feature as is'])
    ].index
)

In [118]:
#Read the FY2020 Q2 LCA disclosure workbook, loading only the columns listed in required_features
LCA_df=pd.read_excel(
    'https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2020_Q2.xlsx',
    usecols=required_features,
)

In [119]:
#Keep only the rows whose CASE_STATUS is 'Certified' or 'Denied'.
#reset_index() (without drop=True) stores the previous row labels in an 'index' column.
LCA_df=LCA_df[LCA_df.CASE_STATUS.isin(['Certified','Denied'])].reset_index()

In [120]:
def modify_observations(df,index,columns,values,modify_action='update_values'):
  """Add a row to, or update cells of, an observations dataframe in place.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataframe to modify. It is changed in place and also returned.
  index : label or list of labels
      Row label(s) to add or update.
  columns : list of labels
      Column label(s) to update. Ignored when modify_action is 'add_row'.
  values : scalar or list
      For 'add_row': one value per column of df.
      For 'update_values': the value(s) written to df.loc[index,columns].
  modify_action : {'update_values','add_row'}
      Which modification to perform.

  Returns
  -------
  pandas.DataFrame
      The same df object that was passed in.

  Raises
  ------
  ValueError
      If modify_action is not one of the supported actions.
  """
  #assert - index, columns and values are string list type, 
  #columns and values are same size, for single column - value should be scalar
  #columns that have modification exist in observation_df
  if modify_action=='add_row':
    df.loc[index]=values
  elif modify_action=='update_values':
    df.loc[index,columns]=values
  else:
    #an unknown action used to be ignored silently, which hid typos in the caller
    raise ValueError("modify_action must be 'add_row' or 'update_values', got {!r}".format(modify_action))

  return df

In [121]:
def missing_statistics(df,column):
  """Return the percentage of rows of df in which column is missing (NA)."""
  total_rows=len(df)
  missing_rows=total_rows-df[column].count()  #count() only counts non-NA cells
  return missing_rows*100/total_rows

In [122]:
def cardinality_statistics(df,column):
  """Return the percentage of rows of df that do not add a new distinct value to column.

  Computed as (row count - number of distinct values) * 100 / row count,
  where a missing value counts as one distinct value.
  """
  total_rows=len(df)
  distinct_values=df[column].nunique(dropna=False)
  return (total_rows-distinct_values)*100/total_rows

### Feature engineering

In [123]:
#feature engineering steps
def date_diff(date1,date2):
  """Return date1 minus date2, using whatever subtraction the operands define.

  TODO (not implemented yet): validate that both inputs are datetime-like and handle empty inputs.
  """
  difference=date1-date2
  return difference

#### Processing_Days and Validity_days

In [124]:
#Compute feature
#PROCESSING_DAYS = DECISION_DATE - RECEIVED_DATE
LCA_df['PROCESSING_DAYS']=date_diff(date1=LCA_df['DECISION_DATE'],date2=LCA_df['RECEIVED_DATE'])
#VALIDITY_DAYS = END_DATE - BEGIN_DATE
LCA_df['VALIDITY_DAYS']=date_diff(date1=LCA_df['END_DATE'],date2=LCA_df['BEGIN_DATE'])

In [128]:
#update observations_df with new feature details (one new row per engineered feature)
for new_feature in ['PROCESSING_DAYS','VALIDITY_DAYS']:
  observations_df=modify_observations(
    observations_df,
    index=new_feature,
    columns=[],
    values=[
      LCA_df[new_feature].dtype,
      missing_statistics(LCA_df,new_feature),
      cardinality_statistics(LCA_df,new_feature),
      'New feature','Feature engineering','','','Ordinal','Ordered standardized random',
    ],
    modify_action='add_row',
  )

#### SOC_Codes

In [130]:
#Compute feature
#Split SOC_CODE once at the first '-': column 0 is the part before it, column 1 the remainder
soc_parts=LCA_df['SOC_CODE'].str.split(pat='-',n=1,expand=True)
#Split the remainder once at the first '.'
soc_detail=soc_parts[1].str.split(pat='.',n=1,expand=True)
LCA_df['SOC_CD2']=soc_parts[0]
LCA_df['SOC_CD4']=soc_detail[0]
LCA_df['SOC_CD_ONET']=soc_detail[1]

In [131]:
#update observations_df with new feature details (one new row per engineered feature)
for new_feature in ['SOC_CD2','SOC_CD4','SOC_CD_ONET']:
  observations_df=modify_observations(
    observations_df,
    index=new_feature,
    columns=[],
    values=[
      LCA_df[new_feature].dtype,
      missing_statistics(LCA_df,new_feature),
      cardinality_statistics(LCA_df,new_feature),
      'New feature','Feature engineering','','','Categorical','Standardized random',
    ],
    modify_action='add_row',
  )

#### USA_YN

In [132]:
def is_USA(country):
  """Return 'Y' when country equals 'UNITED STATES OF AMERICA' exactly, otherwise 'N'."""
  return 'Y' if country=='UNITED STATES OF AMERICA' else 'N'

In [133]:
#Compute feature
#Flag each row 'Y'/'N' depending on whether EMPLOYER_COUNTRY is the USA
LCA_df['USA_YN']=LCA_df['EMPLOYER_COUNTRY'].apply(is_USA)

In [135]:
#update observations_df with new feature details
observations_df=modify_observations(
  observations_df,
  index='USA_YN',
  columns=[],
  values=[
    LCA_df['USA_YN'].dtype,
    missing_statistics(LCA_df,'USA_YN'),
    cardinality_statistics(LCA_df,'USA_YN'),
    'New feature','Feature engineering','','','Binary','Standardized random',
  ],
  modify_action='add_row',
)

#### Employer_Worksite_YN

In [136]:
#Compute feature
#'N' where the employer and worksite postal codes compare unequal, 'Y' everywhere else
LCA_df['EMPLOYER_WORKSITE_YN']=np.where(
  LCA_df['EMPLOYER_POSTAL_CODE'].ne(LCA_df['WORKSITE_POSTAL_CODE']),
  'N',
  'Y',
)

In [138]:
#update observations_df with new feature details
observations_df=modify_observations(
  observations_df,
  index='EMPLOYER_WORKSITE_YN',
  columns=[],
  values=[
    LCA_df['EMPLOYER_WORKSITE_YN'].dtype,
    missing_statistics(LCA_df,'EMPLOYER_WORKSITE_YN'),
    cardinality_statistics(LCA_df,'EMPLOYER_WORKSITE_YN'),
    'New feature','Feature engineering','','','Binary','Standardized random',
  ],
  modify_action='add_row',
)

#### OES_YN

In [139]:
#Compute feature
#Default to 'Y', then flag 'N' wherever PW_OTHER_SOURCE is filled in.
#Rows are selected with a boolean mask through .loc; the previous .iloc call was given index labels,
#which only hits the right rows while labels and positions happen to coincide.
LCA_df['OES_YN']='Y'
LCA_df.loc[LCA_df.PW_OTHER_SOURCE.notna(),'OES_YN']='N'

In [140]:
#update observations_df with new feature details
observations_df=modify_observations(
  observations_df,
  index='OES_YN',
  columns=[],
  values=[
    LCA_df['OES_YN'].dtype,
    missing_statistics(LCA_df,'OES_YN'),
    cardinality_statistics(LCA_df,'OES_YN'),
    'New feature','Feature engineering','','','Binary','Standardized random',
  ],
  modify_action='add_row',
)

#### SURVEY_YEAR

In [141]:
#Compute feature
#reference https://www.interviewqs.com/ddi-code-snippets/extract-month-year-pandas
#Compute and move all 'Year' values from PW_OES_YEAR field into SURVEY_YEAR:
#take the text before the first '-' in PW_OES_YEAR, parse it as a datetime and keep its yearly period
LCA_df['SURVEY_YEAR']=pd.to_datetime(LCA_df.PW_OES_YEAR.str.split(pat='-',n=1,expand=True)[0]).dt.to_period('Y')
#Then create a series object to store only the 'YEAR' values from PW_OTHER_YEAR when OES_YN==N
PW_other_year=LCA_df[LCA_df.OES_YN=='N'].PW_OTHER_YEAR
#Rename the series and update dataframe with series object
#(the series name selects the column that DataFrame.update writes to; rows are matched on index labels)
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
#NOTE(review): PW_OTHER_YEAR is copied in without the to_datetime/to_period conversion applied above,
#so SURVEY_YEAR presumably ends up holding two different value types - confirm this is intended
PW_other_year.rename("SURVEY_YEAR",inplace=True)
LCA_df.update(PW_other_year)

In [142]:
#update observations_df with new feature details
observations_df=modify_observations(
  observations_df,
  index='SURVEY_YEAR',
  columns=[],
  values=[
    LCA_df['SURVEY_YEAR'].dtype,
    missing_statistics(LCA_df,'SURVEY_YEAR'),
    cardinality_statistics(LCA_df,'SURVEY_YEAR'),
    'New feature','Feature engineering','','','Ordinal','Standardized random',
  ],
  modify_action='add_row',
)

#### WAGE_ABOVE_PREVAILING_HR

In [143]:
#Compute feature
#initialize with WAGE_RATE_OF_PAY_FROM; rows whose unit is not converted below keep this value
LCA_df['WAGE_PER_HR']=LCA_df.WAGE_RATE_OF_PAY_FROM
#Rows are selected with a boolean mask through .loc (the right-hand Series is aligned on the index).
#The previous .iloc calls were given index labels, which only works while labels equal positions.
#compute for Year (divide by 2067)
LCA_df.loc[LCA_df.WAGE_UNIT_OF_PAY=='Year','WAGE_PER_HR']=LCA_df.WAGE_RATE_OF_PAY_FROM/2067
#compute for Month (divide by 172)
LCA_df.loc[LCA_df.WAGE_UNIT_OF_PAY=='Month','WAGE_PER_HR']=LCA_df.WAGE_RATE_OF_PAY_FROM/172
#NOTE(review): any WAGE_UNIT_OF_PAY other than 'Year'/'Month' is left unconverted - confirm that
#every remaining unit in the data is already an hourly rate

In [144]:
#Compute feature
#initialize with PREVAILING_WAGE; rows whose unit is not converted below keep this value
LCA_df['PW_WAGE_PER_HR']=LCA_df.PREVAILING_WAGE
#Rows are selected with a boolean mask through .loc (the right-hand Series is aligned on the index).
#The previous .iloc calls were given index labels, which only works while labels equal positions.
#compute for Year (divide by 2067)
LCA_df.loc[LCA_df.PW_UNIT_OF_PAY=='Year','PW_WAGE_PER_HR']=LCA_df.PREVAILING_WAGE/2067
#compute for Month (divide by 172)
LCA_df.loc[LCA_df.PW_UNIT_OF_PAY=='Month','PW_WAGE_PER_HR']=LCA_df.PREVAILING_WAGE/172
#NOTE(review): any PW_UNIT_OF_PAY other than 'Year'/'Month' is left unconverted - confirm that
#every remaining unit in the data is already an hourly rate

In [145]:
#Offered hourly wage minus the prevailing hourly wage
LCA_df['WAGE_ABOVE_PW_HR']=LCA_df['WAGE_PER_HR'].sub(LCA_df['PW_WAGE_PER_HR'])

In [146]:
#update observations_df with new feature details
observations_df=modify_observations(
  observations_df,
  index='WAGE_ABOVE_PW_HR',
  columns=[],
  values=[
    LCA_df['WAGE_ABOVE_PW_HR'].dtype,
    missing_statistics(LCA_df,'WAGE_ABOVE_PW_HR'),
    cardinality_statistics(LCA_df,'WAGE_ABOVE_PW_HR'),
    'New feature','Feature engineering','','','Numerical','Standard scaling',
  ],
  modify_action='add_row',
)

#### Drop columns after feature engineering

In [147]:
#Source and intermediate columns that the engineered features above were computed from,
#plus the 'index' column
drop_columns=[
  'DECISION_DATE','RECEIVED_DATE','END_DATE','BEGIN_DATE',
  'SOC_CODE',
  'EMPLOYER_COUNTRY',
  'EMPLOYER_POSTAL_CODE','WORKSITE_POSTAL_CODE',
  'PW_OTHER_SOURCE','PW_OES_YEAR','PW_OTHER_YEAR',
  'WAGE_RATE_OF_PAY_FROM','WAGE_UNIT_OF_PAY','WAGE_PER_HR',
  'PW_WAGE_PER_HR','PREVAILING_WAGE','PW_UNIT_OF_PAY',
  'index',
]
LCA_df=LCA_df.drop(columns=drop_columns)

### Encode finalized features

#### Update preprocess action for any features that will be used as is (pre encoding)

In [None]:
#Show the features that have no preprocess_action recorded yet
observations_df.loc[observations_df['preprocess_action'].isna()]

In [149]:
#Mark the listed features with preprocess_action 'Use feature as is'
index=[
  'VISA_CLASS', 'SOC_TITLE', 'FULL_TIME_POSITION','TOTAL_WORKER_POSITIONS',
  'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT','CHANGE_PREVIOUS_EMPLOYMENT',
  'NEW_CONCURRENT_EMPLOYMENT','CHANGE_EMPLOYER', 'AMENDED_PETITION',
  'EMPLOYER_NAME', 'NAICS_CODE','AGENT_REPRESENTING_EMPLOYER',
  'WORKSITE_WORKERS', 'SECONDARY_ENTITY','PW_WAGE_LEVEL',
  'TOTAL_WORKSITE_LOCATIONS', 'AGREE_TO_LC_STATEMENT','H-1B_DEPENDENT',
  'WILLFUL_VIOLATOR', 'PUBLIC_DISCLOSURE',
]
columns=['preprocess_action']
values='Use feature as is'
observations_df=modify_observations(
  observations_df,
  index=index,
  columns=columns,
  values=values,
  modify_action='update_values',
)

#### Update Observations_df with Categorical class and encoding plan for any features that might have been missed earlier

In [None]:
#Show the features that are not dropped but still have no 'Categorical class' recorded
observations_df.loc[
  observations_df['Categorical class'].isna()
  & observations_df['preprocess_action'].ne('Drop column')
]

In [151]:
#Record the listed features as 'Numerical' with the 'Standard scaling' embedding
index=[
  'TOTAL_WORKER_POSITIONS', 'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT',
  'CHANGE_PREVIOUS_EMPLOYMENT', 'NEW_CONCURRENT_EMPLOYMENT','CHANGE_EMPLOYER',
  'AMENDED_PETITION', 'WORKSITE_WORKERS','TOTAL_WORKSITE_LOCATIONS',
]
columns=['Categorical class','embedding']
values=['Numerical','Standard scaling']
observations_df=modify_observations(
  observations_df,
  index=index,
  columns=columns,
  values=values,
  modify_action='update_values',
)

### Build Pipeline

#### Categorical

In [207]:
from sklearn.impute import SimpleImputer

In [193]:
from sklearn.base import BaseEstimator, TransformerMixin

In [194]:
#Custom transformer to compute Random Standard encoding
class RSE_Transformer(BaseEstimator, TransformerMixin):
    """Replace the values of categorical columns with random draws from N(0, 1).

    fit() collects the distinct values of every column in cat_cols and draws one
    standard-normal number per distinct value. transform() substitutes each
    value with the number drawn for it.

    Parameters
    ----------
    cat_cols : sequence of column labels
        Categorical columns of the input dataframe to encode.
    random_state : int or None, default None
        Seed for the draws made in fit(). With None the global numpy random
        state is used (the original behaviour), so results differ between runs
        unless numpy was seeded by the caller.

    Attributes
    ----------
    categories : list of arrays, or None before fit()
        Distinct values per column, in cat_cols order. A NaN entry is appended
        when none of the values has the string form 'nan'.
    RSE : list of arrays, or None before fit()
        The random encoding of every entry of categories.
    """
    #Class Constructor
    def __init__( self, cat_cols, random_state=None ):
        self.cat_cols = cat_cols # list of categorical columns in input Dataframe
        self.random_state = random_state # optional seed for reproducible encodings
        self.categories = None # Array of unique non-numeric values in each categorical column
        self.RSE = None # Array of Random Standard encoding for each row in categories

    def fit( self, X, y=None ):
        """Learn the distinct values of each column of X and draw their encodings.

        y is accepted for scikit-learn API compatibility and is not used.
        Returns self.
        """
        #Use the global numpy generator unless a seed was supplied
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        #Get a list of all unique categorical values for each column
        self.categories = [X[column].unique() for column in self.cat_cols]
        #append a missing value label to each column that has none, to handle missing values
        #in a test dataset for columns that are never empty in the train dataset
        for i in range(len(self.categories)):
            if np.array(self.categories[i].astype(str)!=str(np.nan)).all():
                self.categories[i]=np.append(self.categories[i],np.nan)
        #compute RandomStandardEncoding: one N(0,1) draw per category
        self.RSE=[rng.normal(0,1,len(column_categories)) for column_categories in self.categories]
        return self

    def transform(self, X, y=None):
        """Replace every known category in the cat_cols columns of X with its encoding.

        X is modified in place and also returned. Values that were not seen
        during fit() are left unchanged by the replacement.
        """
        for column, column_categories, encoding in zip(self.cat_cols, self.categories, self.RSE):
            column = str(column)
            #Assign the result back to X: the previous chained
            #X.loc[:,column].replace(...,inplace=True) acted on an intermediate Series
            #and is not guaranteed to write through to X.
            X[column] = X[column].replace(dict(zip(column_categories, encoding)))
        return X

In [199]:
#embed categorical features
#Select every feature classed as Categorical, Ordinal or Binary that is not marked 'Drop column'
cat_cols=observations_df[
  observations_df['Categorical class'].isin(['Categorical','Ordinal','Binary'])
  & observations_df['preprocess_action'].ne('Drop column')
].index.values
col_imputer=SimpleImputer(strategy='constant',fill_value='missing')
#Fit the random standard encoder on the first four of those columns
rse=RSE_Transformer(cat_cols[:4])
rse.fit(LCA_df[cat_cols[:4]])

RSE_Transformer(cat_cols=array(['CASE_STATUS', 'VISA_CLASS', 'SOC_TITLE', 'FULL_TIME_POSITION'],
      dtype=object))

In [None]:
#Inspect the distinct values collected by rse.fit() for each fitted column
rse.categories

#### Ordinal

#### Binary

#### Numerical

In [179]:
#scale numerical features
#Select every feature classed as Numerical that is not marked 'Drop column'
numeric_cols=observations_df[
  observations_df['Categorical class'].eq('Numerical')
  & observations_df['preprocess_action'].ne('Drop column')
].index.values
num_imputer=SimpleImputer(strategy='mean')
#Fit the standard scaler on those columns
std=StandardScaler()
std.fit(LCA_df[numeric_cols])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
#separate target column and convert to binary
