<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/03_sh_build_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer

In [3]:
# Load final_observations.csv: '$'-delimited, first column used as the index,
# rows that cannot be parsed are skipped instead of raising.
observations_df = pd.read_csv(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/reports/final_observations.csv',
    sep='$',
    index_col=0,
    error_bad_lines=False,
)

In [None]:
def read_csv_to_list(filepath,header=None,squeeze=True):
  """Read a CSV file and return its contents as a plain Python list.

  Parameters
  ----------
  filepath : str, path object or file-like
      Anything accepted by ``pandas.read_csv``.
  header : int, list of int or None, default None
      Forwarded to ``pandas.read_csv``; ``None`` treats every row as data.
  squeeze : bool, default True
      When True, a single-column result is collapsed to a Series so the
      returned list holds that column's values. When False (or when the
      file has several columns) the list holds the column labels, which is
      what ``list(DataFrame)`` yields.

  Returns
  -------
  list
  """
  # Both arguments used to be ignored in favour of hard-coded values.
  frame=pd.read_csv(filepath,header=header)
  if squeeze:
    # DataFrame.squeeze stands in for the read_csv(squeeze=...) keyword,
    # which newer pandas releases no longer accept.
    frame=frame.squeeze('columns')
  return list(frame)

In [59]:
# Columns to read from the raw LCA disclosure file.
# Earlier derivation from observations_df, kept for reference:
#required_features=list(observations_df[(observations_df.preprocess_comment.isin([np.NaN,'Feature engineering','Target feature','Use feature as is'])) & (~observations_df.preprocess_action.isin(['New feature']))].index)
required_features = read_csv_to_list(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',
    header=None,
    squeeze=True,
)

In [60]:

# Read only the required columns of the FY2020 Q2 LCA disclosure workbook.
LCA_df = pd.read_excel(
    'https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2020_Q2.xlsx',
    usecols=required_features,
)

In [61]:
# Keep an untouched copy: later cells modify LCA_df in place and restore it from here.
LCA_dfcopy = LCA_df.copy()

In [62]:
#Custom transformer to drop rows based on filter
class droprows_Transformer(BaseEstimator, TransformerMixin):
    """Drop a fixed set of rows from a DataFrame and optionally reset its index.

    Parameters
    ----------
    row_index : index labels
        Labels of the rows to drop.
    inplace : bool, default True
        When True the rows are removed from the DataFrame passed to
        ``transform`` itself; when False the input is left untouched and a
        new DataFrame is returned.
    reset_index : bool, default True
        When True the index is reset after dropping. The previous index is
        kept as an ``index`` column because ``drop`` is not passed to
        ``DataFrame.reset_index``.
    """
    def __init__(self, row_index, inplace=True, reset_index=True):
      # Store the arguments unmodified: they used to be overwritten with a
      # hard-coded True, which ignored the caller's choice and is at odds
      # with how scikit-learn reads parameters back (get_params / clone).
      self.row_index = row_index
      self.inplace = inplace
      self.reset_index = reset_index

    def fit(self, X, y=None):
      """Nothing is learned; returns ``self``."""
      return self

    def transform(self, X, y=None):
      """Return ``X`` without the rows listed in ``row_index``."""
      if self.inplace:
        X.drop(index=self.row_index, inplace=True)
      else:
        # drop() returns a new frame here; rebind so the result is not lost.
        X = X.drop(index=self.row_index)
      if self.reset_index:
        X.reset_index(inplace=True)
      return X

In [None]:
# Rows whose CASE_STATUS is neither 'Certified' nor 'Denied' are removed.
is_kept_status = LCA_df.CASE_STATUS.isin(['Certified', 'Denied'])
drop_row_index = LCA_df[~is_kept_status].index
dr = droprows_Transformer(row_index=drop_row_index, inplace=True, reset_index=True)
dr.transform(LCA_df)

In [64]:
# Separate target column (pop also removes CASE_STATUS from LCA_df).
y = LCA_df.pop('CASE_STATUS')

In [65]:
class buildfeatures_Transformer(BaseEstimator, TransformerMixin):
  """Add the engineered LCA features as new columns of the input DataFrame.

  ``transform`` writes the new columns onto the DataFrame it receives and
  returns that same object.

  Columns read: DECISION_DATE, RECEIVED_DATE, END_DATE, BEGIN_DATE, SOC_CODE,
  EMPLOYER_COUNTRY, EMPLOYER_POSTAL_CODE, WORKSITE_POSTAL_CODE,
  PW_OTHER_SOURCE, PW_OES_YEAR, PW_OTHER_YEAR, WAGE_RATE_OF_PAY_FROM,
  WAGE_UNIT_OF_PAY, PREVAILING_WAGE, PW_UNIT_OF_PAY.

  Columns added: PROCESSING_DAYS, VALIDITY_DAYS, SOC_CD2, SOC_CD4,
  SOC_CD_ONET, USA_YN, EMPLOYER_WORKSITE_YN, OES_YN, SURVEY_YEAR,
  WAGE_PER_HR, PW_WAGE_PER_HR, WAGE_ABOVE_PW_HR.

  Parameters
  ----------
  input_columns : list
      Stored on the instance; not consulted by ``transform``.
  """
  # Divisors that turn a yearly / monthly amount into an hourly one.
  HOURS_PER_YEAR=2067
  HOURS_PER_MONTH=172

  def __init__(self, input_columns):
    self.input_columns=input_columns

  def date_diff(self,date1,date2):
    """Return ``date1 - date2``."""
    return date1-date2

  def is_USA(self,country):
    """Return 'Y' when ``country`` is 'UNITED STATES OF AMERICA', else 'N'."""
    if country=='UNITED STATES OF AMERICA':
      USA_YN='Y' 
    else:
      USA_YN='N'
    return USA_YN

  def _hourly_rate(self,amount,unit):
    """Divide yearly and monthly amounts down to an hourly rate.

    Rows whose unit is neither 'Year' nor 'Month' (including missing units)
    keep their amount unchanged.
    NOTE(review): confirm that every other pay unit in the data is meant to
    be treated as already hourly.
    """
    divisor=unit.map({'Year':self.HOURS_PER_YEAR,'Month':self.HOURS_PER_MONTH}).fillna(1)
    return amount/divisor

  def fit(self, X, y=None):
    """Nothing is learned; returns ``self``."""
    return self

  def transform(self, X, y=None):
    """Append the engineered columns to ``X`` and return it."""
    # Processing_Days and Validity_days
    X['PROCESSING_DAYS']=self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days
    X['VALIDITY_DAYS']=self.date_diff(X.END_DATE, X.BEGIN_DATE).dt.days

    # SOC_Codes: 'AA-BBBB.CC' -> AA / BBBB / CC (split once, reuse the parts)
    soc_major_rest=X.SOC_CODE.str.split(pat='-',n=1,expand=True)
    soc_minor_onet=soc_major_rest[1].str.split(pat='.',n=1,expand=True)
    X['SOC_CD2']=soc_major_rest[0]
    X['SOC_CD4']=soc_minor_onet[0]
    X['SOC_CD_ONET']=soc_minor_onet[1]

    # USA_YN
    X['USA_YN']=X.EMPLOYER_COUNTRY.apply(self.is_USA)

    # Employer_Worksite_YN
    X['EMPLOYER_WORKSITE_YN']='Y'
    X.loc[X.EMPLOYER_POSTAL_CODE.ne(X.WORKSITE_POSTAL_CODE),'EMPLOYER_WORKSITE_YN']='N'

    # OES_YN: 'N' wherever another prevailing-wage source is filled in.
    # The mask is built from X itself (not from a notebook-level frame) and
    # applied by label, so it is correct for any input and any index.
    X['OES_YN']='Y'
    X.loc[X.PW_OTHER_SOURCE.notna(),'OES_YN']='N'

    # SURVEY_YEAR
    X['SURVEY_YEAR']=pd.to_datetime(X.PW_OES_YEAR.str.split(pat='-',n=1,expand=True)[0]).dt.to_period('Y')
    # For non-OES rows take the year from PW_OTHER_YEAR; update() only
    # overwrites where the replacement value is not missing.
    other_year=X.loc[X.OES_YN=='N','PW_OTHER_YEAR'].rename('SURVEY_YEAR')
    X.update(other_year)

    # WAGE_ABOVE_PREVAILING_HR
    X['WAGE_PER_HR']=self._hourly_rate(X.WAGE_RATE_OF_PAY_FROM,X.WAGE_UNIT_OF_PAY)
    X['PW_WAGE_PER_HR']=self._hourly_rate(X.PREVAILING_WAGE,X.PW_UNIT_OF_PAY)
    X['WAGE_ABOVE_PW_HR']=X.WAGE_PER_HR-X.PW_WAGE_PER_HR

    return X

In [66]:
# Source columns consumed by feature engineering.
# Earlier derivation from observations_df, kept for reference:
#fe_cols=list(observations_df[(observations_df.preprocess_comment.isin(['Feature engineering'])) & (~observations_df.preprocess_action.isin(['New feature']))].index)
fe_cols = read_csv_to_list(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/feature_engineering_columns.csv',
    header=None,
    squeeze=True,
)

In [67]:

# Run feature engineering on its own; the transformer returns the frame it was given.
bf = buildfeatures_Transformer(fe_cols)
fe_df = bf.transform(LCA_df)

In [68]:
#Custom transformer to drop features for input feature list
class dropfeatures_Transformer(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from a DataFrame.

    Parameters
    ----------
    columns : list
        Labels of the columns to drop.
    inplace : bool, default True
        When True the columns are removed from the DataFrame passed to
        ``transform`` itself; when False the input is left untouched and a
        new DataFrame is returned.
    """
    def __init__(self, columns, inplace=True):
      # Store the arguments unmodified: ``inplace`` used to be overwritten
      # with a hard-coded True regardless of what the caller passed.
      self.columns = columns
      self.inplace = inplace

    def fit(self, X, y=None):
      """Nothing is learned; returns ``self``."""
      return self

    def transform(self, X, y=None):
      """Return ``X`` without the configured columns."""
      if self.inplace:
        X.drop(columns=self.columns, inplace=True)
        return X
      # Non-inplace drop returns a new frame, which must be handed back.
      return X.drop(columns=self.columns)

In [83]:
# Columns to discard after feature engineering.
# Earlier derivation from observations_df, kept for reference:
#drop_cols=set(LCA_df.columns.values)-set(observations_df[observations_df.preprocess_action.isin(['New feature','Use feature as is'])].index.values)
drop_cols = read_csv_to_list(
    'https://github.com/sharsulkar/H1B_LCA_outcome_prediction/raw/main/data/processed/drop_columns.csv',
    header=None,
    squeeze=True,
)

In [None]:

# Drop the unwanted columns from fe_df in place.
# Note: `df` here names the transformer instance, not a DataFrame.
df = dropfeatures_Transformer(columns=list(drop_cols), inplace=True)
df.transform(fe_df)

In [88]:
#Custom transformer to compute Random Standard encoding
#add option to return ordered encoding, whether to include encoding for missing value or not
class RSE_Transformer(BaseEstimator, TransformerMixin):
    """Random Standard Encoding of categorical columns.

    ``fit`` collects the unique values of every column in ``cat_cols`` and
    draws one standard-normal number per value; ``transform`` replaces each
    value by its number. Values that were not seen during ``fit`` are left
    unchanged by the replacement.

    Parameters
    ----------
    cat_cols : list
        Categorical columns of the input DataFrame.
    categories : list of arrays, optional
        Unique values per column; overwritten by ``fit``.
    RSE : list of arrays, optional
        Encoding per entry of ``categories``; overwritten by ``fit``.
    """
    def __init__( self, cat_cols, categories=None, RSE=None ):
      self.cat_cols = cat_cols # list of categorical columns in input Dataframe
      self.categories = categories # Array of unique non-numeric values in each categorical column
      self.RSE = RSE # Array of Random Standard encoding for each row in categories

    def fit( self, X, y=None ):
      """Learn the categories of each column and draw their encodings."""
      # Unique values per categorical column, in cat_cols order.
      self.categories = [X[column].unique() for column in self.cat_cols]
      # Make sure every column has a missing-value slot, so that missing
      # values appearing only at transform time still get an encoding.
      for i, values in enumerate(self.categories):
        if np.array(values.astype(str)!=str(np.nan)).all():
          self.categories[i]=np.append(values,np.nan)
      # One N(0, 1) draw per category, taken from numpy's global random state.
      self.RSE=[np.random.normal(0,1,len(values)) for values in self.categories]
      return self

    def transform(self, X, y=None):
      """Return a copy of ``X`` with the categorical columns encoded."""
      # Work on a copy and assign each column back. The previous in-place
      # replace on a ``.loc`` slice acted on a temporary object (pandas
      # raised SettingWithCopyWarning) and could leave X unchanged.
      X = X.copy()
      for i, column in enumerate(self.cat_cols):
        column = str(column)
        encoding = dict(zip(self.categories[i], self.RSE[i]))
        X[column] = X[column].replace(encoding)
      return X

In [89]:
# Categorical columns to encode.
# Earlier derivation from observations_df, kept for reference:
#cat_cols=observations_df[(observations_df['Categorical class'].isin(['Categorical','Ordinal','Binary'])) & (observations_df.preprocess_action!='Drop column') & (observations_df.preprocess_comment!='Target feature')].index.values
cat_cols = read_csv_to_list(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/categorical_columns.csv',
    header=None,
    squeeze=True,
)

In [90]:
# Encode the categorical features with Random Standard Encoding.

# Not currently applied:
#col_imputer=SimpleImputer(strategy='constant',fill_value='missing')
rse = RSE_Transformer(cat_cols)
rse.fit_transform(fe_df[cat_cols])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,VISA_CLASS,SOC_TITLE,FULL_TIME_POSITION,EMPLOYER_NAME,NAICS_CODE,AGENT_REPRESENTING_EMPLOYER,SECONDARY_ENTITY,PW_WAGE_LEVEL,AGREE_TO_LC_STATEMENT,H-1B_DEPENDENT,WILLFUL_VIOLATOR,PUBLIC_DISCLOSURE,SOC_CD2,SOC_CD4,SOC_CD_ONET,USA_YN,EMPLOYER_WORKSITE_YN,OES_YN,SURVEY_YEAR
0,0.496714,-0.234137,-1.654857,-1.289961,-0.435474,-1.680779,1.006235,-0.064766,1.210252,-0.980279,-0.082030,-1.430941,-0.287306,-1.221575,0.888044,-1.636841,-0.905419,1.113014,0.767372
1,0.496714,1.579213,-1.654857,-1.295079,-0.435474,-1.241483,0.684904,-0.406707,1.210252,-0.010362,-0.082030,-1.430941,-0.189568,2.206012,0.888044,-1.636841,-1.520694,2.120143,0.833171
2,0.496714,-0.234137,-1.654857,-1.289961,-0.435474,-1.680779,1.006235,-0.064766,1.210252,-0.980279,-0.082030,-1.430941,-0.287306,-1.221575,0.888044,-1.636841,-0.905419,1.113014,0.767372
3,0.496714,-0.234137,-1.654857,-1.289961,-0.435474,-1.680779,1.006235,-0.064766,1.210252,-0.980279,-0.082030,-1.430941,-0.287306,-1.221575,0.888044,-1.636841,-0.905419,1.113014,0.767372
4,0.496714,0.767435,-1.654857,-0.335785,-1.174054,-1.241483,1.006235,0.288895,1.210252,-0.010362,-0.082030,-1.430941,-0.287306,-1.053173,0.888044,-1.636841,-0.905419,1.113014,0.767372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151192,0.647689,-0.115648,-1.654857,-1.535572,1.139629,-1.680779,0.684904,0.004862,1.210252,0.105972,0.805302,-1.430941,0.385205,0.674299,0.888044,-1.636841,-0.905419,1.113014,0.767372
151193,0.647689,-1.913280,-1.654857,-0.460187,2.642910,-1.241483,0.684904,0.288895,1.210252,0.105972,0.805302,-1.430941,-0.287306,0.522203,0.888044,-1.636841,-0.905419,1.113014,0.767372
151194,0.647689,1.909417,0.823171,-1.935430,-0.501129,-1.680779,0.684904,0.288895,1.210252,0.105972,0.805302,-1.430941,-0.189568,-0.569792,0.888044,-1.636841,-1.520694,1.113014,0.767372
151195,0.647689,-0.601707,0.823171,-1.935430,-0.501129,-1.680779,0.684904,0.004862,1.210252,0.105972,0.805302,-1.430941,-0.060016,0.187683,0.888044,-1.636841,-1.520694,1.113014,0.767372


In [91]:
# Numeric columns to scale.
# Earlier derivation from observations_df, kept for reference:
#num_cols=observations_df[(observations_df['Categorical class']=='Numerical') & (observations_df.preprocess_action!='Drop column')].index.values
num_cols = read_csv_to_list(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/numeric_columns.csv',
    header=None,
    squeeze=True,
)

In [92]:
# Standardise the numeric features.

# The imputer is only instantiated in this cell; it is not applied to the data here.
num_imputer = SimpleImputer(strategy='mean')
std = StandardScaler()
X = std.fit_transform(fe_df[num_cols])

In [93]:
# Restore the raw frame from the saved copy before running the full pipeline.
LCA_df = LCA_dfcopy.copy()

In [94]:
#Build preprocessing pipeline

# Stage 1: drop the unwanted rows, then add the engineered features.
build_feature_pipe = make_pipeline(
    droprows_Transformer(row_index=drop_row_index, inplace=True, reset_index=True),
    buildfeatures_Transformer(fe_cols),
)

# Numeric columns: mean imputation followed by standard scaling.
numerical_preprocess = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Stage 2: one transformer per column group; columns that belong to no group
# are passed through unchanged.
preprocess_pipe = make_column_transformer(
    (dropfeatures_Transformer(columns=drop_cols, inplace=True), drop_cols),
    (RSE_Transformer(cat_cols), cat_cols),
    (numerical_preprocess, num_cols),
    remainder='passthrough',
)

all_preprocess = make_pipeline(preprocess_pipe)

In [95]:
# Apply the pipeline in two stages.

# Stage 1: row filtering + feature engineering.
fe_df = build_feature_pipe.fit_transform(LCA_df)

# Split off the target (TODO from the original: apply only on the training dataset).
y = fe_df.pop('CASE_STATUS')

# Stage 2: drop columns, encode categoricals, impute and scale numerics.
X = all_preprocess.fit_transform(fe_df)

In [None]:
#save transformed dataset
# NOTE(review): only the feature matrix is written here; the target y is not saved.
# The column transformer emits its groups in declaration order: the drop group
# (which yields no columns), the categorical columns, the numeric columns, and
# finally the passthrough remainder in its original order. fe_df.columns still
# contains the dropped columns and follows a different order, so the header is
# rebuilt to match X instead.
_handled_cols=set(drop_cols)|set(cat_cols)|set(num_cols)
_remainder_cols=[col for col in fe_df.columns if col not in _handled_cols]
processed_columns=list(cat_cols)+list(num_cols)+_remainder_cols
pd.DataFrame(X,columns=processed_columns).to_csv('/content/drive/MyDrive/Datasets/processed.csv')

In [None]:
from pickle import dump, load

In [None]:
#save pipeline
#reference - https://machinelearningmastery.com/how-to-save-and-load-models-and-data-preparation-in-scikit-learn-for-later-use/
# The context manager closes the file handle even if pickling fails;
# the bare open() used before never closed it.
with open('/content/drive/MyDrive/preprocess_pipe.pkl','wb') as pipeline_file:
    dump(all_preprocess,pipeline_file)
# To restore - only unpickle files you created yourself, pickle can run arbitrary code:
#with open('/content/drive/MyDrive/preprocess_pipe.pkl','rb') as pipeline_file:
#    all_preprocess=load(pipeline_file)