<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/03_sh_build_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer

In [2]:
observations_df=pd.read_csv('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/reports/final_observations.csv',sep='$',index_col=0,error_bad_lines=False)

In [3]:
def read_csv_to_list(filepath,header=None,squeeze=True):
  return list(pd.read_csv(filepath,header=None,squeeze=True))

In [None]:
#required_features=list(observations_df[(observations_df.preprocess_comment.isin([np.NaN,'Feature engineering','Target feature','Use feature as is'])) & (~observations_df.preprocess_action.isin(['New feature']))].index)
required_features=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',header=None,squeeze=True)
LCA_df=pd.read_excel('https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2020_Q1.xlsx',usecols=required_features)
LCA_dfcopy=LCA_df.copy()

In [4]:
required_features=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',header=None,squeeze=True)
data_files_list_path='/content/drive/MyDrive/Datasets/LCA_files_list.txt'
#create an empty dataframe to hold the final concatenated result
input_df=pd.DataFrame(columns=required_features)

#define the file object 
file_itr=open(file=data_files_list_path,mode='r')

#iterate through the file and append the data to input_df
for path in file_itr:
  data_df=pd.read_excel(path,usecols=required_features)
  input_df=input_df.append(data_df,ignore_index=True)    

file_itr.close

<function TextIOWrapper.close>

In [5]:
LCA_df=input_df.copy()
LCA_df.shape

(269190, 37)

In [52]:
#Custom transformer to drop rows based on filter
class droprows_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, row_index, inplace, reset_index):
      self.row_index = row_index # row index to drop
      self.inplace=True
      self.reset_index=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      X.drop(index=self.row_index,inplace=self.inplace)
      if self.reset_index:
        X.reset_index(inplace=True,drop=True)
      return X

In [7]:
drop_row_index=LCA_df[~LCA_df.CASE_STATUS.isin(['Certified','Denied'])].index


In [None]:
dr=droprows_Transformer(row_index=drop_row_index,inplace=True,reset_index=True)
dr.transform(LCA_df)

In [9]:
#Separate target column
y=LCA_df.pop('CASE_STATUS')

In [58]:
class buildfeatures_Transformer(BaseEstimator, TransformerMixin):
  def __init__(self, input_columns):
    self.input_columns=input_columns

  def date_diff(self,date1,date2):
    return date1-date2

  def is_USA(self,country):
    if country=='UNITED STATES OF AMERICA':
      USA_YN='Y' 
    else:
      USA_YN='N'
    return USA_YN

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    # Processing_Days and Validity_days
    X['PROCESSING_DAYS']=self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days
    X['VALIDITY_DAYS']=self.date_diff(X.END_DATE, X.BEGIN_DATE).dt.days

    # SOC_Codes
    X['SOC_CD2']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[0]
    X['SOC_CD4']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[0]
    X['SOC_CD_ONET']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[1]

    # USA_YN
    X['USA_YN']=X.EMPLOYER_COUNTRY.apply(self.is_USA)

    # Employer_Worksite_YN
    X['EMPLOYER_WORKSITE_YN']='Y'
    X.loc[X.EMPLOYER_POSTAL_CODE.ne(X.WORKSITE_POSTAL_CODE),'EMPLOYER_WORKSITE_YN']='N'

    # OES_YN
    X['OES_YN']='Y'
    X.iloc[X[~X.PW_OTHER_SOURCE.isna()].index,X.columns.get_loc('OES_YN')]='N'

    # SURVEY_YEAR
    X['SURVEY_YEAR']=pd.to_datetime(X.PW_OES_YEAR.str.split(pat='-',n=1,expand=True)[0]).dt.to_period('Y')
    PW_other_year=X[X.OES_YN=='N'].PW_OTHER_YEAR
    #Rename the series and update dataframe with series object
    PW_other_year.rename("SURVEY_YEAR",inplace=True)
    X.update(PW_other_year)

    # WAGE_ABOVE_PREVAILING_HR
    X['WAGE_PER_HR']=X.WAGE_RATE_OF_PAY_FROM
    #compute for Year
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Year'].WAGE_RATE_OF_PAY_FROM/2067
    #compute for Month
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Month'].WAGE_RATE_OF_PAY_FROM/172

    #initialize with WAGE_RATE_OF_PAY_FROM
    X['PW_WAGE_PER_HR']=X.PREVAILING_WAGE
    #compute for Year
    X.iloc[X[X.PW_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Year'].PREVAILING_WAGE/2067
    #compute for Month
    X.iloc[X[X.PW_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Month'].PREVAILING_WAGE/172

    X['WAGE_ABOVE_PW_HR']=X.WAGE_PER_HR-X.PW_WAGE_PER_HR

    return X

In [10]:
#fe_cols=list(observations_df[(observations_df.preprocess_comment.isin(['Feature engineering'])) & (~observations_df.preprocess_action.isin(['New feature']))].index)
fe_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/feature_engineering_columns.csv',header=None,squeeze=True)

In [None]:

bf=buildfeatures_Transformer(fe_cols)
fe_df=bf.transform(LCA_df)

In [11]:
#Custom transformer to drop features for input feature list
class dropfeatures_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, inplace):
      self.columns = columns # list of categorical columns in input Dataframe
      self.inplace=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      X.drop(columns=self.columns,inplace=self.inplace)
      return X

In [63]:
#drop_cols=set(LCA_df.columns.values)-set(observations_df[observations_df.preprocess_action.isin(['New feature','Use feature as is'])].index.values)
drop_cols=read_csv_to_list('https://github.com/sharsulkar/H1B_LCA_outcome_prediction/raw/main/data/processed/drop_columns.csv',header=None,squeeze=True)

In [None]:

df=dropfeatures_Transformer(columns=list(drop_cols),inplace=True)
df.transform(fe_df)

In [106]:
#Custom transformer to compute Random Standard encoding
#add option to return ordered encoding, whether to include encoding for missing value or not
class RSE_Transformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, cat_cols, categories=None, RSE=None ):
        self.cat_cols = cat_cols # list of categorical columns in input Dataframe
        self.categories = categories # Array of unique non-numeric values in each categorical column
        self.RSE = RSE # Array of Random Standard encoding for each row in categories
        
    #Return self, nothing else to do here
    def fit( self, X, y=None ):
      #identify categorical columns
      #self.cat_cols=list(X.select_dtypes('O').columns)
      #Get a list of all unique categorical values for each column
      self.categories = [X[column].unique() for column in X[self.cat_cols]]
      #replace missing values and append missing value label to each column to handle missing values in test dataset that might not be empty in train dataset
      for i in range(len(self.categories)):
        if np.array(self.categories[i].astype(str)!=str(np.nan)).all():
          self.categories[i]=np.append(self.categories[i],np.nan)
      #compute RandomStandardEncoding 
      self.RSE=[np.random.normal(0,1,len(self.categories[i])) for i in range(len(self.cat_cols))]
      return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y=None):
      for i in range(len(self.cat_cols)):
        #Temporary measure to handle previously unseen values
        #replace unseen values with NaN
        X.loc[X[~X[(str(self.cat_cols[i]))].isin(self.categories[i])].index,(str(self.cat_cols[i]))]=np.NaN

        #replace seen values with encoding
        X.loc[:,(str(self.cat_cols[i]))].replace(dict(zip(self.categories[i], self.RSE[i])),inplace=True)
      return X    

In [14]:
#cat_cols=observations_df[(observations_df['Categorical class'].isin(['Categorical','Ordinal','Binary'])) & (observations_df.preprocess_action!='Drop column') & (observations_df.preprocess_comment!='Target feature')].index.values
cat_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/categorical_columns.csv',header=None,squeeze=True)

In [None]:
#embed categorical features

#col_imputer=SimpleImputer(strategy='constant',fill_value='missing')
rse=RSE_Transformer(cat_cols)
rse.fit_transform(fe_df[cat_cols])

In [15]:
#num_cols=observations_df[(observations_df['Categorical class']=='Numerical') & (observations_df.preprocess_action!='Drop column')].index.values
num_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/numeric_columns.csv',header=None,squeeze=True)

In [None]:
#scale numerical features

num_imputer=SimpleImputer(strategy='mean')
std=StandardScaler()
X=std.fit_transform(fe_df[num_cols])

In [None]:
LCA_df=LCA_dfcopy.copy()

In [16]:
#Build preprocessing pipeline
build_feature_pipe=make_pipeline(
    droprows_Transformer(row_index=drop_row_index,inplace=True,reset_index=True),
    buildfeatures_Transformer(fe_cols)
    )

numerical_preprocess=make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)
preprocess_pipe=make_column_transformer(
    (dropfeatures_Transformer(columns=drop_cols,inplace=True),drop_cols),
    (RSE_Transformer(cat_cols),cat_cols),
    (numerical_preprocess,num_cols),
    remainder='passthrough'
)
all_preprocess=make_pipeline(
    preprocess_pipe
)

In [17]:
#apply pipeline
#feature engineering + drop rows
fe_df=build_feature_pipe.fit_transform(LCA_df)
#Separate target column - add conditions to apply only on training dataset
#y=fe_df.pop('CASE_STATUS')
#drop columns + encoding
X=all_preprocess.fit_transform(fe_df)

In [21]:
X.shape

(256141, 31)

In [22]:
#save transformed dataset and target
final_cols=['VISA_CLASS', 'SOC_TITLE', 'FULL_TIME_POSITION',
       'TOTAL_WORKER_POSITIONS', 'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT',
       'CHANGE_PREVIOUS_EMPLOYMENT', 'NEW_CONCURRENT_EMPLOYMENT',
       'CHANGE_EMPLOYER', 'AMENDED_PETITION', 'EMPLOYER_NAME', 'NAICS_CODE',
       'AGENT_REPRESENTING_EMPLOYER', 'WORKSITE_WORKERS', 'SECONDARY_ENTITY',
       'PW_WAGE_LEVEL', 'TOTAL_WORKSITE_LOCATIONS', 'AGREE_TO_LC_STATEMENT',
       'H-1B_DEPENDENT', 'WILLFUL_VIOLATOR', 'PUBLIC_DISCLOSURE',
       'PROCESSING_DAYS', 'VALIDITY_DAYS', 'SOC_CD2', 'SOC_CD4', 'SOC_CD_ONET',
       'USA_YN', 'EMPLOYER_WORKSITE_YN', 'OES_YN', 'SURVEY_YEAR',
       'WAGE_ABOVE_PW_HR']
pd.DataFrame(X,columns=final_cols).to_csv('/content/drive/MyDrive/Datasets/LCA_train_q1q2processed.csv')
pd.DataFrame(y,columns=['CASE_STATUS']).to_csv('/content/drive/MyDrive/Datasets/LCA_train_q1q2_expected.csv')

In [23]:
from pickle import dump, load

In [24]:
#save pipeline
#reference - https://machinelearningmastery.com/how-to-save-and-load-models-and-data-preparation-in-scikit-learn-for-later-use/
dump(all_preprocess,open('/content/drive/MyDrive/preprocess_pipe.pkl','wb'))
#all_preprocess=load(open('/content/drive/MyDrive/preprocess_pipe.pkl','rb'))

In [None]:
X.shape

(104944, 31)

In [44]:
test_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q3.xlsx',usecols=required_features)

In [45]:
test_df_copy=test_df.copy()

In [59]:
test_df=test_df_copy.copy()

In [None]:
drop_row_index=test_df[~test_df.CASE_STATUS.isin(['Certified','Denied'])].index
dr=droprows_Transformer(row_index=drop_row_index,inplace=True,reset_index=True)
dr.transform(test_df)

In [61]:
bf=buildfeatures_Transformer(fe_cols)
fe_df_test=bf.transform(test_df)

In [None]:
df=dropfeatures_Transformer(columns=list(drop_cols),inplace=True)
df.transform(fe_df_test)

In [70]:
all_preprocess=load(open('/content/drive/MyDrive/preprocess_pipe.pkl','rb'))

In [107]:
rse=RSE_Transformer(cat_cols,all_preprocess['columntransformer'].transformers_[1][1].categories,all_preprocess['columntransformer'].transformers_[1][1].RSE)
rse.transform(fe_df_test[cat_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,VISA_CLASS,SOC_TITLE,FULL_TIME_POSITION,EMPLOYER_NAME,NAICS_CODE,AGENT_REPRESENTING_EMPLOYER,SECONDARY_ENTITY,PW_WAGE_LEVEL,AGREE_TO_LC_STATEMENT,H-1B_DEPENDENT,WILLFUL_VIOLATOR,PUBLIC_DISCLOSURE,SOC_CD2,SOC_CD4,SOC_CD_ONET,USA_YN,EMPLOYER_WORKSITE_YN,OES_YN,SURVEY_YEAR
0,0.496714,-0.792521,0.059630,-1.119670,-1.483366,0.610284,1.004394,1.872325,-1.645468,-0.700877,-0.266956,0.460467,0.329237,0.433790,0.238151,-0.308344,-0.281646,0.802063,-0.480176
1,0.496714,0.899600,0.059630,-0.497962,0.062714,0.610284,0.302875,1.068347,-1.645468,-1.478855,-0.266956,0.460467,0.329237,0.649480,0.238151,-0.308344,-0.574633,0.802063,-0.480176
2,0.496714,-1.703382,0.059630,-0.497962,-1.793796,0.610284,0.302875,1.872325,-1.645468,-1.478855,-0.266956,0.460467,2.193315,1.257878,0.238151,-0.308344,-0.574633,0.802063,-0.480176
3,0.496714,-0.792521,0.059630,-1.119670,-1.483366,0.610284,1.004394,1.872325,-1.645468,-0.700877,-0.266956,0.460467,0.329237,0.433790,0.238151,-0.308344,-0.281646,0.802063,-0.480176
4,0.496714,0.633919,0.059630,0.058370,0.076159,0.610284,0.302875,-0.340250,-1.645468,-1.478855,-0.266956,0.460467,0.329237,0.013480,0.238151,-0.308344,-0.574633,0.802063,-0.480176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182764,-0.138264,0.899600,0.059630,-0.497962,0.457844,0.610284,0.302875,0.178053,-1.645468,-1.478855,-0.266956,0.460467,0.329237,0.649480,0.238151,-0.308344,-0.574633,0.802063,-0.480176
182765,-0.138264,0.747294,-0.646937,-0.497962,-0.626133,0.610284,0.302875,1.068347,-1.645468,-0.706819,-1.188291,0.460467,-1.817741,1.481575,0.238151,-0.308344,-0.281646,0.802063,-0.480176
182766,-0.138264,-0.825497,0.059630,0.746685,-0.292753,1.765707,0.302875,1.872325,-1.645468,-0.706819,-1.188291,-1.531903,0.225904,-0.228927,0.238151,-0.308344,-0.574633,0.802063,-0.480176
182767,-0.138264,-0.108760,0.059630,1.056030,-0.569812,0.610284,0.302875,0.178053,-1.645468,-0.706819,-1.188291,0.460467,0.225904,-1.768187,0.238151,-0.308344,-0.574633,0.802063,-0.480176


In [None]:
y_test=fe_df_test.pop('CASE_STATUS')

In [None]:
num_imputer=SimpleImputer(strategy='mean')
std=StandardScaler()
X_test=num_imputer.transform(fe_df_test[num_cols])
X_test=std.transform(fe_df_test[num_cols])

In [102]:
fe_df_test.loc[fe_df_test[~fe_df_test.EMPLOYER_NAME.isin(all_preprocess['columntransformer'].transformers_[1][1].categories[3])].index,'EMPLOYER_NAME']=np.NaN

In [None]:
fe_df_test[fe_df_test.EMPLOYER_NAME.isin(all_preprocess['columntransformer'].transformers_[1][1].categories[3])]