<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/03_sh_build_modular_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from pickle import dump, load

In [2]:
def read_csv_to_list(filepath,header=None,squeeze=True):
  return list(pd.read_csv(filepath,header=None,squeeze=True))

In [4]:
#Import Q1 and Q2 data into 2 separate dataframes 
required_features=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',header=None,squeeze=True)

data1_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q1.xlsx',usecols=required_features)
data1_dfcopy=data1_df.copy()

data2_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q2.xlsx',usecols=required_features)
data2_dfcopy=data1_df.copy()

In [5]:
#Import Q1 and Q2 data into single dataframe
data_files_list_path='/content/drive/MyDrive/Datasets/LCA_files_list.txt'
#create an empty dataframe to hold the final concatenated result
input_df=pd.DataFrame(columns=required_features)

#define the file object 
file_itr=open(file=data_files_list_path,mode='r')

#iterate through the file and append the data to input_df
for path in file_itr:
  data_df=pd.read_excel(path,usecols=required_features)
  input_df=input_df.append(data_df,ignore_index=True)    

file_itr.close

input_dfcopy=input_df.copy()

In [6]:
#Custom transformer to drop rows based on filter
class droprows_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.row_index = None # row index to drop
      self.inplace=True
      self.reset_index=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      self.row_index=X[~X.CASE_STATUS.isin(['Certified','Denied'])].index
      X.drop(index=self.row_index,inplace=self.inplace)
      if self.reset_index:
        X.reset_index(inplace=True)#,drop=True
      return X

In [7]:
class buildfeatures_Transformer(BaseEstimator, TransformerMixin):
  def __init__(self, input_columns):
    self.input_columns=input_columns

  def date_diff(self,date1,date2):
    return date1-date2

  def is_USA(self,country):
    if country=='UNITED STATES OF AMERICA':
      USA_YN='Y' 
    else:
      USA_YN='N'
    return USA_YN

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    # Processing_Days and Validity_days
    X['PROCESSING_DAYS']=self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days
    X['VALIDITY_DAYS']=self.date_diff(X.END_DATE, X.BEGIN_DATE).dt.days

    # SOC_Codes
    X['SOC_CD2']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[0]
    X['SOC_CD4']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[0]
    X['SOC_CD_ONET']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[1]

    # USA_YN
    X['USA_YN']=X.EMPLOYER_COUNTRY.apply(self.is_USA)

    # Employer_Worksite_YN
    X['EMPLOYER_WORKSITE_YN']='Y'
    X.loc[X.EMPLOYER_POSTAL_CODE.ne(X.WORKSITE_POSTAL_CODE),'EMPLOYER_WORKSITE_YN']='N'

    # OES_YN
    X['OES_YN']='Y'
    X.iloc[X[~X.PW_OTHER_SOURCE.isna()].index,X.columns.get_loc('OES_YN')]='N'

    # SURVEY_YEAR
    X['SURVEY_YEAR']=pd.to_datetime(X.PW_OES_YEAR.str.split(pat='-',n=1,expand=True)[0]).dt.to_period('Y')
    PW_other_year=X[X.OES_YN=='N'].PW_OTHER_YEAR
    #Rename the series and update dataframe with series object
    PW_other_year.rename("SURVEY_YEAR",inplace=True)
    X.update(PW_other_year)

    # WAGE_ABOVE_PREVAILING_HR
    X['WAGE_PER_HR']=X.WAGE_RATE_OF_PAY_FROM
    #compute for Year
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Year'].WAGE_RATE_OF_PAY_FROM/2067
    #compute for Month
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Month'].WAGE_RATE_OF_PAY_FROM/172

    #initialize with WAGE_RATE_OF_PAY_FROM
    X['PW_WAGE_PER_HR']=X.PREVAILING_WAGE
    #compute for Year
    X.iloc[X[X.PW_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Year'].PREVAILING_WAGE/2067
    #compute for Month
    X.iloc[X[X.PW_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Month'].PREVAILING_WAGE/172

    X['WAGE_ABOVE_PW_HR']=X.WAGE_PER_HR-X.PW_WAGE_PER_HR

    return X

In [8]:
#Custom transformer to drop features for input feature list
class dropfeatures_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, inplace):
      self.columns = columns # list of categorical columns in input Dataframe
      self.inplace=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      X.drop(columns=self.columns,inplace=self.inplace)
      return X

In [9]:
#Custom transformer to compute Random Standard encoding
#add option to return ordered encoding, whether to include encoding for missing value or not
class RSE_Transformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, cat_cols, categories=None, RSE=None ):
        self.cat_cols = cat_cols # list of categorical columns in input Dataframe
        self.categories = categories # Array of unique non-numeric values in each categorical column
        self.RSE = RSE # Array of Random Standard encoding for each row in categories
        
    #Return self, nothing else to do here
    def fit( self, X, y=None ):
      #identify categorical columns
      #self.cat_cols=list(X.select_dtypes('O').columns)
      #Get a list of all unique categorical values for each column
      self.categories = [X[column].unique() for column in X[self.cat_cols]]
      #replace missing values and append missing value label to each column to handle missing values in test dataset that might not be empty in train dataset
      for i in range(len(self.categories)):
        if np.array(self.categories[i].astype(str)!=str(np.nan)).all():
          self.categories[i]=np.append(self.categories[i],np.nan)
      #compute RandomStandardEncoding 
      self.RSE=[np.random.normal(0,1,len(self.categories[i])) for i in range(len(self.cat_cols))]
      return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y=None):
      for i in range(len(self.cat_cols)):
        #Temporary measure to handle previously unseen values
        #replace unseen values with NaN
        X.loc[X[~X[(str(self.cat_cols[i]))].isin(self.categories[i])].index,(str(self.cat_cols[i]))]=np.NaN

        #replace seen values with encoding
        X.loc[:,(str(self.cat_cols[i]))].replace(dict(zip(self.categories[i], self.RSE[i])),inplace=True)
      return X    

In [107]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
  def __init__(self,mean=None,var=None,n_samples_seen=None,scale=None):
    self.mean=None #mean
    self.var=None
    self.n_samples_seen=None
    self.scale=None

  def compute_sample_mean(self,X):
    return np.mean(X,axis=0)

  def compute_sample_var(self,X):
    return np.var(X,axis=0)

  def compute_sample_size(self,X):
    #assuming X is imputed, if there are null values, throw error aksing that X be imputed first
    return len(X)

  def compute_pooled_mean(self,X):
    #compute the sample mean and size
    sample_mean=self.compute_sample_mean(X)
    sample_count=self.compute_sample_size(X) 
    #compute pool mean
    pool_mean=(self.mean*self.n_samples_seen + sample_mean*sample_count)/(self.n_samples_seen + sample_count)

    return pool_mean

  def compute_pooled_var(self,X):
    #compute the sample var and size
    sample_var=self.compute_sample_var(X)
    sample_count=self.compute_sample_size(X) 
    #compute pool variance
    pool_var=(self.var*(self.n_samples_seen - 1) + sample_var*(sample_count - 1))/(self.n_samples_seen + sample_count - 2)

    return pool_var

  def fit(self,X):
    if self.mean is None:
      self.mean=self.compute_sample_mean(X)
    else: 
      self.mean=self.compute_pooled_mean(X)
    
    if self.var is None:
      self.var=self.compute_sample_var(X)
    else: 
      self.var=self.compute_pooled_var(X)

    if self.n_samples_seen is None:
      self.n_samples_seen=self.compute_sample_size(X) 
    else: 
      self.n_samples_seen+=self.compute_sample_size(X)

  def transform(self,X):
    return (X-self.mean)/np.sqrt(self.var)

  def inverse_transform(self,X):
    return X*np.sqrt(self.var) + self.mean



In [111]:
c_scaler=CustomStandardScaler()
c_scaler.fit(fed1_arr)

In [112]:
c_scaler.fit(fed2_arr)

In [78]:
print('mean: ',c_scaler.mean)
print('var: ',c_scaler.var)
print('n_samples: ',c_scaler.n_samples_seen)

mean:  [1.65956641e+00 5.64872473e-01 3.72318372e-01 1.52552696e-01
 1.11149718e-02 2.88552008e-01 2.73025404e-01 1.63905427e+00
 1.14193745e+00 7.18051386e+00 1.06073344e+03 1.77432427e+02]
var:  [2.56411923e+01 1.21196638e+01 7.62461704e-01 6.40281393e-01
 3.13458405e-02 8.61782507e-01 8.69391342e-01 2.52004592e+01
 2.74346269e-01 7.59405744e-01 1.76069479e+04 2.19590546e+07]
n_samples:  256141


In [113]:
z=c_scaler.transform(fed12_arr)
np.sum(fed12_arr-c_scaler.inverse_transform(z),axis=0)

array([-1.38200562e-12, -1.36779477e-13,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.91846539e-13,
        0.00000000e+00, -9.76996262e-15, -1.70530257e-12,  1.18363985e-10])

In [114]:
np.sum(z-d1d2_numscaler.transform(fed12_arr),axis=0)

array([-8.89399665e-13, -4.52160531e-12, -3.41437989e-12, -4.35831926e-12,
       -2.35922393e-13,  1.69825265e-12,  2.38609132e-12,  5.37184186e-12,
       -4.24965618e-12, -2.30082620e-12, -4.88693461e-12,  5.01965136e-12])

In [106]:
c_scaler.transform(fed2_arr)


151197

In [79]:
print('mean: ',d1d2_numscaler.mean_)
print('var: ',d1d2_numscaler.var_)
print('n_samples: ',d1d2_numscaler.n_samples_seen_)

mean:  [1.65956641e+00 5.64872473e-01 3.72318372e-01 1.52552696e-01
 1.11149718e-02 2.88552008e-01 2.73025404e-01 1.63905427e+00
 1.14193745e+00 7.18051386e+00 1.06073344e+03 1.77432427e+02]
var:  [2.56622467e+01 1.21703331e+01 7.63296334e-01 6.40287980e-01
 3.13474050e-02 8.62958765e-01 8.69566038e-01 2.52277475e+01
 2.74379042e-01 7.64549919e-01 1.76283923e+04 2.19819023e+07]
n_samples:  256141


In [80]:
print('mean difference: ',d1d2_numscaler.mean_ - c_scaler.mean)
print('var difference: ',d1d2_numscaler.var_ - c_scaler.var)
print('n_samples difference: ',d1d2_numscaler.n_samples_seen_ - c_scaler.n_samples_seen)

mean difference:  [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.52651283e-14]
var difference:  [2.10544408e-02 5.06692761e-02 8.34629199e-04 6.58660272e-06
 1.56454454e-06 1.17625763e-03 1.74695240e-04 2.72883905e-02
 3.27728944e-05 5.14417441e-03 2.14444297e+01 2.28477565e+04]
n_samples difference:  0


In [10]:
fe_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/feature_engineering_columns.csv',header=None,squeeze=True)
drop_cols=read_csv_to_list('https://github.com/sharsulkar/H1B_LCA_outcome_prediction/raw/main/data/processed/drop_columns.csv',header=None,squeeze=True)
cat_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/categorical_columns.csv',header=None,squeeze=True)
num_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/numeric_columns.csv',header=None,squeeze=True)

In [11]:
#Build preprocessing pipeline
build_feature_pipe=make_pipeline(
    droprows_Transformer(),
    buildfeatures_Transformer(fe_cols)
    )

numerical_preprocess=make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)
preprocess_pipe=make_column_transformer(
    (dropfeatures_Transformer(columns=drop_cols,inplace=True),drop_cols),
    (RSE_Transformer(cat_cols),cat_cols),
    (numerical_preprocess,num_cols),
    remainder='passthrough'
)
all_preprocess=make_pipeline(
    preprocess_pipe
)

In [12]:
#drop unneeded rows+build features
fed1_df=build_feature_pipe.fit_transform(data1_df)
fed2_df=build_feature_pipe.fit_transform(data2_df)
fed12_df=build_feature_pipe.fit_transform(input_df)

In [13]:
#Instantiate imputer
d1_impute=SimpleImputer(strategy='median',copy=False)
d2_impute=SimpleImputer(strategy='median',copy=False)
d1d2_impute=SimpleImputer(strategy='median',copy=False)

In [14]:
#scaling numerical columns
d1_numscaler=StandardScaler()
d2_numscaler=StandardScaler()
d1d2_numscaler=StandardScaler()

In [89]:
#Impute missing values
fed1_arr=d1_impute.fit_transform(fed1_df[num_cols])
fed2_arr=d2_impute.fit_transform(fed2_df[num_cols])
fed12_arr=d1d2_impute.fit_transform(fed12_df[num_cols])

In [51]:
#Fit scaler to get parameters
d1_numscaler.fit(fed1_arr)
d2_numscaler.fit(fed2_arr)
d1d2_numscaler.fit(fed12_arr)

StandardScaler(copy=True, with_mean=True, with_std=True)

new_mean=(mean1\*N1 + mean2\*N2)/(N1+N1)  
https://www.statisticshowto.com/combined-mean/

In [None]:
#compute pooled mean of two separate datasets
(d1_numscaler.mean_*d1_numscaler.n_samples_seen_+d2_numscaler.mean_*d2_numscaler.n_samples_seen_)/(d1_numscaler.n_samples_seen_+d2_numscaler.n_samples_seen_)

array([1.65956641e+00, 5.64872473e-01, 3.72318372e-01, 1.52552696e-01,
       1.11149718e-02, 2.88552008e-01, 2.73025404e-01, 1.63905427e+00,
       1.14193745e+00, 7.18051386e+00, 1.06073344e+03, 1.77432427e+02])

In [None]:
#compared to the mean of the combined datasets
d1d2_numscaler.mean_

array([1.65956641e+00, 5.64872473e-01, 3.72318372e-01, 1.52552696e-01,
       1.11149718e-02, 2.88552008e-01, 2.73025404e-01, 1.63905427e+00,
       1.14193745e+00, 7.18051386e+00, 1.06073344e+03, 1.77432427e+02])

In [81]:
#compute pooled variance of two separate datasets
d1d2_numscaler.var_-(d1_numscaler.var_*(d1_numscaler.n_samples_seen_-1)+d2_numscaler.var_*(d2_numscaler.n_samples_seen_-1))/(d1_numscaler.n_samples_seen_+d2_numscaler.n_samples_seen_-2)

array([2.10544408e-02, 5.06692761e-02, 8.34629199e-04, 6.58660272e-06,
       1.56454454e-06, 1.17625763e-03, 1.74695240e-04, 2.72883905e-02,
       3.27728944e-05, 5.14417441e-03, 2.14444297e+01, 2.28477565e+04])

In [None]:
#compared to the variance of the combined datasets
d1d2_numscaler.var_

array([2.56622467e+01, 1.21703331e+01, 7.63296334e-01, 6.40287980e-01,
       3.13474050e-02, 8.62958765e-01, 8.69566038e-01, 2.52277475e+01,
       2.74379042e-01, 7.64549919e-01, 1.76283923e+04, 2.19819023e+07])

In [None]:
#Instantiate a standardscaler with the pooled mean and variance calcuated above
d3_numscaler=StandardScaler()
d3_numscaler.mean_=(d1_numscaler.mean_*d1_numscaler.n_samples_seen_+d2_numscaler.mean_*d2_numscaler.n_samples_seen_)/(d1_numscaler.n_samples_seen_+d2_numscaler.n_samples_seen_) #pooled mean
d3_numscaler.var_=(d1_numscaler.var_*(d1_numscaler.n_samples_seen_-1)+d2_numscaler.var_*(d2_numscaler.n_samples_seen_-1))/(d1_numscaler.n_samples_seen_+d2_numscaler.n_samples_seen_-2) #pooled variance
d3_numscaler.n_samples_seen_=d1_numscaler.n_samples_seen_+d2_numscaler.n_samples_seen_ #total number of samples
d3_numscaler.scale_=np.sqrt(d3_numscaler.var_) #sqrt(var) according to the method

In [None]:
#Scale the combined dataset using the new scaler and the combined scaler defined earlier
d3_scaled=d3_numscaler.transform(fed12_arr)
d1d2_scaled=d1d2_numscaler.transform(fed12_arr)

In [None]:
#compare the difference between the transformed data of these 2 scalers, ideally all columns should have 0 difference
#but looks like columns 7 (WORKSITE_WORKERS) and 8 (TOTAL_WORKSITE_LOCATIONS) have differences.
np.sum(d1d2_scaled-d3_scaled,axis=0)

array([ 8.89399665e-13,  4.52160531e-12,  3.41437989e-12,  4.35831926e-12,
        2.35922393e-13, -1.69825265e-12, -2.38609132e-12, -5.37184186e-12,
        4.24965618e-12,  2.30082620e-12,  4.88693461e-12, -5.01965136e-12])

In [None]:
#check the indexes where the difference in non-zero along the columns
np.argwhere(np.floor(np.sum(d1d2_scaled-d3_scaled,axis=1)))

array([[   321],
       [  1028],
       [  1029],
       ...,
       [255881],
       [256004],
       [256111]])

In [None]:
#check a sample
d1d2_scaled[321]-d3_scaled[321]

array([ 5.34438295e-05,  3.38119407e-04, -3.93114960e-04,  9.80599544e-07,
        1.56667952e-06,  2.11912095e-04,  2.94147881e-05,  6.88684730e-05,
        1.61843142e-05,  4.56505871e-03, -1.57120356e-04, -2.32644563e-02])

In [None]:
#check difference between the data and its inverse transform
np.sum(fed12_arr-d1d2_numscaler.inverse_transform(d3_scaled),axis=0)

array([ 2.69968492e-11,  1.99740890e-10, -2.74292811e-11,  8.39245340e-13,
       -4.24105195e-14, -6.27992103e-12, -7.39930339e-12,  1.62687641e-11,
       -8.20465917e-12,  3.38216566e-10,  2.56568455e-09, -1.58673377e-08])