<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/05_sh_batch_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training the selected model on entire training data
We are using FY20 H1B LCA dataset which is available as 4 .xlsx files - one for each quarter. As each file has more than 150K records, it is easier if the model is incrementaly trained on each file separately.

### Import libraries, custom functions and define preprocessing transform classes

In [11]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from pickle import dump, load
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, confusion_matrix
import time

In [2]:
def read_csv_to_list(filepath,header=None,squeeze=True):
  return list(pd.read_csv(filepath,header=None,squeeze=True))

In [3]:
#Custom transformer to drop rows based on filter
class droprows_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.row_index = None # row index to drop
      self.inplace=True
      self.reset_index=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      self.row_index=X[~X.CASE_STATUS.isin(['Certified','Denied'])].index
      X.drop(index=self.row_index,inplace=self.inplace)
      if self.reset_index:
        X.reset_index(inplace=True,drop=True)
      return X

In [4]:
class buildfeatures_Transformer(BaseEstimator, TransformerMixin):
  def __init__(self, input_columns):
    self.input_columns=input_columns

  def date_diff(self,date1,date2):
    return date1-date2

  def is_USA(self,country):
    if country=='UNITED STATES OF AMERICA':
      USA_YN='Y' 
    else:
      USA_YN='N'
    return USA_YN

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    # Processing_Days and Validity_days
    X['PROCESSING_DAYS']=self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days
    X['VALIDITY_DAYS']=self.date_diff(X.END_DATE, X.BEGIN_DATE).dt.days

    # SOC_Codes - removed as this feature has low imporance for selected model
    #X['SOC_CD2']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[0]
    #X['SOC_CD4']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[0]
    #X['SOC_CD_ONET']=X.SOC_CODE.str.split(pat='-',n=1,expand=True)[1].str.split(pat='.',n=1,expand=True)[1]

    # USA_YN
    X['USA_YN']=X.EMPLOYER_COUNTRY.apply(self.is_USA)

    # Employer_Worksite_YN
    X['EMPLOYER_WORKSITE_YN']='Y'
    X.loc[X.EMPLOYER_POSTAL_CODE.ne(X.WORKSITE_POSTAL_CODE),'EMPLOYER_WORKSITE_YN']='N'

    # OES_YN
    X['OES_YN']='Y'
    X.iloc[X[~X.PW_OTHER_SOURCE.isna()].index,X.columns.get_loc('OES_YN')]='N'

    # SURVEY_YEAR
    X['SURVEY_YEAR']=pd.to_datetime(X.PW_OES_YEAR.str.split(pat='-',n=1,expand=True)[0]).dt.to_period('Y')
    PW_other_year=X[X.OES_YN=='N'].PW_OTHER_YEAR
    #Rename the series and update dataframe with series object
    PW_other_year.rename("SURVEY_YEAR",inplace=True)
    X.update(PW_other_year)

    # WAGE_ABOVE_PREVAILING_HR
    X['WAGE_PER_HR']=X.WAGE_RATE_OF_PAY_FROM
    #compute for Year
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Year'].WAGE_RATE_OF_PAY_FROM/2067
    #compute for Month
    X.iloc[X[X.WAGE_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('WAGE_PER_HR')]=X[X.WAGE_UNIT_OF_PAY=='Month'].WAGE_RATE_OF_PAY_FROM/172

    #initialize with WAGE_RATE_OF_PAY_FROM
    X['PW_WAGE_PER_HR']=X.PREVAILING_WAGE
    #compute for Year
    X.iloc[X[X.PW_UNIT_OF_PAY=='Year'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Year'].PREVAILING_WAGE/2067
    #compute for Month
    X.iloc[X[X.PW_UNIT_OF_PAY=='Month'].index,X.columns.get_loc('PW_WAGE_PER_HR')]=X[X.PW_UNIT_OF_PAY=='Month'].PREVAILING_WAGE/172

    X['WAGE_ABOVE_PW_HR']=X.WAGE_PER_HR-X.PW_WAGE_PER_HR

    return X

In [5]:
#Custom transformer to drop features for input feature list
class dropfeatures_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, inplace):
      self.columns = columns # list of categorical columns in input Dataframe
      self.inplace=True

    def fit( self, X, y=None):
      return self 
    
    def transform(self, X, y=None):
      X.drop(columns=self.columns,inplace=self.inplace)
      return X

In [6]:
#Custom transformer to compute Random Standard encoding for categorical features for incrementaly encoding data
class RSE_Transformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, cat_cols, categories, RSE ):
        self.cat_cols = cat_cols # list of categorical columns in input Dataframe
        self.categories = categories # Array of unique non-numeric values in each categorical column
        self.RSE = RSE # Array of Random Standard encoding for each row in categories
        
    def fit( self, X, y=None ):
      #Get a list of all unique categorical values for each column
      if self.categories is None:
        self.categories = [X[column].unique() for column in cat_cols]

        #replace missing values and append missing value label to each column to handle missing values in test dataset that might not be empty in train dataset
        for i in range(len(self.categories)):
          if np.array(self.categories[i].astype(str)!=str(np.nan)).all():
            self.categories[i]=np.append(self.categories[i],np.nan)

        #compute RandomStandardEncoding 
        self.RSE=[np.random.normal(0,1,len(self.categories[i])) for i in range(len(self.cat_cols))]

      else:
        for i in range(len(self.cat_cols)):
          #append new unique categories to self.categories
          new_categories=list(set(X[self.cat_cols[i]].unique()).difference(set(self.categories[i])))
          if new_categories!=[]:
            #print('not empty') #replace with logging call
            #print('categories before append',len(categories[i])) #logging call
            self.categories[i]=np.append(self.categories[i],new_categories) #append new categories to the end
            new_RSE=np.random.normal(0,1,len(new_categories)) #generate new RSE values
            #regenrate if overlap found with existing encodings
            if set(new_RSE).issubset(set(self.RSE[i])): 
              #print('yes') #loggin call
              new_RSE=np.random.normal(0,1,len(new_categories))
            
            self.RSE[i]=np.append(self.RSE[i],new_RSE) #append new RSE values
          #print('new categories',len(new_categories)) #logging call
          #print('categories after append',len(categories[i]))
     
      return self 
    
    def transform(self, X, y=None):
      for i in range(len(self.cat_cols)):
        #Temporary measure to handle previously unseen values
        #replace unseen values with NaN
        X.loc[X[~X[(str(self.cat_cols[i]))].isin(self.categories[i])].index,(str(self.cat_cols[i]))]=np.NaN

        #replace seen values with encoding
        X.loc[:,(str(self.cat_cols[i]))].replace(dict(zip(self.categories[i], self.RSE[i])),inplace=True)
      return X    

    def inverse_transform(self,X):
      for i in range(len(self.cat_cols)):
        X.loc[:,(str(self.cat_cols[i]))].replace(dict(zip(self.RSE[i], self.categories[i])),inplace=True)
      return X

In [7]:
#custom transformer for incrementally scaling to standard scale using pooled mean and variance
class CustomStandardScaler(BaseEstimator, TransformerMixin):
  def __init__(self,mean=None,var=None,n_samples_seen=None,scale=None):
    self.mean=mean 
    self.var=var
    self.n_samples_seen=n_samples_seen
    self.scale=scale

  def compute_sample_mean(self,X):
    return np.mean(X,axis=0)

  def compute_sample_var(self,X):
    return np.var(X,axis=0)

  def compute_sample_size(self,X):
    #assuming X is imputed, if there are null values, throw error aksing that X be imputed first
    return len(X)

  def compute_pooled_mean(self,X):
    #compute the sample mean and size
    sample_mean=self.compute_sample_mean(X)
    sample_count=self.compute_sample_size(X) 
    #compute pool mean
    pool_mean=(self.mean*self.n_samples_seen + sample_mean*sample_count)/(self.n_samples_seen + sample_count)

    return pool_mean

  def compute_pooled_var(self,X):
    #compute the sample var and size
    sample_var=self.compute_sample_var(X)
    sample_count=self.compute_sample_size(X) 
    #compute pool variance
    pool_var=(self.var*(self.n_samples_seen - 1) + sample_var*(sample_count - 1))/(self.n_samples_seen + sample_count - 2)

    return pool_var

  def fit(self,X):
    if self.mean is None:
      self.mean=self.compute_sample_mean(X)
    else: 
      self.mean=self.compute_pooled_mean(X)
    
    if self.var is None:
      self.var=self.compute_sample_var(X)
    else: 
      self.var=self.compute_pooled_var(X)

    if self.n_samples_seen is None:
      self.n_samples_seen=self.compute_sample_size(X) 
    else: 
      self.n_samples_seen+=self.compute_sample_size(X)
    return self

  def transform(self,X):
    return (X-self.mean)/np.sqrt(self.var)

  def inverse_transform(self,X):
    return X*np.sqrt(self.var) + self.mean



### Build preprocessing pipeline

In [10]:
#Build preprocessing pipeline
build_feature_pipe=make_pipeline(
    droprows_Transformer(),
    buildfeatures_Transformer(fe_cols)
    )

numerical_preprocess=make_pipeline(
    SimpleImputer(strategy='median'),
    CustomStandardScaler()
)
preprocess_pipe=make_column_transformer(
    (dropfeatures_Transformer(columns=drop_cols,inplace=True),drop_cols),
    (RSE_Transformer(cat_cols,None,None),cat_cols),
    (numerical_preprocess,num_cols),
    remainder='passthrough'
)
all_preprocess=make_pipeline(
    preprocess_pipe
)

### Apply preprocessing pipeline iteratively

In [8]:
fe_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/feature_engineering_columns.csv',header=None,squeeze=True)
drop_cols=read_csv_to_list('https://github.com/sharsulkar/H1B_LCA_outcome_prediction/raw/main/data/processed/drop_columns.csv',header=None,squeeze=True)
cat_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/categorical_columns.csv',header=None,squeeze=True)
num_cols=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/numeric_columns.csv',header=None,squeeze=True)

In [9]:
#instantiate model
model=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='elasticnet', power_t=0.5,
              random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [8]:
#Import data into separate dataframes 
'''
required_features=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',header=None,squeeze=True)

data1_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q1.xlsx',usecols=required_features)
data1_dfcopy=data1_df.copy()

data2_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q2.xlsx',usecols=required_features)
data2_dfcopy=data2_df.copy()

data3_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q3.xlsx',usecols=required_features)
data3_dfcopy=data3_df.copy()

data4_df=pd.read_excel('/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q4.xlsx',usecols=required_features)
data4_dfcopy=data4_df.copy()
'''

In [52]:
#do not run - this is to reset the dataframes if needed
#data1_df=data1_dfcopy.copy()
#data2_df=data2_dfcopy.copy()
#data3_df=data3_dfcopy.copy()
#data4_df=data4_dfcopy.copy()

### Train model iteratively

In [12]:
file_path=['/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q1.xlsx',
           '/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q2.xlsx',
           '/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q3.xlsx',
           '/content/drive/MyDrive/Datasets/H1B_LCA_prediction/LCA_Disclosure_Data_FY2020_Q4.xlsx']

required_features=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features.csv',header=None,squeeze=True)

for path in file_path:
  start = time.time()

  #load data file into dataframe
  data_df=pd.read_excel(path,usecols=required_features)

  #build features + drop rows where CASE_STATUs not in ['Certified','Denied']
  fe_df=build_feature_pipe.fit_transform(data_df)

  #separate the target variable and encode
  y=fe_df.pop('CASE_STATUS')
  y.replace(['Certified','Denied'],[0,1],inplace=True)

  #apply remaining preprocess pipeline to the semi processed dataframe
  X=all_preprocess.fit_transform(fe_df)

  #save the build_feature_pipe and preprocess pipelines
  dump(all_preprocess,open('/content/drive/MyDrive/saved_models/H1B_LCA_prediction/pipeline_batch_train.pkl','wb'))
  dump(build_feature_pipe,open('/content/drive/MyDrive/saved_models/H1B_LCA_prediction/build_feature_pipe_batch_train.pkl','wb'))

  #fit model
  model.fit(X,y)

  #save the model
  dump(model,open('/content/drive/MyDrive/saved_models/H1B_LCA_prediction/final_batch_train.pkl','wb'))

  #check training metrics just to make sure model has fitted
  y_pred=model.predict(X)
  print(f1_score(y_true=y,y_pred=y_pred,average=None))
  print((time.time() - start))

[0.99762084 0.49848638]
109.69897174835205
[0.99846156 0.77894737]
125.13341045379639
[0.99977651 0.97398008]
154.79350399971008
[0.99966896 0.91618829]
90.35153293609619
