<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/02_sh_make_observations_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [20]:
def read_csv_to_list(filepath, header=None, squeeze=True):
    """
    Read a CSV file into a list.

    Args:
        filepath (str or file-like): CSV path, URL or buffer accepted by ``pandas.read_csv``.
        header (int, list of int, optional): Row number(s) to use as the column names, and the start of the data. Defaults to None (no header row).
        squeeze (bool, optional): If the parsed data only contains one column then collapse it to a Series. Defaults to True.

    Returns:
        list: the values of the column when the data was collapsed to a Series;
        otherwise the column labels of the parsed DataFrame (this is what
        ``list(DataFrame)`` yields).
    """
    # Honour the caller's `header` instead of always passing None.
    parsed = pd.read_csv(filepath, header=header)
    # The `squeeze` keyword of read_csv no longer exists in pandas 2.x, so the
    # single-column collapse is done explicitly on the column axis.
    if squeeze:
        parsed = parsed.squeeze("columns")
    return list(parsed)

def modify_observations(df, index, columns, values, modify_action='update_values'):
    """
    Add a row to, or update cells of, an observations DataFrame in place.

    Args:
        df (pandas DataFrame): table to modify; it is mutated and also returned.
        index (label or list of labels): row label(s) to write to.
        columns (label or list of labels): column label(s) to write to. Ignored when
            modify_action is 'add_row'.
        values (scalar or list): value(s) assigned through ``df.loc``.
        modify_action (str, optional): 'add_row' assigns ``values`` to the whole row
            ``index``; 'update_values' assigns ``values`` to ``df.loc[index, columns]``.
            Defaults to 'update_values'.

    Returns:
        pandas DataFrame: the same ``df`` object after modification.

    Raises:
        ValueError: if ``modify_action`` is not one of the supported actions.
    """
    # TODO: validate that index/columns/values have compatible shapes and that
    # the requested columns exist in df.
    if modify_action == 'add_row':
        df.loc[index] = values
    elif modify_action == 'update_values':
        df.loc[index, columns] = values
    else:
        # A misspelled action used to be a silent no-op; fail loudly instead.
        raise ValueError(
            "modify_action must be 'add_row' or 'update_values', got {!r}".format(modify_action)
        )

    return df

def missing_statistics(df, column):
    """Return the percentage of rows in ``df`` whose ``column`` value is missing."""
    missing_rows = df[column].isna().sum()
    return missing_rows * 100 / len(df)

def cardinality_statistics(df, column):
    """
    Return the percentage of rows that do NOT introduce a new distinct value.

    The result is (rows - distinct values) * 100 / rows, so 0 means every row is
    unique and values close to 100 mean very few distinct values. Missing values
    count as one distinct value.
    """
    total_rows = len(df)
    distinct_values = df[column].nunique(dropna=False)
    return (total_rows - distinct_values) * 100 / total_rows

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class DropRowsTransformer(BaseEstimator, TransformerMixin):
    """
    A class to drop rows from a DataFrame.

    Args:
        row_index (pandas index object) : Index labels of the rows that should be dropped from the DataFrame.
        inplace : binary (default=True)
            Whether the input DataFrame is modified in place or a modified copy is returned
        reset_index : binary (default=True)
            Whether reindexing should be performed after drop action
    """

    def __init__(self, row_index, inplace=True, reset_index=True):
        """
        Constructs all the necessary attributes for the DropRowsTransformer object.

        Args:
            row_index : pandas index object
                Index labels of the rows that should be dropped from the DataFrame.
            inplace : binary (default=True)
                Whether the action should be performed inplace or not
            reset_index : binary (default=True)
                Whether reindexing should be performed after drop action
        """
        # Store the constructor arguments unchanged; previously inplace and
        # reset_index were accepted but always overwritten with True.
        self.row_index = row_index
        self.inplace = inplace
        self.reset_index = reset_index

    def fit(self, X, y=None):
        """
        Fit the class on input dataframe (nothing is learned).

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """
        Drop the configured rows from the input dataframe.

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            X : Transformed dataframe. It is the same object as the input when
                ``inplace`` is True, otherwise a new DataFrame.
        """
        if self.inplace:
            X.drop(index=self.row_index, inplace=True)
        else:
            # drop() returns the result only when not in place; keep it.
            X = X.drop(index=self.row_index)

        if self.reset_index:
            # reset_index() without drop=True keeps the previous index as a
            # regular column, exactly as before.
            if self.inplace:
                X.reset_index(inplace=True)
            else:
                X = X.reset_index()
        return X

In [6]:
class BuildFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    A class to build new features.

    Args:
        input_columns (array or list) : The columns that will be used as input for building new features.

    Returns:
        DataFrame : Transformed dataframe with new features added in as columns
    """

    # (pay unit, divisor) pairs used to convert a wage to an hourly figure.
    # Wages with any other unit are left unchanged.
    _HOURLY_DIVISORS = (('Year', 2067), ('Month', 172))

    def __init__(self, input_columns):
        """
        Constructs all the necessary attributes for the BuildFeaturesTransformer object.

        Args:
            input_columns (array or list) : The columns that will be used as input for building new features.
                Stored on the instance; transform() does not read it.
        """
        self.input_columns = input_columns

    def date_diff(self, date1, date2):
        """
        Returns the difference between two input dates as timedelta.

        Args:
            date1 (datetime): A date
            date2 (datetime): Another date

        Returns:
            date_difference (timedelta): date1 minus date2
        """
        return date1 - date2

    def is_usa(self, country):
        """
        Checks whether country is 'UNITED STATES OF AMERICA' or not and returns a binary flag

        Args:
            country (str): country

        Returns:
            USA_YN (str): 'Y' for an exact match, otherwise 'N'
        """
        return 'Y' if country == 'UNITED STATES OF AMERICA' else 'N'

    def _add_hourly_column(self, X, amount_column, unit_column, hourly_column):
        """
        Add ``hourly_column`` to X: a copy of ``amount_column`` with yearly and
        monthly amounts divided by the divisors in ``_HOURLY_DIVISORS``.
        """
        X[hourly_column] = X[amount_column]
        for unit, divisor in self._HOURLY_DIVISORS:
            rows = X[unit_column] == unit
            # Label-based boolean selection: correct for any index, not only a
            # freshly reset 0..n-1 index.
            X.loc[rows, hourly_column] = X.loc[rows, amount_column] / divisor

    def fit(self, X, y=None):
        """
        Fit the class on input dataframe (nothing is learned).

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """
        Apply transforms on the input dataframe to build new features.

        The new columns are added to X itself (the input is modified in place).
        Columns read: DECISION_DATE, RECEIVED_DATE, END_DATE, BEGIN_DATE, SOC_CODE,
        EMPLOYER_COUNTRY, EMPLOYER_POSTAL_CODE, WORKSITE_POSTAL_CODE, PW_OTHER_SOURCE,
        PW_OES_YEAR, PW_OTHER_YEAR, WAGE_RATE_OF_PAY_FROM, WAGE_UNIT_OF_PAY,
        PREVAILING_WAGE, PW_UNIT_OF_PAY.

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            X : Transformed dataframe with new features added in as columns
        """
        # Processing_Days and Validity_days
        X['PROCESSING_DAYS'] = self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days
        X['VALIDITY_DAYS'] = self.date_diff(X.END_DATE, X.BEGIN_DATE).dt.days

        # SOC_Codes: "<CD2>-<CD4>.<ONET>"; split once and reuse the pieces.
        soc_major_rest = X.SOC_CODE.str.split(pat='-', n=1, expand=True)
        soc_minor_onet = soc_major_rest[1].str.split(pat='.', n=1, expand=True)
        X['SOC_CD2'] = soc_major_rest[0]
        X['SOC_CD4'] = soc_minor_onet[0]
        X['SOC_CD_ONET'] = soc_minor_onet[1]

        # USA_YN
        X['USA_YN'] = X.EMPLOYER_COUNTRY.apply(self.is_usa)

        # Employer_Worksite_YN: 'Y' unless the two postal codes differ.
        X['EMPLOYER_WORKSITE_YN'] = 'Y'
        X.loc[X.EMPLOYER_POSTAL_CODE.ne(X.WORKSITE_POSTAL_CODE), 'EMPLOYER_WORKSITE_YN'] = 'N'

        # OES_YN: 'N' wherever another prevailing-wage source is given.
        X['OES_YN'] = 'Y'
        X.loc[X.PW_OTHER_SOURCE.notna(), 'OES_YN'] = 'N'

        # SURVEY_YEAR: year of the first date in PW_OES_YEAR, overwritten by
        # PW_OTHER_YEAR (where present) for the rows with OES_YN == 'N'.
        X['SURVEY_YEAR'] = pd.to_datetime(X.PW_OES_YEAR.str.split(pat='-', n=1, expand=True)[0]).dt.to_period('Y')
        # DataFrame.update matches a Series to a column by its name.
        pw_other_year = X.loc[X.OES_YN == 'N', 'PW_OTHER_YEAR'].rename('SURVEY_YEAR')
        X.update(pw_other_year)

        # WAGE_ABOVE_PREVAILING_HR: offered hourly wage minus prevailing hourly wage.
        self._add_hourly_column(X, 'WAGE_RATE_OF_PAY_FROM', 'WAGE_UNIT_OF_PAY', 'WAGE_PER_HR')
        self._add_hourly_column(X, 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY', 'PW_WAGE_PER_HR')
        X['WAGE_ABOVE_PW_HR'] = X.WAGE_PER_HR - X.PW_WAGE_PER_HR

        return X


In [130]:
# Load the FY2020 Q2 LCA disclosure workbook straight from dol.gov.
input_df = pd.read_excel(
    'https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/LCA_Disclosure_Data_FY2020_Q2.xlsx'
)

In [131]:
# Inputs for the feature-building pipeline.
# Columns listed for feature engineering, read from the project repository.
fe_cols = read_csv_to_list(
    'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/feature_engineering_columns.csv',
    header=None,
    squeeze=True,
)
# Rows whose CASE_STATUS is neither 'Certified' nor 'Denied' are to be dropped.
drop_row_index = input_df.loc[~input_df.CASE_STATUS.isin(['Certified', 'Denied'])].index


In [132]:
from sklearn.pipeline import Pipeline, make_pipeline
# Preprocessing pipeline: first drop the unwanted rows, then derive the new features.
build_feature_pipe = make_pipeline(
    DropRowsTransformer(row_index=drop_row_index, inplace=True, reset_index=True),
    BuildFeaturesTransformer(input_columns=fe_cols),
)

In [133]:
# Fit and apply the pipeline in one step. Both transformers are stateless
# (fit() just returns self), but calling transform() on a pipeline that was
# never fitted is flagged as an error by recent scikit-learn releases.
# NOTE: the transformers work in place, so input_df itself is modified too.
transformed_df = build_feature_pipe.fit_transform(input_df)

In [134]:
# Empty observations table: one row per column of the transformed data and one
# column per fact recorded about that feature.
observations_df = pd.DataFrame(
    index=transformed_df.columns,
    columns=[
        'Dtype',
        'percent_missing',
        'cardinality',
        'preprocess_action',
        'preprocess_comment',
        'new_feature_name',
        'new_feature_logic',
        'categorical_class',
        'embedding',
    ],
)

In [135]:
# Record dtype, percentage of missing values and the cardinality statistic for
# every column of the transformed data.
for column in transformed_df.columns:
    observations_df = modify_observations(
        df=observations_df,
        index=column,
        columns=['Dtype', 'percent_missing', 'cardinality'],
        values=[
            transformed_df[column].dtype,
            missing_statistics(transformed_df, column),
            cardinality_statistics(transformed_df, column),
        ],
        modify_action='update_values',
    )


In [136]:
# Mark for dropping every feature whose percentage of missing values is at or
# above the threshold.
missing_threshold = 40.0
for idx in observations_df.loc[observations_df.percent_missing >= missing_threshold].index:
    observations_df = modify_observations(
        df=observations_df,
        index=idx,
        columns=['preprocess_action', 'preprocess_comment'],
        values=['Drop column', 'missing values>={}% of total'.format(missing_threshold)],
        modify_action='update_values',
    )

In [137]:
# Mark high-cardinality features for dropping.
# cardinality_statistics() returns (rows - distinct values) * 100 / rows, so a
# LOW score means MANY distinct values; features scoring below the threshold
# are the high-cardinality ones.
cardinality_threshold = 80.0
# Compare against the named threshold (previously a duplicated literal 80.0)
# so that changing cardinality_threshold changes both the filter and the comment text.
for idx in observations_df[observations_df.cardinality < cardinality_threshold].index:
    observations_df = modify_observations(
        df=observations_df,
        index=idx,
        columns=['preprocess_action', 'preprocess_comment'],
        values=['Drop column', 'High Cardinality, threshold ' + str(cardinality_threshold) + '% of total'],
        modify_action='update_values',
    )

In [138]:
# CASE_STATUS is the prediction target: it is popped out of the feature set.
observations_df = modify_observations(
    df=observations_df,
    index='CASE_STATUS',
    columns=['preprocess_action', 'preprocess_comment'],
    values=['Pop column into a separate list', 'Target feature'],
    modify_action='update_values',
)


In [139]:
# Feature engineering on the date columns: the raw dates are dropped in favour
# of derived durations.
# RECEIVED_DATE and DECISION_DATE -> PROCESSING_DAYS
observations_df = modify_observations(
    df=observations_df,
    index=['RECEIVED_DATE', 'DECISION_DATE'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=['Drop column', 'Feature engineering', 'PROCESSING_DAYS', 'days(DECISION_DATE-RECEIVED_DATE)'],
    modify_action='update_values',
)
# BEGIN_DATE and END_DATE -> VALIDITY_DAYS
observations_df = modify_observations(
    df=observations_df,
    index=['BEGIN_DATE', 'END_DATE'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=['Drop column', 'Feature engineering', 'VALIDITY_DAYS', 'days(END_DATE-BEGIN_DATE)'],
    modify_action='update_values',
)


In [140]:
# Feature engineering: SOC_CODE is dropped and replaced by its split parts.
# NOTE(review): the names recorded here ('SOC_CODE_2,SOC_CODE_4') differ from
# the columns BuildFeaturesTransformer creates (SOC_CD2, SOC_CD4, SOC_CD_ONET);
# confirm which spelling downstream consumers expect.
observations_df = modify_observations(
    df=observations_df,
    index='SOC_CODE',
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=['Drop column', 'Feature engineering', 'SOC_CODE_2,SOC_CODE_4', "SOC_CODE.split('-')"],
    modify_action='update_values',
)

In [158]:
# Feature engineering: EMPLOYER_COUNTRY is reduced to a USA / not-USA flag.
observations_df = modify_observations(
    df=observations_df,
    index='EMPLOYER_COUNTRY',
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=['Drop column', 'Feature engineering', 'USA_YN', 'IF EMPLOYER_COUNTRY==USA THEN Y ELSE N END'],
    modify_action='update_values',
)

In [142]:
# Columns judged not useful for modelling and marked for dropping: employer
# contact details (EMPLOYER_NAME and EMPLOYER_POSTAL_CODE are kept), agent /
# attorney details, worksite address parts and a few remaining fields.
not_useful_cols = [
    # employer
    'TRADE_NAME_DBA', 'EMPLOYER_ADDRESS1', 'EMPLOYER_ADDRESS2', 'EMPLOYER_CITY',
    'EMPLOYER_STATE', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE', 'EMPLOYER_PHONE_EXT',
    # employer point of contact
    'EMPLOYER_POC_LAST_NAME', 'EMPLOYER_POC_FIRST_NAME', 'EMPLOYER_POC_MIDDLE_NAME',
    'EMPLOYER_POC_JOB_TITLE', 'EMPLOYER_POC_ADDRESS1', 'EMPLOYER_POC_ADDRESS2',
    'EMPLOYER_POC_CITY', 'EMPLOYER_POC_STATE', 'EMPLOYER_POC_POSTAL_CODE',
    'EMPLOYER_POC_COUNTRY', 'EMPLOYER_POC_PROVINCE', 'EMPLOYER_POC_PHONE',
    'EMPLOYER_POC_PHONE_EXT', 'EMPLOYER_POC_EMAIL',
    # agent / attorney
    'AGENT_ATTORNEY_LAST_NAME', 'AGENT_ATTORNEY_FIRST_NAME', 'AGENT_ATTORNEY_MIDDLE_NAME',
    'AGENT_ATTORNEY_ADDRESS1', 'AGENT_ATTORNEY_ADDRESS2', 'AGENT_ATTORNEY_CITY',
    'AGENT_ATTORNEY_STATE', 'AGENT_ATTORNEY_POSTAL_CODE', 'AGENT_ATTORNEY_COUNTRY',
    'AGENT_ATTORNEY_PROVINCE', 'AGENT_ATTORNEY_PHONE', 'AGENT_ATTORNEY_PHONE_EXT',
    'AGENT_ATTORNEY_EMAIL_ADDRESS', 'LAWFIRM_NAME_BUSINESS_NAME', 'STATE_OF_HIGHEST_COURT',
    'NAME_OF_HIGHEST_STATE_COURT', 'SECONDARY_ENTITY_BUSINESS_NAME',
    # worksite
    'WORKSITE_ADDRESS1', 'WORKSITE_ADDRESS2', 'WORKSITE_CITY', 'WORKSITE_COUNTY',
    'WORKSITE_STATE',
    # other
    'WAGE_UNIT_OF_PAY', 'PW_UNIT_OF_PAY', 'APPENDIX_A_ATTACHED', 'STATUTORY_BASIS',
]
observations_df = modify_observations(
    df=observations_df,
    index=not_useful_cols,
    columns=['preprocess_action', 'preprocess_comment'],
    values=['Drop column', 'Not Useful'],
    modify_action='update_values',
)

In [143]:
# Feature engineering: the two postal codes collapse into a flag telling
# whether the worksite postal code equals the employer postal code.
observations_df = modify_observations(
    df=observations_df,
    index=['WORKSITE_POSTAL_CODE', 'EMPLOYER_POSTAL_CODE'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=[
        'Drop column',
        'Feature engineering',
        'EMPLOYER_WORKSITE_YN',
        'IF EMPLOYER_POSTAL_CODE==WORKSITE_POSTAL_CODE THEN Y ELSE N END',
    ],
    modify_action='update_values',
)


In [144]:

# Feature engineering: wages are converted to hourly figures (Hour: unchanged,
# Month: WAGE/172, Year: WAGE/2067) and then compared:
# WAGE_ABOVE_PREVAILING_HR = WAGE_RATE_OF_PAY_FROM_HR - PREVAILING_WAGE_HR.
# Prevailing wage and its unit.
observations_df = modify_observations(
    df=observations_df,
    index=['PREVAILING_WAGE', 'PW_UNIT_OF_PAY'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=[
        'Drop column',
        'Feature engineering',
        'PREVAILING_WAGE_HR;WAGE_ABOVE_PREVAILING_HR',
        'if PW_UNIT_OF_PAY=Hour ignore, if Month then WAGE/172, if Year then WAGE/2067;WAGE_RATE_OF_PAY_FROM_HR-PREVAILING_WAGE_HR',
    ],
    modify_action='update_values',
)

# Offered wage and its unit.
observations_df = modify_observations(
    df=observations_df,
    index=['WAGE_RATE_OF_PAY_FROM', 'WAGE_UNIT_OF_PAY'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=[
        'Drop column',
        'Feature engineering',
        'WAGE_RATE_OF_PAY_FROM_HR;WAGE_ABOVE_PREVAILING_HR',
        'if WAGE_UNIT_OF_PAY=Hour ignore, if Month then WAGE/172, if Year then WAGE/2067;WAGE_RATE_OF_PAY_FROM_HR-PREVAILING_WAGE_HR',
    ],
    modify_action='update_values',
)


In [145]:
# Feature engineering: OES_YN is 'N' when PW_OTHER_SOURCE is filled in, else 'Y'.
observations_df = modify_observations(
    df=observations_df,
    index='PW_OTHER_SOURCE',
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=['Drop column', 'Feature engineering', 'OES_YN ', 'if PW_OTHER_SOURCE is not NaN then N else Y'],
    modify_action='update_values',
)
# Feature engineering: SURVEY_YEAR comes from the first date of PW_OES_YEAR
# when OES_YN is 'Y', otherwise from PW_OTHER_YEAR.
observations_df = modify_observations(
    df=observations_df,
    index=['PW_OES_YEAR', 'PW_OTHER_YEAR'],
    columns=['preprocess_action', 'preprocess_comment', 'new_feature_name', 'new_feature_logic'],
    values=[
        'Drop column',
        'Feature engineering',
        'SURVEY_YEAR ',
        'if OES_YN ==Y then extract year from first date of PW_OES_YEAR else PW_OTHER_YEAR',
    ],
    modify_action='update_values',
)


In [146]:
# Categorical columns and the embedding planned for them.
# 'EMPLOYER_NAME' used to be listed twice; each label is now listed once.
cat_cols = [
    'CASE_STATUS',
    'VISA_CLASS',
    'SOC_CODE',
    'SOC_TITLE',
    'EMPLOYER_NAME',
    'EMPLOYER_POSTAL_CODE',
    'WORKSITE_POSTAL_CODE',
    'PW_OTHER_SOURCE',
    'PUBLIC_DISCLOSURE',
    'NAICS_CODE',
]
observations_df = modify_observations(
    df=observations_df,
    index=cat_cols,
    columns=['categorical_class', 'embedding'],
    values=['Categorical', 'Standardized random'],
    modify_action='update_values',
)

In [147]:
# Ordinal columns and the embedding planned for them.
ord_cols = ['PW_WAGE_LEVEL', 'PW_OES_YEAR']
observations_df = modify_observations(
    df=observations_df,
    index=ord_cols,
    columns=['categorical_class', 'embedding'],
    values=['Ordinal', 'Ordered standardized random'],
    modify_action='update_values',
)


In [148]:
# Binary columns and the embedding planned for them.
binary_cols = [
    'FULL_TIME_POSITION',
    'AGENT_REPRESENTING_EMPLOYER',
    'SECONDARY_ENTITY',
    'AGREE_TO_LC_STATEMENT',
    'H-1B_DEPENDENT',
    'WILLFUL_VIOLATOR',
    'EMPLOYER_COUNTRY',
]
observations_df = modify_observations(
    df=observations_df,
    index=binary_cols,
    columns=['categorical_class', 'embedding'],
    values=['Binary', 'Standardized random'],
    modify_action='update_values',
)


In [149]:
# Numerical columns and the scaling planned for them.
numeric_cols = [
    'TOTAL_WORKER_POSITIONS',
    'NEW_EMPLOYMENT',
    'CONTINUED_EMPLOYMENT',
    'CHANGE_PREVIOUS_EMPLOYMENT',
    'NEW_CONCURRENT_EMPLOYMENT',
    'CHANGE_EMPLOYER',
    'AMENDED_PETITION',
    'WORKSITE_WORKERS',
    'TOTAL_WORKSITE_LOCATIONS',
]
observations_df = modify_observations(
    df=observations_df,
    index=numeric_cols,
    columns=['categorical_class', 'embedding'],
    values=['Numerical', 'Standard scaling'],
    modify_action='update_values',
)


In [150]:
# Engineered features that are numerical.
observations_df = modify_observations(
    df=observations_df,
    index=['PROCESSING_DAYS', 'VALIDITY_DAYS', 'WAGE_ABOVE_PW_HR'],
    columns=['preprocess_action', 'preprocess_comment', 'categorical_class', 'embedding'],
    values=['New feature', 'Feature engineering', 'Numerical', 'Standard scaling'],
    modify_action='update_values',
)

In [151]:
# Engineered features that are categorical.
observations_df = modify_observations(
    df=observations_df,
    index=['SOC_CD2', 'SOC_CD4', 'SOC_CD_ONET'],
    columns=['preprocess_action', 'preprocess_comment', 'categorical_class', 'embedding'],
    values=['New feature', 'Feature engineering', 'Categorical', 'Standardized random'],
    modify_action='update_values',
)

In [152]:
# Engineered features that are binary flags.
observations_df = modify_observations(
    df=observations_df,
    index=['USA_YN', 'EMPLOYER_WORKSITE_YN', 'OES_YN'],
    columns=['preprocess_action', 'preprocess_comment', 'categorical_class', 'embedding'],
    values=['New feature', 'Feature engineering', 'Binary', 'Standardized random'],
    modify_action='update_values',
)

In [153]:
# Engineered features that are ordinal.
observations_df = modify_observations(
    df=observations_df,
    index=['SURVEY_YEAR'],
    columns=['preprocess_action', 'preprocess_comment', 'categorical_class', 'embedding'],
    values=['New feature', 'Feature engineering', 'Ordinal', 'Ordered standardized random'],
    modify_action='update_values',
)

In [154]:
# Raw columns that go into the model without any preprocessing.
observations_df = modify_observations(
    df=observations_df,
    index=[
        'VISA_CLASS', 'SOC_TITLE', 'FULL_TIME_POSITION',
        'TOTAL_WORKER_POSITIONS', 'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT',
        'CHANGE_PREVIOUS_EMPLOYMENT', 'NEW_CONCURRENT_EMPLOYMENT',
        'CHANGE_EMPLOYER', 'AMENDED_PETITION', 'EMPLOYER_NAME', 'NAICS_CODE',
        'AGENT_REPRESENTING_EMPLOYER', 'WORKSITE_WORKERS', 'SECONDARY_ENTITY',
        'PW_WAGE_LEVEL', 'TOTAL_WORKSITE_LOCATIONS', 'AGREE_TO_LC_STATEMENT',
        'H-1B_DEPENDENT', 'WILLFUL_VIOLATOR', 'PUBLIC_DISCLOSURE',
    ],
    columns=['preprocess_action', 'preprocess_comment'],
    values=['Use feature as is', 'Use feature as is'],
    modify_action='update_values',
)

In [155]:
# Display the completed observations table as the cell output.
observations_df

Unnamed: 0,Dtype,percent_missing,cardinality,preprocess_action,preprocess_comment,new_feature_name,new_feature_logic,categorical_class,embedding
index,int64,0,0,Drop column,"High Cardinality, threshold 80.0% of total",,,,
CASE_NUMBER,object,0,0,Drop column,"High Cardinality, threshold 80.0% of total",,,,
CASE_STATUS,object,0,99.9987,Pop column into a separate list,Target feature,,,Categorical,Standardized random
RECEIVED_DATE,datetime64[ns],0,99.9352,Drop column,Feature engineering,PROCESSING_DAYS,days(DECISION_DATE-RECEIVED_DATE),,
DECISION_DATE,datetime64[ns],0,99.9563,Drop column,Feature engineering,PROCESSING_DAYS,days(DECISION_DATE-RECEIVED_DATE),,
...,...,...,...,...,...,...,...,...,...
OES_YN,object,0,99.9987,New feature,Feature engineering,,,Binary,Standardized random
SURVEY_YEAR,object,0.199739,99.9934,New feature,Feature engineering,,,Ordinal,Ordered standardized random
WAGE_PER_HR,float64,0,83.585,,,,,,
PW_WAGE_PER_HR,float64,0,91.9985,,,,,,


In [None]:
# Persist the observations table to Google Drive; '$' is used as the field
# separator instead of a comma.
observations_df.to_csv(
    '/content/drive/MyDrive/final_observations.csv',
    sep='$',
)

In [172]:
# Raw columns that have to be read from the source data: rows commented as
# feature-engineering input, target or used as-is, leaving out the rows that
# describe the engineered ('New feature') columns themselves.
required_features = observations_df[
    observations_df.preprocess_comment.isin(['Feature engineering', 'Target feature', 'Use feature as is'])
    & ~observations_df.preprocess_action.isin(['New feature'])
].index.tolist()

In [174]:
# Required raw columns whose preprocess_action is something other than unset
# (NaN), 'New feature' or 'Use feature as is'.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
set(required_features) - set(
    observations_df[
        observations_df.preprocess_action.isin([np.nan, 'New feature', 'Use feature as is'])
    ].index.values
)

{'BEGIN_DATE',
 'CASE_STATUS',
 'DECISION_DATE',
 'EMPLOYER_COUNTRY',
 'EMPLOYER_POSTAL_CODE',
 'END_DATE',
 'PREVAILING_WAGE',
 'PW_OES_YEAR',
 'PW_OTHER_SOURCE',
 'PW_OTHER_YEAR',
 'PW_UNIT_OF_PAY',
 'RECEIVED_DATE',
 'SOC_CODE',
 'WAGE_RATE_OF_PAY_FROM',
 'WAGE_UNIT_OF_PAY',
 'WORKSITE_POSTAL_CODE'}