<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/PERM/01_sh_PERM_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exploratory data analysis of PERM data for FY20
Sourced from https://www.dol.gov/agencies/eta/foreign-labor/performance  
Record layout metadata can be found at https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/PERM_Record_Layout_FY2020.pdf  

The expected outcomes of this EDA are:  
1. Identify important features most relevant to the classification problem  
2. Identify feature engineering opportunities  
3. Identify categorical features and decide encoding method  
4. Identify numeric features and decide encoding method  

In [2]:
import numpy as np
import pandas as pd
import time


In [7]:
def read_csv_to_list(filepath, header=None, squeeze=True):
    """
    Read a CSV file into a list.

    Args:
        filepath (str): CSV file path
        header (int, list of int, optional): Row number(s) to use as the column names, and the start of the data. Defaults to None.
        squeeze (bool, optional): If the parsed data only contains one column then collapse it to a Series before building the list. Defaults to True.

    Returns:
        list: the values of the column when the data was collapsed to a Series;
            otherwise the column labels of the parsed DataFrame (that is what
            ``list(DataFrame)`` yields).
    """
    # Forward ``header`` so the argument actually takes effect.
    parsed = pd.read_csv(filepath, header=header)
    # ``read_csv(squeeze=...)`` was deprecated in pandas 1.4 and removed in 2.0;
    # squeezing the column axis afterwards is the documented replacement.
    if squeeze:
        parsed = parsed.squeeze("columns")
    return list(parsed)

def modify_observations(df, index, columns, values, modify_action='update_values'):
    """
    Write values into the observations DataFrame and return it.

    The DataFrame is modified in place; the same object is returned so calls
    can be written as ``df = modify_observations(df, ...)``.

    Args:
        df (pandas DataFrame): DataFrame to modify.
        index: Row label (or list of row labels) to write to.
        columns: Column label(s) to write to. Only used by 'update_values'.
        values: Value(s) to assign.
        modify_action (str, optional): 'update_values' assigns ``values`` to
            ``df.loc[index, columns]``; 'add_row' assigns ``values`` to
            ``df.loc[index]``. Any other value leaves ``df`` untouched.

    Returns:
        pandas DataFrame: the same ``df`` object.
    """
    # Expected but not enforced: columns and values have the same size (a
    # scalar value for a single column) and the columns exist in df.
    if modify_action == 'update_values':
        df.loc[index, columns] = values
    elif modify_action == 'add_row':
        df.loc[index] = values
    return df

def missing_statistics(df, column):
    """Return the percentage of rows of ``df`` whose ``column`` value is missing."""
    total_rows = len(df)
    missing_rows = df[column].isna().sum()
    return missing_rows * 100 / total_rows

def cardinality_statistics(df, column):
    """
    Return the percentage of rows of ``df`` that are NOT distinct values of ``column``.

    Computed as 100 * (rows - distinct values) / rows, with a missing value
    counted as one distinct value. A lower result therefore means more
    distinct values (higher cardinality).
    """
    total_rows = len(df)
    distinct_values = df[column].nunique(dropna=False)
    return (total_rows - distinct_values) * 100 / total_rows

In [34]:
from sklearn.base import BaseEstimator, TransformerMixin

class DropRowsTransformer(BaseEstimator, TransformerMixin):
    """
    A class to drop rows from a DataFrame.

    Args:
        row_index (pandas index object) : A list of indexes that should be dropped from the DataFrame.
        inplace : binary (default=True)
            Whether the action should be performed inplace or not
        reset_index : binary (default=True)
            Whether reindexing should be performed after drop action
    """

    def __init__(self, row_index, inplace=True, reset_index=True):
        """
        Constructs all the necessary attributes for the DropRowsTransformer object.

        Args:
            row_index : pandas index object
                A list of indexes that should be dropped from the DataFrame.
            inplace : binary (default=True)
                Whether the action should be performed inplace or not
            reset_index : binary (default=True)
                Whether reindexing should be performed after drop action
        """
        # Store the arguments unmodified under their own names: scikit-learn's
        # get_params()/clone() read constructor arguments back from
        # same-named attributes.
        self.row_index = row_index
        self.inplace = inplace
        self.reset_index = reset_index

    def fit(self, X, y=None):
        """
        Fit the class on input dataframe (nothing is learned).

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """
        Drop the configured rows from the input dataframe

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            X : Transformed dataframe. When ``inplace`` is True this is the
                same object that was passed in (mutated); otherwise it is a
                new DataFrame and the input is left untouched.
        """
        if self.inplace:
            X.drop(index=self.row_index, inplace=True)
            if self.reset_index:
                X.reset_index(inplace=True, drop=True)
            return X

        # Non-inplace: drop()/reset_index() return new objects that must be
        # captured, otherwise the rows would silently not be dropped.
        X = X.drop(index=self.row_index)
        if self.reset_index:
            X = X.reset_index(drop=True)
        return X

In [123]:
class BuildFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    A class to build new features.

    The new features are added as columns to the DataFrame passed to
    ``transform``; that DataFrame is modified in place and returned.

    Args:
        input_columns (array or list) : The columns that will be used as input for building new features.
            Stored on the instance; ``transform`` does not read it.

    Returns:
        DataFrame : Transformed dataframe with new features added in as columns
    """

    # Hours per pay period used to convert a wage to an hourly wage.
    # Units that are not listed (e.g. 'Hour' or missing) are left unchanged.
    _HOURS_PER_PAY_UNIT = {'Year': 2067, 'Month': 172, 'Bi-Weekly': 80, 'Week': 40}

    # (new binary feature, source column) pairs: the feature is 'Y' when the
    # source column has a value and 'N' when it is missing. Order determines
    # the order in which the columns are appended.
    _RECRUITMENT_FLAG_SOURCES = (
        ('NEWSPAPER_1', 'FIRST_NEWSPAPER_NAME'),
        ('NEWSPAPER_2', 'SECOND_NEWSPAPER_AD_NAME'),
        ('JOB_FAIR', 'JOB_FAIR_FROM_DATE'),
        ('ON_CAMPUS_RECRUITING', 'ON_CAMPUS_RECRUITING_FROM_DATE'),
        ('EMPLOYER_WEBSITE', 'EMPLOYER_WEBSITE_FROM_DATE'),
        ('PRO_ORG_AD', 'PRO_ORG_AD_FROM_DATE'),
        ('JOB_SEARCH_WEBSITE', 'JOB_SEARCH_WEBSITE_FROM_DATE'),
        ('PVT_EMPLOYMENT_FIRM', 'PVT_EMPLOYMENT_FIRM_FROM_DATE'),
        ('EMPLOYEE_REF_PROG', 'EMPLOYEE_REF_PROG_FROM_DATE'),
        ('CAMPUS_PLACEMENT', 'CAMPUS_PLACEMENT_FROM_DATE'),
        ('LOCAL_ETHNIC_PAPER', 'LOCAL_ETHNIC_PAPER_FROM_DATE'),
        ('RADIO_TV_AD', 'RADIO_TV_AD_FROM_DATE'),
    )

    def __init__(self, input_columns):
        """
        Constructs all the necessary attributes for the BuildFeaturesTransformer object.

        Args:
            input_columns (array or list) : The columns that will be used as input for building new features.
        """
        self.input_columns = input_columns

    def date_diff(self, date1, date2):
        """
        Returns the difference between two input dates as timedelta.

        Args:
            date1 (datetime): A date
            date2 (datetime): Another date

        Returns:
            date_difference (timedelta): difference between date1 and date2
        """
        return date1 - date2

    def is_usa(self, country):
        """
        Checks whether country is 'UNITED STATES OF AMERICA' or not and returns a binary flag

        Args:
            country (str): country

        Returns:
            USA_YN (str): 'Y' if country is 'UNITED STATES OF AMERICA', else 'N'
        """
        return 'Y' if country == 'UNITED STATES OF AMERICA' else 'N'

    @staticmethod
    def _yes_no(flags):
        """Map a boolean Series to a Series of 'Y' (True) / 'N' (False)."""
        return flags.map({True: 'Y', False: 'N'})

    def _hourly_wage(self, amount, pay_unit):
        """
        Convert wage amounts to hourly wages based on their unit of pay.

        Args:
            amount (pandas Series): wage amounts
            pay_unit (pandas Series): unit of pay for each amount

        Returns:
            pandas Series: amount divided by the hours in its pay period for
                the units in _HOURS_PER_PAY_UNIT; unchanged for any other unit.
        """
        hourly = amount
        for unit, hours in self._HOURS_PER_PAY_UNIT.items():
            # where() keeps the current value unless the row uses this unit.
            hourly = hourly.where(pay_unit != unit, amount / hours)
        return hourly

    def fit(self, X, y=None):
        """
        Fit the class on input dataframe (nothing is learned).

        Args:
            X (pandas DataFrame): input dataframe
            y : place holder, defaulted to None

        Returns:
            self
        """
        return self

    def transform(self, X, y=None):
        """
        Apply transforms on the input dataframe to build new features

        Args:
            X (pandas DataFrame): input dataframe. It is modified in place.
            y : place holder, defaulted to None

        Returns:
            X : Transformed dataframe with new features added in as columns
        """
        # Processing_Days
        X['PROCESSING_DAYS'] = self.date_diff(X.DECISION_DATE, X.RECEIVED_DATE).dt.days

        # Employer_Worksite_YN: 'Y' only when both postal codes are present and equal
        X['EMPLOYER_WORKSITE_YN'] = self._yes_no(X.EMPLOYER_POSTAL_CODE.eq(X.WORKSITE_POSTAL_CODE))

        # WAGE_ABOVE_PW_HR: offered wage minus prevailing wage, both per hour.
        # The same conversion table is used for both wages; selection is done
        # by boolean mask rather than by feeding index labels to iloc, so it
        # does not depend on X having a default 0..n-1 index.
        X['WAGE_PER_HR'] = self._hourly_wage(X.WAGE_OFFER_FROM, X.WAGE_OFFER_UNIT_OF_PAY)
        X['PW_WAGE_PER_HR'] = self._hourly_wage(X.PW_WAGE, X.PW_UNIT_OF_PAY)
        X['WAGE_ABOVE_PW_HR'] = X.WAGE_PER_HR - X.PW_WAGE_PER_HR

        # advertisement columns: binary indicator per recruitment channel
        for feature_name, source_column in self._RECRUITMENT_FLAG_SOURCES:
            X[feature_name] = self._yes_no(X[source_column].notna())

        return X


In [35]:
#Load the FY2020 PERM disclosure file from the mounted Drive path
data_df=pd.read_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2020.xlsx')
#test_df=pd.read_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2021_Q1.xlsx')
#Keep an untouched copy so data_df can be restored without re-reading the Excel file
datadf_copy=data_df.copy()

In [124]:
#Restore data_df from the untouched copy (the pipeline below is configured with inplace=True)
data_df=datadf_copy.copy()

In [125]:
from sklearn.pipeline import Pipeline, make_pipeline
#Build preprocessing pipeline: first drop rows, then add the engineered feature columns
#NOTE(review): drop_row_index and fe_cols are not defined in the visible cells - confirm they are set before this cell runs
build_feature_pipe=make_pipeline(
    DropRowsTransformer(row_index=drop_row_index,inplace=True,reset_index=True),
    BuildFeaturesTransformer(fe_cols)
)

In [126]:
#transform is called without fit: both transformers' fit methods just return self
#Both steps modify the DataFrame they receive and return it, so data_df itself is changed here
#and transformed_df presumably refers to that same object - the later cells rely on data_df having the new columns
transformed_df=build_feature_pipe.transform(data_df)

In [127]:
#Preview the first rows of the transformed data
transformed_df.head()

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,REFILE,ORIG_FILE_DATE,PREVIOUS_SWA_CASE_NUMBER_STATE,SCHD_A_SHEEPHERDER,EMPLOYER_NAME,EMPLOYER_ADDRESS_1,EMPLOYER_ADDRESS_2,EMPLOYER_CITY,EMPLOYER_STATE_PROVINCE,EMPLOYER_COUNTRY,EMPLOYER_POSTAL_CODE,EMPLOYER_PHONE,EMPLOYER_PHONE_EXT,EMPLOYER_NUM_EMPLOYEES,EMPLOYER_YEAR_COMMENCED_BUSINESS,NAICS_CODE,FW_OWNERSHIP_INTEREST,EMP_CONTACT_NAME,EMP_CONTACT_ADDRESS_1,EMP_CONTACT_ADDRESS_2,EMP_CONTACT_CITY,EMP_CONTACT_STATE_PROVINCE,EMP_CONTACT_COUNTRY,EMP_CONTACT_POSTAL_CODE,EMP_CONTACT_PHONE,EMP_CONTACT_EMAIL,AGENT_ATTORNEY_NAME,AGENT_ATTORNEY_FIRM_NAME,AGENT_ATTORNEY_PHONE,AGENT_ATTORNEY_PHONE_EXT,AGENT_ATTORNEY_ADDRESS_1,AGENT_ATTORNEY_ADDRESS_2,AGENT_ATTORNEY_CITY,AGENT_ATTORNEY_STATE_PROVINCE,AGENT_ATTORNEY_COUNTRY,AGENT_ATTORNEY_POSTAL_CODE,...,FOREIGN_WORKER_EDUCATION_OTHER,FOREIGN_WORKER_INFO_MAJOR,FOREIGN_WORKER_YRS_ED_COMP,FOREIGN_WORKER_INST_OF_ED,FOREIGN_WORKER_ED_INST_ADD_1,FOREIGN_WORKER_ED_INST_ADD_2,FOREIGN_WORKER_ED_INST_CITY,FOREIGN_WORKER_ED_INST_STATE_P,FOREIGN_WORKER_ED_INST_COUNTRY,FOREIGN_WORKER_ED_INST_POST_CD,FOREIGN_WORKER_TRAINING_COMP,FOREIGN_WORKER_REQ_EXPERIENCE,FOREIGN_WORKER_ALT_ED_EXP,FOREIGN_WORKER_ALT_OCC_EXP,FOREIGN_WORKER_EXP_WITH_EMPL,FOREIGN_WORKER_EMPL_PAY_FOR_ED,FOREIGN_WORKER_CURR_EMPLOYED,EMPLOYER_COMPLETED_APPLICATION,PREPARER_NAME,PREPARER_TITLE,PREPARER_EMAIL,EMP_INFO_DECL_NAME,EMP_INFO_DECL_TITLE,PROCESSING_DAYS,EMPLOYER_WORKSITE_YN,WAGE_PER_HR,PW_WAGE_PER_HR,WAGE_ABOVE_PW_HR,NEWSPAPER_1,NEWSPAPER_2,JOB_FAIR,ON_CAMPUS_RECRUITING,EMPLOYER_WEBSITE,PRO_ORG_AD,JOB_SEARCH_WEBSITE,PVT_EMPLOYMENT_FIRM,EMPLOYEE_REF_PROG,CAMPUS_PLACEMENT,LOCAL_ETHNIC_PAPER,RADIO_TV_AD
0,A-09245-62715,Denied,2009-09-03,2020-04-30 14:50:29,N,NaT,,N,SATURN ENTERPRISES INC,2340 N WASHTENAW AVE,,CHICAGO,ILLINOIS,UNITED STATES OF AMERICA,60647,7732769550,,2.0,1998.0,23611.0,N,zygmunt f oparowski,2340 n washtenaw ave,,chicago,ILLINOIS,UNITED STATES OF AMERICA,60647,7732769550,saturn_enterprises@yahoo.com,Christopher E Kurczaba,Kurczaba Law Offices,7737740011,,6219 N Milwaukee Ave,,Chicago,ILLINOIS,UNITED STATES OF AMERICA,60646,...,,GENERAL,1996.0,LICEUM EKONOMICZNE,UL. JANA GOETZA,,BRZESKO,,POLAND,32-800,,Y,,,N,N,N,N,Christopher E Kurczaba,Attorney,kurczabalaw@sbcglobal.net,Zygmunt Oparowski,President,3892,Y,14.49,14.49,0.0,Y,Y,N,N,N,N,N,N,N,N,N,N
1,A-10070-89932,Certified,2019-10-22,2020-09-04 10:06:40,N,NaT,,N,SUNRAY ENTERPRISE INC.,3621 VININGS SLOPE SUITE#4310,,ATLANTA,GEORGIA,UNITED STATES OF AMERICA,30339,678-584-1312,224.0,45.0,2002.0,541512.0,N,Sunitha Shivaram,3621 Vinings Slope Suite#4310,,Atlanta,GEORGIA,UNITED STATES OF AMERICA,30339,678-584-1312 224,sunithas@sunraycorp.com,James E McLaughlin III,Murthy Law Firm,4103565440,,10451 Mill Run Circle,Suite 100,Owings Mills,MARYLAND,UNITED STATES OF AMERICA,21117,...,,,,,,,,,,,,,,Y,,N,Y,N,James E McLaughlin III,Attorney,perm@murthy.com,Sunitha Shivaram,CEO,318,Y,56.603774,56.603774,0.0,Y,Y,N,N,Y,N,Y,N,N,N,Y,N
2,A-10225-13679,Denied,2010-08-10,2020-04-30 07:55:08,N,NaT,,N,GLENNS PLUMBING SERVICES,854 CUSTER STREET,,N. VALLEY STREAM,NEW YORK,UNITED STATES OF AMERICA,11580,516-561-3692,,3.0,2002.0,,N,DAVID GLENN,854 CUSTER STREET,,N. VALLEY STREAM,NEW YORK,UNITED STATES OF AMERICA,11580,516-561-3692,,DANIEL J SULLIVAN,"DANIEL J. SULLIVAN, ESQ.",631-732-3516,,815 HORSEBLOCK ROAD,,FARMINGVILLE,NEW YORK,UNITED STATES OF AMERICA,11738,...,,,,,,,,,,,N,Y,N,N,N,N,N,N,DANIEL J SULLIVAN,ATTORNEY,GWG8@AOL.COM,DAVID GLENN,OWNER,3551,Y,38.0,38.047896,-0.047896,Y,Y,N,N,N,N,N,N,N,N,N,N
3,A-10334-34885,Certified,2010-12-02,2020-08-24 13:12:51,N,NaT,,N,"ASTIR IT SOLUTIONS, INC","50 CRAGWOOD ROAD, SUITE 219",,SOUTH PLAINFIELD,NEW JERSEY,UNITED STATES OF AMERICA,7080,908.279.8670,704.0,90.0,2001.0,541511.0,N,Kishore Ganji,"50 CRAGWOOD ROAD, SUITE 219",,SOUTH PLAINFIELD,NEW JERSEY New Jersey,UNITED STATES OF AMERICA,7080,908.279.8670 703,kishoreg@astirit.com,Kavitha Ramasami,Law Offices of Kavitha Ramasami,7323939557,,"450 Seventh Avenue, Suite 2600",,New York,NEW YORK,UNITED STATES OF AMERICA,10123,...,,ENGINEERING TECHNOLOGY,2004.0,PITTSBURG STATE UNIVERSITY,1701 S BROADWAY,,PITTSBURG,KS,UNITED STATES OF AMERICA,66762,,Y,,,N,N,Y,N,Kavitha Ramasami,Attorney at Law,kavitha@usimmigrationesq.com,Kishore Ganji,President and CEO,3553,Y,35.316884,35.159652,0.157233,Y,Y,N,N,Y,N,Y,N,N,N,Y,N
4,A-11033-51337,Denied,2011-01-31,2020-08-11 14:45:13,Y,2004-06-23,D-05194-21256,N,"DURANT HARVESTING, INC.",621 SOUTH SMITH STREET,"MAIL TO P.O. BOX 1370 SANTA MARIA, CA 93456",SANTA MARIA,CALIFORNIA,UNITED STATES OF AMERICA,93458,(805) 349-2820,,250.0,2001.0,115113.0,N,TOM DURANT,621 SOUTH SMITH STREET,"MAIL TO P.O. BOX 1370 SANTA MARIA, CA 93456",SANTA MARIA,CALIFORNIA,UNITED STATES OF AMERICA,93458,(805) 349-2820,,KEVIN M TRACY,LAW OFFICE OF KEVIN M. TRACY NORTH COUNTY LE...,(858) 481-0822,,"2010 JIMMY DURANTE BLVD., SUITE 126",,DEL MAR,CALIFORNIA,UNITED STATES OF AMERICA,92014,...,,,,,,,,,,,,Y,,,N,N,Y,N,KEVIN M TRACY,"ATTORNEY AT LAW, LAW OFFICE OF KEVIN M. TRACY AND",SJEANNETTE@NCLS.NET,TOM DURANT,PRESIDENT,3480,Y,13.423803,13.423803,0.0,Y,Y,N,N,N,N,N,N,N,N,N,N


In [128]:
#initialize observations_df where all EDA observations will be stored for future reference
#One row per column of data_df, one (initially empty) column per kind of observation
observations_df=pd.DataFrame(data=None,
                            index=data_df.columns,
                            columns=['Dtype','percent_missing','cardinality','preprocess_action','preprocess_comment','new_feature_name','new_feature_logic','categorical_class','embedding']
)

In [129]:
#Fill in Dtype, missing and cardinality statistics, one data_df column per iteration
for column in data_df.columns:
    observations_df = modify_observations(
        observations_df,
        column,
        ['Dtype', 'percent_missing', 'cardinality'],
        [
            data_df[column].dtype,
            missing_statistics(data_df, column),
            cardinality_statistics(data_df, column),
        ],
        'update_values',
    )

In [130]:
#mark features for dropping when their percentage of missing values is >= threshold
missing_threshold=40.0
for idx in observations_df[observations_df.percent_missing>=missing_threshold].index:
  observations_df=modify_observations(df=observations_df,
                                      index=idx,
                                      columns=['preprocess_action','preprocess_comment'],
                                      values=['Drop column','missing values>='+str(missing_threshold)+'% of total'],
                                      modify_action='update_values')

In [131]:
#mark high-cardinality features for dropping
#cardinality_statistics returns 100*(rows-distinct values)/rows, so a LOWER value means MORE distinct values;
#that is why the comparison below is '<': with threshold 80, columns whose distinct values exceed 20% of rows are flagged
cardinality_threshold=80.0
for idx in observations_df[observations_df.cardinality<cardinality_threshold].index:
  observations_df=modify_observations(df=observations_df,
                                      index=idx,
                                      columns=['preprocess_action','preprocess_comment'],
                                      values=['Drop column','High Cardinality, threshold '+str(cardinality_threshold)+'% of total'],
                                      modify_action='update_values')

In [132]:
#Separate target column - record that CASE_STATUS is the target and is to be popped out of the feature set
observations_df=modify_observations(df=observations_df,
                                    index='CASE_STATUS',
                                    columns=['preprocess_action','preprocess_comment'],
                                    values=['Pop column into a separate list','Target feature'],
                                    modify_action='update_values')

In [176]:
#Drop columns - features that are not useful for prediction based on general knowledge of PERM applications
not_useful_cols=['EMPLOYER_NAME', 'EMPLOYER_ADDRESS_1',
       'EMPLOYER_ADDRESS_2', 'EMPLOYER_CITY', 'EMPLOYER_STATE_PROVINCE',
       'EMPLOYER_COUNTRY', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'EMPLOYER_NUM_EMPLOYEES',
       'EMPLOYER_YEAR_COMMENCED_BUSINESS', 'NAICS_CODE',
       'EMP_CONTACT_NAME',
       'EMP_CONTACT_ADDRESS_1', 'EMP_CONTACT_ADDRESS_2',
       'EMP_CONTACT_CITY', 'EMP_CONTACT_STATE_PROVINCE',
       'EMP_CONTACT_COUNTRY', 'EMP_CONTACT_POSTAL_CODE',
       'EMP_CONTACT_PHONE', 'EMP_CONTACT_EMAIL', 'AGENT_ATTORNEY_NAME',
       'AGENT_ATTORNEY_FIRM_NAME', 'AGENT_ATTORNEY_PHONE',
       'AGENT_ATTORNEY_PHONE_EXT', 'AGENT_ATTORNEY_ADDRESS_1',
       'AGENT_ATTORNEY_ADDRESS_2', 'AGENT_ATTORNEY_CITY',
       'AGENT_ATTORNEY_STATE_PROVINCE', 'AGENT_ATTORNEY_COUNTRY',
       'AGENT_ATTORNEY_POSTAL_CODE', 'AGENT_ATTORNEY_EMAIL','WORKSITE_ADDRESS_1',
       'WORKSITE_ADDRESS_2', 'WORKSITE_CITY', 'WORKSITE_STATE',
       'JOB_TITLE','PW_SOC_CODE', 'PW_SOC_TITLE', 'PW_DETERMINATION_DATE','PW_EXPIRATION_DATE',
       'SWA_JOB_ORDER_START_DATE','SWA_JOB_ORDER_END_DATE',
       'FIRST_ADVERTISEMENT_START_DATE','SECOND_ADVERTISEMENT_TYPE','SECOND_AD_START_DATE','JOB_SEARCH_WEBSITE_TO_DATE','LOCAL_ETHNIC_PAPER_TO_DATE',
       'FOREIGN_WORKER_INFO_MAJOR','FOREIGN_WORKER_INST_OF_ED','FOREIGN_WORKER_ED_INST_CITY', 'FOREIGN_WORKER_ED_INST_STATE_P',
       'FOREIGN_WORKER_ED_INST_POST_CD','PREPARER_NAME', 'PREPARER_TITLE', 'PREPARER_EMAIL',
       'EMP_INFO_DECL_NAME', 'EMP_INFO_DECL_TITLE']
#Every listed row gets the same action/comment pair
observations_df=modify_observations(df=observations_df,
                                    index=not_useful_cols,
                                    columns=['preprocess_action','preprocess_comment'],
                                    values=['Drop column','Not Useful'],
                                    modify_action='update_values')

In [190]:
#drop temporary features created during feature engineering
#(the per-hour wage columns that BuildFeaturesTransformer adds before computing WAGE_ABOVE_PW_HR)
observations_df=modify_observations(df=observations_df,
                                    index=['PW_WAGE_PER_HR','WAGE_PER_HR'],
                                    columns=['preprocess_action','preprocess_comment'],
                                    values=['Drop column','Temporary feature'],
                                    modify_action='update_values')

In [134]:
#FEATURE Engineering - date columns
#Create a new feature - PROCESSING_DAYS from 'RECEIVED_DATE', 'DECISION_DATE'
#The two source columns are marked for dropping and the derivation is recorded alongside them
observations_df=modify_observations(df=observations_df,
                                    index=['RECEIVED_DATE', 'DECISION_DATE'],
                                    columns=['preprocess_action','preprocess_comment','new_feature_name','new_feature_logic'],
                                    values=['Drop column','Feature engineering','PROCESSING_DAYS','days(DECISION_DATE-RECEIVED_DATE)'],
                                    modify_action='update_values')

In [135]:
#Feature engineering - Worksite same as employer address (compared by postal code)
#The two source columns are marked for dropping and the derivation is recorded alongside them
observations_df=modify_observations(df=observations_df,
                                    index=['WORKSITE_POSTAL_CODE','EMPLOYER_POSTAL_CODE'],
                                    columns=['preprocess_action','preprocess_comment','new_feature_name','new_feature_logic'],
                                    values=['Drop column','Feature engineering','EMPLOYER_WORKSITE_YN','IF EMPLOYER_POSTAL_CODE==WORKSITE_POSTAL_CODE THEN Y ELSE N END'],
                                    modify_action='update_values')

In [136]:
#Feature engineering - convert PW_WAGE and WAGE_OFFER_FROM to hourly wage - if unit of pay=Hour ignore, if Month then WAGE/172, if Year then WAGE/2067
#Feature engineering - WAGE_ABOVE_PW_HR = WAGE_PER_HR-PW_WAGE_PER_HR
#The recorded feature names are the column names BuildFeaturesTransformer actually creates
#(PW_WAGE_PER_HR, WAGE_PER_HR, WAGE_ABOVE_PW_HR) so they can be looked up in the transformed data
observations_df=modify_observations(df=observations_df,
                                    index=['PW_WAGE','PW_UNIT_OF_PAY'],
                                    columns=['preprocess_action','preprocess_comment','new_feature_name','new_feature_logic'],
                                    values=['Drop column','Feature engineering','PW_WAGE_PER_HR;WAGE_ABOVE_PW_HR','if PW_UNIT_OF_PAY=Hour ignore, if Month then PW_WAGE/172, if Year then PW_WAGE/2067;WAGE_PER_HR-PW_WAGE_PER_HR'],
                                    modify_action='update_values')

observations_df=modify_observations(df=observations_df,
                                    index=['WAGE_OFFER_FROM','WAGE_OFFER_UNIT_OF_PAY'],
                                    columns=['preprocess_action','preprocess_comment','new_feature_name','new_feature_logic'],
                                    values=['Drop column','Feature engineering','WAGE_PER_HR;WAGE_ABOVE_PW_HR','if WAGE_OFFER_UNIT_OF_PAY=Hour ignore, if Month then WAGE_OFFER_FROM/172, if Year then WAGE_OFFER_FROM/2067;WAGE_PER_HR-PW_WAGE_PER_HR'],
                                    modify_action='update_values')

In [137]:
#FEATURE Engineering - advertisement columns - convert them to binary indicators for each type of advertisement
#Maps each source column to the binary feature that BuildFeaturesTransformer derives from it.
#'ON_CAMPUS_RECRUITING' is the name of the column the transformer creates (previously recorded as 'ON_CAMPUS').
ad_source_to_feature={
    'FIRST_NEWSPAPER_NAME':'NEWSPAPER_1',
    'SECOND_NEWSPAPER_AD_NAME':'NEWSPAPER_2',
    'JOB_FAIR_FROM_DATE':'JOB_FAIR',
    'ON_CAMPUS_RECRUITING_FROM_DATE':'ON_CAMPUS_RECRUITING',
    'EMPLOYER_WEBSITE_FROM_DATE':'EMPLOYER_WEBSITE',
    'PRO_ORG_AD_FROM_DATE':'PRO_ORG_AD',
    'JOB_SEARCH_WEBSITE_FROM_DATE':'JOB_SEARCH_WEBSITE',
    'PVT_EMPLOYMENT_FIRM_FROM_DATE':'PVT_EMPLOYMENT_FIRM',
    'EMPLOYEE_REF_PROG_FROM_DATE':'EMPLOYEE_REF_PROG',
    'CAMPUS_PLACEMENT_FROM_DATE':'CAMPUS_PLACEMENT',
    'LOCAL_ETHNIC_PAPER_FROM_DATE':'LOCAL_ETHNIC_PAPER',
    'RADIO_TV_AD_FROM_DATE':'RADIO_TV_AD',
}

#Mark every source column for dropping and record the feature derived from it and how
for ad_source_col,ad_feature_name in ad_source_to_feature.items():
  observations_df=modify_observations(df=observations_df,
                                      index=[ad_source_col],
                                      columns=['preprocess_action','preprocess_comment','new_feature_name','new_feature_logic'],
                                      values=['Drop column','Feature engineering',ad_feature_name,'If '+ad_source_col+' is not empty then Y else N'],
                                      modify_action='update_values')

array(['RECEIVED_DATE', 'DECISION_DATE', 'EMPLOYER_POSTAL_CODE',
       'PW_WAGE', 'PW_UNIT_OF_PAY', 'WAGE_OFFER_FROM',
       'WAGE_OFFER_UNIT_OF_PAY', 'WORKSITE_POSTAL_CODE',
       'FIRST_NEWSPAPER_NAME', 'SECOND_NEWSPAPER_AD_NAME',
       'JOB_FAIR_FROM_DATE', 'ON_CAMPUS_RECRUITING_FROM_DATE',
       'EMPLOYER_WEBSITE_FROM_DATE', 'PRO_ORG_AD_FROM_DATE',
       'JOB_SEARCH_WEBSITE_FROM_DATE', 'PVT_EMPLOYMENT_FIRM_FROM_DATE',
       'EMPLOYEE_REF_PROG_FROM_DATE', 'CAMPUS_PLACEMENT_FROM_DATE',
       'LOCAL_ETHNIC_PAPER_FROM_DATE', 'RADIO_TV_AD_FROM_DATE'],
      dtype=object)

In [148]:
#Update details for new features - Numeric
#NOTE(review): presumably these rows already exist in observations_df because its index was built from
#data_df.columns after the pipeline added the new columns - confirm cell execution order
observations_df=modify_observations(observations_df,
                                    index=['PROCESSING_DAYS','WAGE_ABOVE_PW_HR'],
                                    columns=['preprocess_action','preprocess_comment','categorical_class','embedding'],
                                    values=['New feature','Feature engineering','Numerical','Standard scaling'],
                                    modify_action='update_values')

In [149]:
#Update details for new features - Binary
#These are the Y/N columns added by BuildFeaturesTransformer
observations_df=modify_observations(observations_df,
                                    index=['EMPLOYER_WORKSITE_YN','NEWSPAPER_1','NEWSPAPER_2','JOB_FAIR','ON_CAMPUS_RECRUITING','EMPLOYER_WEBSITE','PRO_ORG_AD',
                                           'JOB_SEARCH_WEBSITE','PVT_EMPLOYMENT_FIRM', 'EMPLOYEE_REF_PROG', 'CAMPUS_PLACEMENT','LOCAL_ETHNIC_PAPER', 'RADIO_TV_AD'],
                                    columns=['preprocess_action','preprocess_comment','categorical_class','embedding'],
                                    values=['New feature','Feature engineering','Binary','Standardized random'],
                                    modify_action='update_values')

In [194]:
#Categorical columns - record the feature class and the chosen encoding for each
cat_cols=['PW_WAGE_SOURCE','COUNTRY_OF_CITIZENSHIP','FOREIGN_WORKER_BIRTH_COUNTRY','CLASS_OF_ADMISSION','FOREIGN_WORKER_EDUCATION','FOREIGN_WORKER_ED_INST_COUNTRY',]
observations_df=modify_observations(df=observations_df,
                                    index=cat_cols,
                                    columns=['categorical_class', 'embedding'],
                                    values=['Categorical','Standardized random'],
                                    modify_action='update_values')

In [195]:
# Features treated as ordinal; each gets the same class and embedding labels.
ord_cols = [
    'PW_SKILL_LEVEL',
    'MINIMUM_EDUCATION',
]
observations_df = modify_observations(
    observations_df,
    ord_cols,
    columns=['categorical_class', 'embedding'],
    values=['Ordinal', 'Ordered standardized random'],
    modify_action='update_values',
)

In [196]:
# Features treated as binary; each gets the same class and embedding labels.
binary_cols = [
    'REFILE',
    'SCHD_A_SHEEPHERDER',
    'FW_OWNERSHIP_INTEREST',
    'REQUIRED_TRAINING',
    'REQUIRED_EXPERIENCE',
    'ACCEPT_ALT_FIELD_OF_STUDY',
    'ACCEPT_ALT_COMBO',
    'ACCEPT_FOREIGN_EDUCATION',
    'ACCEPT_ALT_OCCUPATION',
    'JOB_OPP_REQUIREMENTS_NORMAL',
    'FOREIGN_LANGUAGE_REQUIRED',
    'COMBINATION_OCCUPATION',
    'OFFERED_TO_APPL_FOREIGN_WORKER',
    'FOREIGN_WORKER_LIVE_ON_PREM',
    'FOREIGN_WORKER_LIVE_IN_DOM_SER',
    'PROFESSIONAL_OCCUPATION',
    'APP_FOR_COLLEGE_U_TEACHER',
    'SUNDAY_EDITION_NEWSPAPER',
    'EMP_RECEIVED_PAYMENT',
    'POSTED_NOTICE_AT_WORKSITE',
    'LAYOFF_IN_PAST_SIX_MONTHS',
    'FOREIGN_WORKER_ALT_OCC_EXP',
    'FOREIGN_WORKER_EXP_WITH_EMPL',
    'FOREIGN_WORKER_EMPL_PAY_FOR_ED',
    'FOREIGN_WORKER_CURR_EMPLOYED',
    'EMPLOYER_COMPLETED_APPLICATION',
]
observations_df = modify_observations(
    observations_df,
    binary_cols,
    columns=['categorical_class', 'embedding'],
    values=['Binary', 'Standardized random'],
    modify_action='update_values',
)

In [197]:
# Features treated as numeric; each gets the same class and embedding labels.
numeric_cols = [
    'ACCEPT_ALT_OCCUPATION_MONTHS',
    'FOREIGN_WORKER_YRS_ED_COMP',
]
observations_df = modify_observations(
    observations_df,
    numeric_cols,
    columns=['categorical_class', 'embedding'],
    values=['Numerical', 'Standard scaling'],
    modify_action='update_values',
)

In [203]:
# Every feature whose preprocess_action is not one of 'Drop column',
# 'New feature' or 'Pop column into a separate list' is marked to be used
# unchanged (both the action and the comment get the same label).
use_as_is = observations_df.loc[
    ~observations_df['preprocess_action'].isin(
        ['Drop column', 'New feature', 'Pop column into a separate list']
    )
].index.values
observations_df = modify_observations(
    df=observations_df,
    index=use_as_is,
    columns=['preprocess_action', 'preprocess_comment'],
    values=['Use feature as is', 'Use feature as is'],
    modify_action='update_values',
)

In [204]:
# Display the observations table after all of the updates above.
observations_df

Unnamed: 0,Dtype,percent_missing,cardinality,preprocess_action,preprocess_comment,new_feature_name,new_feature_logic,categorical_class,embedding
CASE_NUMBER,object,0,0,Drop column,"High Cardinality, threshold 80.0% of total",,,,
CASE_STATUS,object,0,99.9965,Pop column into a separate list,Target feature,,,,
RECEIVED_DATE,datetime64[ns],0,98.6915,Drop column,Feature engineering,PROCESSING_DAYS,days(DECISION_DATE-RECEIVED_DATE),,
DECISION_DATE,datetime64[ns],0,1.70387,Drop column,Feature engineering,PROCESSING_DAYS,days(DECISION_DATE-RECEIVED_DATE),,
REFILE,object,0,99.9965,Use feature as is,Use feature as is,,,Binary,Standardized random
...,...,...,...,...,...,...,...,...,...
PVT_EMPLOYMENT_FIRM,object,0,99.9965,New feature,Feature engineering,,,Binary,Standardized random
EMPLOYEE_REF_PROG,object,0,99.9965,New feature,Feature engineering,,,Binary,Standardized random
CAMPUS_PLACEMENT,object,0,99.9965,New feature,Feature engineering,,,Binary,Standardized random
LOCAL_ETHNIC_PAPER,object,0,99.9965,New feature,Feature engineering,,,Binary,Standardized random


In [208]:
# Source columns for feature engineering: rows flagged 'Drop column' whose
# comment is 'Feature engineering'.
fe_cols = observations_df.loc[
    observations_df['preprocess_action'].eq('Drop column')
    & observations_df['preprocess_comment'].eq('Feature engineering')
].index.values

In [206]:
# Rows whose comment is 'Feature engineering', 'Target feature' or
# 'Use feature as is', excluding rows whose action is 'New feature'.
required_features = list(
    observations_df.loc[
        observations_df['preprocess_comment'].isin(
            ['Feature engineering', 'Target feature', 'Use feature as is']
        )
        & ~observations_df['preprocess_action'].isin(['New feature'])
    ].index
)

In [211]:
# Rows not flagged 'Drop column' whose class is 'Numerical'.
# Note: this rebinds `numeric_cols` to an index-label array.
numeric_cols = observations_df.loc[
    observations_df['preprocess_action'].ne('Drop column')
    & observations_df['categorical_class'].eq('Numerical')
].index.values

In [213]:
# Rows not flagged 'Drop column' whose class is 'Binary', 'Ordinal' or
# 'Categorical'.
categorical_cols = observations_df.loc[
    observations_df['preprocess_action'].ne('Drop column')
    & observations_df['categorical_class'].isin(
        ['Binary', 'Ordinal', 'Categorical']
    )
].index.values

In [215]:
# Rows flagged 'Drop column' whose comment is 'Feature engineering' or
# 'Temporary feature'.
drop_cols = observations_df.loc[
    observations_df['preprocess_action'].eq('Drop column')
    & observations_df['preprocess_comment'].isin(
        ['Feature engineering', 'Temporary feature']
    )
].index.values

In [207]:
# Persist the observations table. '$' is used as the field delimiter, so the
# file has to be read back with sep='$'.
observations_df.to_csv(
    path_or_buf='/content/drive/MyDrive/final_observations_PERM.csv',
    sep='$',
)

In [220]:
# Write each column list to its own CSV file, with no header row and no
# index column. Files are written in the order listed.
for _values, _path in (
    (required_features, '/content/drive/MyDrive/required_features_PERM.csv'),
    (drop_cols, '/content/drive/MyDrive/drop_columns_PERM.csv'),
    (fe_cols, '/content/drive/MyDrive/feature_engineering_columns_PERM.csv'),
    (categorical_cols, '/content/drive/MyDrive/categorical_columns_PERM.csv'),
    (numeric_cols, '/content/drive/MyDrive/numeric_columns_PERM.csv'),
):
    pd.DataFrame(_values).to_csv(_path, index=False, header=False)