<a href="https://colab.research.google.com/github/sharsulkar/H1B_LCA_outcome_prediction/blob/main/prototyping/notebooks/PERM/01_sh_PERM_standardize_source_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The feature names in the FY20 file do not match those in the FY19 and FY18 files. To obtain additional training data for PERM outcome prediction, we first need to standardize the source data files so that the feature names match, the features are in the same order, and any missing features are accounted for.  
This notebook implements these changes.  
This is a one-time step: the standardized datasets are saved to separate files, which will be loaded later for training.  

In [None]:
import numpy as np
import pandas as pd


In [None]:
def read_csv_to_list(filepath,header=None,squeeze=True):
  """Read a CSV file with pandas and return its contents as a list.

  Args:
    filepath: path, URL or file-like object accepted by ``pd.read_csv``.
    header: forwarded to ``pd.read_csv``; ``None`` (default) means the file
      has no header row, so the first line is treated as data.
    squeeze: when True (default) a single-column file is reduced to a Series,
      so the returned list holds that column's values.

  Returns:
    list: the column's values when the data was squeezed to a Series;
    otherwise the column labels of the DataFrame (what ``list(DataFrame)`` yields).
  """
  #honor the caller's `header` instead of hard-coding it
  data=pd.read_csv(filepath,header=header)
  if squeeze:
    #the `squeeze=` keyword of read_csv was removed in pandas 2.0;
    #DataFrame.squeeze('columns') is the documented replacement
    data=data.squeeze('columns')
  return list(data)

In [None]:
#Reference list of feature names (FY20 naming) that the older yearly files are renamed to
required_features=read_csv_to_list(
  'https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/processed/required_features_PERM.csv',
  header=None,
  squeeze=True,
)

In [None]:
#One time processing for FY19 source data -
#the feature names in the FY19 file differ from FY20, so they are renamed to match FY20.
#A few FY20 features are also absent from FY19; they are added as blank columns.

#FY20 features that are added to the FY19 data as blank columns, in the order they are inserted
missing_features=['FOREIGN_WORKER_ED_INST_COUNTRY','FOREIGN_WORKER_ALT_OCC_EXP','FOREIGN_WORKER_EXP_WITH_EMPL','FOREIGN_WORKER_EMPL_PAY_FOR_ED','FOREIGN_WORKER_CURR_EMPLOYED']

required_features_fy19=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/interim/required_features_PERM_FY19.csv',header=None,squeeze=True)
fy19_df=pd.read_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2019.xlsx',usecols=required_features_fy19)
#untouched copy of the raw FY19 data, kept for inspection
fy19_df_copy=fy19_df.copy()

#pop PREPARER_INFO_EMP_COMPLETED so the blank columns are appended before it
#NOTE(review): assumes this is the last feature in both the FY19 and FY20 lists - confirm
preparer=fy19_df.pop('PREPARER_INFO_EMP_COMPLETED')
#Add in missing features as blank (np.nan: the np.NaN alias was removed in NumPy 2.0)
for missing_col in missing_features:
  fy19_df[missing_col]=np.nan
#add the popped column back in the dataset
fy19_df['PREPARER_INFO_EMP_COMPLETED']=preparer

#fix WAGE_OFFERED_FROM_9089 and PW_AMOUNT_9089:
#'#############' placeholders become NaN, thousands separators are stripped, then the column is cast to float
for wage_col in ['WAGE_OFFERED_FROM_9089','PW_AMOUNT_9089']:
  fy19_df.loc[fy19_df[wage_col]=='#############',wage_col]=np.nan
  fy19_df[wage_col]=fy19_df[wage_col].astype(str).str.replace(',','',regex=False).astype(float)

#Add missing column names to required features list, just before its last entry
required_features_fy19[-1:-1]=missing_features

#replace the column names to match FY20: the i-th FY19 name is mapped to the i-th FY20 name
#zip() would silently drop unmatched names, leaving columns unrenamed, so fail loudly instead
if len(required_features_fy19)!=len(required_features):
  raise ValueError('FY19 feature list has %d names but the FY20 feature list has %d; cannot map them one-to-one'%(len(required_features_fy19),len(required_features)))
absent_cols=[col for col in required_features_fy19 if col not in fy19_df.columns]
if absent_cols:
  raise KeyError('columns missing from FY19 data: %s'%absent_cols)
fy19_df.rename(columns=dict(zip(required_features_fy19,required_features)),inplace=True)

#check if it worked
#fy19_df[required_features].head()

#save the processed file for future use
fy19_df.to_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2019_processed.xlsx',index=False)

In [None]:
#One time processing for FY18 source data by piggybacking on the steps taken for FY19 above-
#the feature names in the FY18 file differ from FY20, so they are renamed to match FY20.
#A few FY20 features are also absent from FY18; they are added as blank columns.

#FY20 features that are added to the FY18 data as blank columns, in the order they are inserted
missing_features=['FOREIGN_WORKER_ED_INST_COUNTRY','FOREIGN_WORKER_ALT_OCC_EXP','FOREIGN_WORKER_EXP_WITH_EMPL','FOREIGN_WORKER_EMPL_PAY_FOR_ED','FOREIGN_WORKER_CURR_EMPLOYED']

required_features_fy19=read_csv_to_list('https://raw.githubusercontent.com/sharsulkar/H1B_LCA_outcome_prediction/main/data/interim/required_features_PERM_FY19.csv',header=None,squeeze=True)
#Fix the difference in feature names between FY18 and FY19 files
#NOTE(review): presumably index 11 holds WAGE_OFFERED_FROM_9089 in the FY19 list - confirm against the CSV
required_features_fy19[11]='WAGE_OFFER_FROM_9089'
fy18_df=pd.read_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2018.xlsx',usecols=required_features_fy19)
#untouched copy of the raw FY18 data, kept for inspection
fy18_df_copy=fy18_df.copy()

#pop PREPARER_INFO_EMP_COMPLETED so the blank columns are appended before it
#NOTE(review): assumes this is the last feature in both the FY18/FY19 and FY20 lists - confirm
preparer=fy18_df.pop('PREPARER_INFO_EMP_COMPLETED')
#Add in missing features as blank (np.nan: the np.NaN alias was removed in NumPy 2.0)
for missing_col in missing_features:
  fy18_df[missing_col]=np.nan
#add the popped column back in the dataset
fy18_df['PREPARER_INFO_EMP_COMPLETED']=preparer

#JOB_INFO_ALT_OCC value should be in [Y,N]: every populated value becomes 'Y', every blank becomes 'N'
#NOTE(review): an existing 'N' value would also be turned into 'Y' - confirm FY18 never stores Y/N here
alt_occ_blank=fy18_df.JOB_INFO_ALT_OCC.isna()
fy18_df.loc[~alt_occ_blank,'JOB_INFO_ALT_OCC']='Y'
fy18_df.loc[alt_occ_blank,'JOB_INFO_ALT_OCC']='N'

#fix WAGE_OFFER_FROM_9089 and PW_AMOUNT_9089:
#'#############' placeholders become NaN, thousands separators are stripped, then the column is cast to float
for wage_col in ['WAGE_OFFER_FROM_9089','PW_AMOUNT_9089']:
  fy18_df.loc[fy18_df[wage_col]=='#############',wage_col]=np.nan
  fy18_df[wage_col]=fy18_df[wage_col].astype(str).str.replace(',','',regex=False).astype(float)

#Add missing column names to required features list, just before its last entry
required_features_fy19[-1:-1]=missing_features

#replace the column names to match FY20: the i-th FY18 name is mapped to the i-th FY20 name
#zip() would silently drop unmatched names, leaving columns unrenamed, so fail loudly instead
if len(required_features_fy19)!=len(required_features):
  raise ValueError('FY18 feature list has %d names but the FY20 feature list has %d; cannot map them one-to-one'%(len(required_features_fy19),len(required_features)))
absent_cols=[col for col in required_features_fy19 if col not in fy18_df.columns]
if absent_cols:
  raise KeyError('columns missing from FY18 data: %s'%absent_cols)
fy18_df.rename(columns=dict(zip(required_features_fy19,required_features)),inplace=True)

#check if it worked
#fy18_df[required_features].head()

#save the processed file for future use - there is some dataerror saving to excel so saving to csv
fy18_df.to_csv('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2018_processed.csv',index=False)

In [None]:
#check if the processed files can be read back (uncomment one of the lines below to verify)
#temp_df=pd.read_excel('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2019_processed.xlsx',usecols=required_features)
#temp_df=pd.read_csv('/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2018_processed.csv')

In [None]:
#Combine every yearly source into one dataframe and persist it

#FY2021 Q1 and FY2020 raw files: read only the standard feature columns
fy21_df=pd.read_excel(
  '/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2021_Q1.xlsx',
  usecols=required_features,
)
fy20_df=pd.read_excel(
  '/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2020.xlsx',
  usecols=required_features,
)

#FY2019 and FY2018: read the processed files saved by the cells above, with all their columns
fy19_df=pd.read_excel(
  '/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2019_processed.xlsx'
)
fy18_df=pd.read_csv(
  '/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY2018_processed.csv'
)

#stack the yearly frames, newest first, with a fresh 0..n-1 index
frames=[fy21_df,fy20_df,fy19_df,fy18_df]
data_df=pd.concat(frames,ignore_index=True)
data_dfcopy=data_df.copy()

#one time operation - write the combined dataset to file for future use
#NOTE(review): the filename says FY18_to_20 but the FY2021 Q1 data is included as well
data_df.to_csv(
  '/content/drive/MyDrive/Datasets/PERM_prediction/PERM_Disclosure_Data_FY18_to_20.csv',
  index=False,
)