In [2]:
# Put the parent directory first on the module search path
# (presumably so the project-local `utils` package imported later resolves — confirm).
import sys

sys.path.insert(0, '..')

In [3]:
# Runs before the pyspark imports in the next cell; findspark.init() is expected to
# locate the local Spark installation and make pyspark importable — confirm for this environment.
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.sql.types import IntegerType,BooleanType,DateType

from utils.year_month import get_year_month
from utils.avg_sal import get_avg_sal
from utils.daily_avg_sal import get_daily_avg_sal
from utils.dropping_feauture import drop_feauture
from utils.encoding import one_hot_encoding
from utils.gap_filling import gap_filling_by_mode
from utils.removing_special_char import remove_special_chars

# Module-level Spark session; file_read() relies on this global name.
spark = (
    SparkSession.builder
    .appName("pyspark-ETL_Final")
    .getOrCreate()
)

def file_read(filename):
    """Load a CSV file through the module-level `spark` session.

    The file is read with a header row, schema inference enabled, and
    the double-quote character configured as the CSV escape character.

    filename: File name with path to extract data
    """
    reader = spark.read.options(inferSchema='True')
    reader = reader.option('escape', '"')
    return reader.csv(filename, header=True)

def write_data(df_spark, filename):
    """Write a Spark dataframe out as CSV, overwriting any existing output.

    df_spark : Input spark dataframe
    filename : Target file path
    """
    # Reduce to one partition first, then configure the writer step by step.
    single_partition = df_spark.coalesce(1)
    writer = single_partition.write.mode('overwrite')
    csv_options = (
        ('header', True),
        ("encoding", "UTF-8"),
        ("escape", "\""),
    )
    for key, value in csv_options:
        writer = writer.option(key, value)
    writer.csv(filename)
    
    
def main(input_path="/dataset/nyc-jobs.csv", output_path="/dataset/output.csv"):
    """Run the ETL pipeline: read the jobs CSV, transform it, and write the result.

    The transformation helpers come from the project's `utils` package; the
    step descriptions below reflect their names and the original author's
    notes, not their implementations (which are not visible here).

    input_path  : CSV file to read. Defaults to the previously hard-coded path.
    output_path : Target path for the CSV output. Defaults to the previously
                  hard-coded path.
    """
    # Extract the data from the provided file (file_read uses the module-level `spark` session).
    df_spark = file_read(input_path)

    # Categorical imputation for filling missing 'Full-Time/Part-Time indicator' and 'Posting Date'
    df_spark = gap_filling_by_mode(df_spark, ['Salary Frequency'], 'Full-Time/Part-Time indicator')
    df_spark = gap_filling_by_mode(df_spark, ['Posting Type', 'Agency'], 'Posting Date')

    # Create feature salary_avg for quantifying salary across multiple job titles
    df_spark = get_avg_sal(df_spark, "Salary Range From", "Salary Range To")

    # Create feature daily_salary_avg for normalizing salary across multiple job titles,
    # irrespective of Salary Frequency
    df_spark = get_daily_avg_sal(df_spark, "salary_avg", "Salary Frequency")

    # Year/month extraction is currently disabled (kept from the original for reference):
    # df_spark=get_year_month(df_spark,"Posting Date", "Job_Post_Year","Job_Post_Month")
    # df_spark=get_year_month(df_spark,"Process Date", "data_processed_year","data_processed_month")

    # Clean data by removing special characters in the "Minimum Qual Requirements" column
    df_spark = remove_special_chars(df_spark, "Minimum Qual Requirements")

    # One-hot encode Posting Type and Salary Frequency to convert categorical features
    # to numerical features.
    # Only these two categorical columns are encoded because they did not have any null values.
    # 'Full-Time/Part-Time indicator' was edited above to remove NULL values, so it is not encoded.
    df_spark = one_hot_encoding(df_spark, 'Posting Type')
    df_spark = one_hot_encoding(df_spark, 'Salary Frequency')

    # Feature selection: remove the features with redundant data
    # ('Posting Type', 'Salary Frequency', 'salary_avg').
    # 'Recruitment Contact' is dropped because all of its values are null.
    df_spark = drop_feauture(df_spark, ['Posting Type', 'Salary Frequency', 'salary_avg', 'Recruitment Contact'])

    # Write the transformed data to CSV
    write_data(df_spark, output_path)

#if __name__ == "__main__":
    #main()

In [6]:
# Execute the ETL pipeline defined in main() (reads the input CSV and writes the output CSV).
main()