# Human Resources Dataset

The original file can be found here <https://www.kaggle.com/rhuebner/human-resources-data-set>

A description of the dataset can be found here: <https://rpubs.com/rhuebner/hrd_cb_v14>

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import expr, unix_timestamp, col
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df


In [None]:
# Location of the HR dataset CSV, passed through the project's path helper
# (presumably returns a form Spark can read -- confirm in helpers.path_translation).
input_file = translate_to_file_string("../../data/HRDataset_v14.csv")

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("HumanResources").getOrCreate()
# Switch date/time parsing to the legacy policy; presumably required so the
# 'M/dd/yyyy' patterns used in the preprocessing step accept this dataset's
# date strings -- TODO confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

In [None]:
# Load the data file into a DataFrame.
# The first row supplies the column names and the schema is inferred from the data.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=",")
    .csv(input_file)
)
df.printSchema()

## Data Preprocessing

In [None]:
# Replace null values with explicit defaults. Rows without a termination date
# receive the placeholder '1/01/2022', so their DaysWorked (computed below)
# is measured up to that date.
df = df.fillna({
    'TermReason': 'Unknown',
    'ManagerID': 0,
    'DaysLateLast30': 0,
    'DateofTermination': '1/01/2022',
    'LastPerformanceReview_Date': '1/01/2022',
})

# Convert the date strings to Unix timestamps (seconds), then add a column
# with the number of days worked in the company (timestamp difference in days).
df = (
    df.withColumn('DateofHire', unix_timestamp(col('DateofHire'), 'M/dd/yyyy'))
    .withColumn('DateofTermination', unix_timestamp(col('DateofTermination'), 'M/dd/yyyy'))
    .withColumn("DaysWorked", (col("DateofTermination") - col("DateofHire")) / (24 * 60 * 60))
)


## Indexing string columns 

In [None]:
# Fit one StringIndexer per categorical string column; each one writes its
# numeric index into a new "<column>-num" column when applied.
stateIdIndexer = StringIndexer(inputCol="State", outputCol="State-num").fit(df)
sexIndexer = StringIndexer(inputCol="Sex", outputCol="Sex-num").fit(df)
recruitSourceIndexer = StringIndexer(
    inputCol="RecruitmentSource",
    outputCol="RecruitmentSource-num",
).fit(df)

## Removing Feature Columns

Only a subset of the columns is used as features, to simplify the process.

In [None]:
# Candidate label columns are Termd, EmpStatusID and Salary; they are
# deliberately left out of the feature list below.

# Columns that make up the feature vector: numeric source columns, the
# indexed ("-num") string columns, and the derived DaysWorked column.
featureCols = [
    'MarriedID',
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'FromDiversityJobFairID',
    'SpecialProjectsCount',
    'State-num',
    'Sex-num',
    'RecruitmentSource-num',
    'EngagementSurvey',
    'EmpSatisfaction',
    'Absences',
    'DaysWorked',
]
print(featureCols)

In [None]:
# Combine the feature columns into a single "features" vector column.
# handleInvalid='skip' is passed so rows with invalid feature values are
# skipped rather than raising an error.
assembler = VectorAssembler(
    inputCols=[*featureCols],
    outputCol="features",
    handleInvalid='skip',
)

In [None]:
# Apply the fitted indexers one after another, then assemble the feature vector.
labeled_point_ds = stateIdIndexer.transform(df)
labeled_point_ds = sexIndexer.transform(labeled_point_ds)
labeled_point_ds = recruitSourceIndexer.transform(labeled_point_ds)
labeled_point_ds = assembler.transform(labeled_point_ds)
print_df(labeled_point_ds, 10)

In [None]:
# Split the data 60/40 into training and test sets; the fixed seed keeps the
# split reproducible between runs.
splits = labeled_point_ds.randomSplit([0.6, 0.4], seed=5756)
train, test = splits

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()