## Installs

In [1]:
!pip install pyspark==3.1.2 -q
!pip install findspark -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-spark-connect 0.5.2 requires pyspark>=3.5, but you have pyspark 3.1.2 which is incompatible.[0m[31m
[0m

## Imports

In [92]:
import kagglehub
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, sum, when, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, to_date, datediff, current_date, expr
from pyspark.sql.functions import explode
from datetime import datetime
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import warnings

## Utilities

In [3]:
def warn(*args, **kwargs):
    pass
findspark.init()

warnings.warn = warn
warnings.filterwarnings('ignore')

## Session Initialization

In [4]:
spark = SparkSession.builder.appName("PySpark_App").getOrCreate()

## Download the data file and Load the dataset


In [5]:
path = kagglehub.dataset_download("rhuebner/human-resources-data-set")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rhuebner/human-resources-data-set?dataset_version_number=4...


100%|██████████| 16.6k/16.6k [00:00<00:00, 16.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/rhuebner/human-resources-data-set/versions/4





In [82]:
hr_data = spark.read.csv(path, header=True, inferSchema=True) # Load CSV

#df = spark.read.parquet("path_to_file.parquet") # Load parquet
#df = spark.read.json("path_to_file.json") # Load JSON

## Data Exploration

In [75]:
hr_data.printSchema()
# hr_data.dtypes # To display column data types

root
 |-- Employee_Name: string (nullable = true)
 |-- EmpID: integer (nullable = true)
 |-- MarriedID: integer (nullable = true)
 |-- MaritalStatusID: integer (nullable = true)
 |-- GenderID: integer (nullable = true)
 |-- EmpStatusID: integer (nullable = true)
 |-- DeptID: integer (nullable = true)
 |-- PerfScoreID: integer (nullable = true)
 |-- FromDiversityJobFairID: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Termd: integer (nullable = true)
 |-- PositionID: integer (nullable = true)
 |-- Position: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- MaritalDesc: string (nullable = true)
 |-- CitizenDesc: string (nullable = true)
 |-- HispanicLatino: string (nullable = true)
 |-- RaceDesc: string (nullable = true)
 |-- DateofHire: string (nullable = true)
 |-- DateofTermination: string (nullable = true)
 |-- TermReason: string (nullable

In [58]:
hr_data.show(3, truncate=False)

+------------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+------------------------+-----+----+--------+---+-----------+-----------+--------------+--------+----------+-----------------+-----------------+----------------------+-----------------+--------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|Employee_Name           |EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|Position                |State|Zip |DOB     |Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|TermReason       |EmploymentStatus      |Department       |ManagerName   |ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|
+-------

In [59]:
print(f"Rows: {hr_data.count()}, Columns: {len(hr_data.columns)})")

Rows: 311, Columns: 36)


In [60]:
# Check for duplicate rows
total_rows = hr_data.count()
distinct_rows = hr_data.distinct().count()
duplicates_count = total_rows - distinct_rows

print(f"Number of duplicate rows: {duplicates_count}")

Number of duplicate rows: 0


In [76]:
hr_data.dropDuplicates() # Drop duplicate rows, if any

DataFrame[Employee_Name: string, EmpID: int, MarriedID: int, MaritalStatusID: int, GenderID: int, EmpStatusID: int, DeptID: int, PerfScoreID: int, FromDiversityJobFairID: int, Salary: int, Termd: int, PositionID: int, Position: string, State: string, Zip: int, DOB: string, Sex: string, MaritalDesc: string, CitizenDesc: string, HispanicLatino: string, RaceDesc: string, DateofHire: string, DateofTermination: string, TermReason: string, EmploymentStatus: string, Department: string, ManagerName: string, ManagerID: int, RecruitmentSource: string, PerformanceScore: string, EngagementSurvey: double, EmpSatisfaction: int, SpecialProjectsCount: int, LastPerformanceReview_Date: string, DaysLateLast30: int, Absences: int]

In [83]:
# Check for null values
null_counts = hr_data.select([sum(col(c).isNull().cast("int")).alias(c) for c in hr_data.columns])
null_counts.show()

+-------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------+-----+---+---+---+-----------+-----------+--------------+--------+----------+-----------------+----------+----------------+----------+-----------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|Position|State|Zip|DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|TermReason|EmploymentStatus|Department|ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|
+-------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+---

In [84]:
# Drop rows with null values
hr_data_cleaned = hr_data.dropna()

# Impute missing values (for a specific column)
# hr_data_cleaned.fillna({"DateofTermination": 0})

In [85]:
null_counts = hr_data_cleaned.select([sum(col(c).isNull().cast("int")).alias(c) for c in hr_data.columns])
null_counts.show()

+-------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------+-----+---+---+---+-----------+-----------+--------------+--------+----------+-----------------+----------+----------------+----------+-----------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|Position|State|Zip|DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|TermReason|EmploymentStatus|Department|ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|
+-------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+---

## Transformation

In [88]:
# Add a new column with a constant value using lit (literal) function
hr_data_cleaned = hr_data_cleaned.withColumn("PermanentEmployee", lit("yes"))
hr_data_cleaned.show(3)

+--------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+----+--------+---+-----------+-----------+--------------+--------+----------+-----------------+----------------+--------------------+-----------------+--------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+-----------------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State| Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|      TermReason|    EmploymentStatus|       Department|   ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|PermanentEmpl

In [90]:
# Drop columns
hr_data_cleaned = hr_data_cleaned.drop("EmpStatusID","FromDiversityJobFairID")
hr_data_cleaned.show(3)

+--------------------+-----+---------+---------------+--------+------+-----------+------+-----+----------+--------------------+-----+----+--------+---+-----------+-----------+--------------+--------+----------+-----------------+----------------+--------------------+-----------------+--------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+-----------------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|DeptID|PerfScoreID|Salary|Termd|PositionID|            Position|State| Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|      TermReason|    EmploymentStatus|       Department|   ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|PermanentEmployee|
+--------------------+-----+---------+---------------+--------+-

In [91]:
# Rename a column
hr_data_cleaned = hr_data_cleaned.withColumnRenamed("EmpID", "EmpNo")
hr_data_cleaned.show(3)

+--------------------+-----+---------+---------------+--------+------+-----------+------+-----+----------+--------------------+-----+----+--------+---+-----------+-----------+--------------+--------+----------+-----------------+----------------+--------------------+-----------------+--------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+-----------------+
|       Employee_Name|EmpNo|MarriedID|MaritalStatusID|GenderID|DeptID|PerfScoreID|Salary|Termd|PositionID|            Position|State| Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|      TermReason|    EmploymentStatus|       Department|   ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|PermanentEmployee|
+--------------------+-----+---------+---------------+--------+-

## Exploratory Data Analysis

In [51]:
# Select specific columns for analysis
cols = ["MaritalStatusID","GenderID", "EmpStatusID", "DeptID", "PerfScoreID", "FromDiversityJobFairID", "Termd", "PositionID", "ManagerID"]

hr_num = hr_data_cleaned.select(cols)

In [17]:
# Show summary statistics
hr_num.select(cols).describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+----------------------+-----+------------------+------------------+
|summary|   MaritalStatusID|          GenderID|       EmpStatusID|            DeptID|       PerfScoreID|FromDiversityJobFairID|Termd|        PositionID|         ManagerID|
+-------+------------------+------------------+------------------+------------------+------------------+----------------------+-----+------------------+------------------+
|  count|               104|               104|               104|               104|               104|                   104|  104|               104|               104|
|   mean|0.9423076923076923|0.4230769230769231| 4.788461538461538| 4.711538461538462|2.9038461538461537|   0.15384615384615385|  1.0|18.134615384615383|16.903846153846153|
| stddev|0.9433149879770784|0.4964399022537007|0.6335763327177987|0.9208805086045133|0.5663583340159615|    0.3625484438800155|  0.0| 4.9757

In [18]:
# Show unique values in a specific column
unique_PerformanceScore = hr_data_cleaned.select(["PerformanceScore", "GenderID"]).distinct()
unique_PerformanceScore.show()

+-----------------+--------+
| PerformanceScore|GenderID|
+-----------------+--------+
|              PIP|       1|
|      Fully Meets|       0|
|      Fully Meets|       1|
|          Exceeds|       1|
|          Exceeds|       0|
|              PIP|       0|
|Needs Improvement|       1|
|Needs Improvement|       0|
+-----------------+--------+



In [19]:
unique_Last = hr_data_cleaned.select(["DaysLateLast30"]).distinct().show()

+--------------+
|DaysLateLast30|
+--------------+
|             1|
|             6|
|             3|
|             5|
|             4|
|             0|
+--------------+



In [20]:
# Filer data by genderID and show count
gen_0 = hr_data_cleaned.filter(hr_data_cleaned.GenderID == 0)
gen_1 = hr_data_cleaned.filter(hr_data_cleaned.GenderID == 1)

print(f"Count of genderID=0: {gen_0.count()},: Count of genderID=1: {gen_1.count()}")

Count of genderID=0: 60,: Count of genderID=1: 44


In [21]:
# Filter data by colummn
top_performer_data = hr_data_cleaned.filter(hr_data_cleaned.PerformanceScore == "Exceeds")
absence_data = hr_data_cleaned.filter(hr_data_cleaned.Absences > 19)

In [22]:
top_performer_data.show(3)

+----------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+----+--------+---+-----------+-----------+--------------+--------------------+----------+-----------------+----------------+--------------------+--------------------+---------------+---------+------------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|   Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State| Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|            RaceDesc|DateofHire|DateofTermination|      TermReason|    EmploymentStatus|          Department|    ManagerName|ManagerID| RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|
+---

In [23]:
absence_data.show(3)

+--------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+-----+--------+---+-----------+-----------+--------------+-----------------+----------+-----------------+--------------------+--------------------+-----------------+-------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State|  Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|         RaceDesc|DateofHire|DateofTermination|          TermReason|    EmploymentStatus|       Department|  ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|
+---

## Data Manipulation

In [24]:
# Increment of 10% in salaries of those with Exceees performance
top_performer_data = hr_data_cleaned.withColumn(
    "Salary", when(col("PerformanceScore") == "Exceeds", col("Salary") * 1.1).otherwise(col("Salary")))

In [25]:
# Deduction of 5% from salaries for those with absences more than 19
absence_data = hr_data_cleaned.withColumn(
    "Salary", when(col("Absences") > 19, col("Salary") * 1.1).otherwise(col("Salary")))

In [26]:
hr_data_cleaned = hr_data_cleaned.withColumn("DateofHire", to_date(col("DateofHire"), "M/d/yyyy"))

hr_data_cleaned = hr_data_cleaned.withColumn("DateofTermination", to_date(col("DateofTermination"), "M/d/yyyy"))

hr_data_cleaned = hr_data_cleaned.withColumn("Tenure", (datediff(col("DateofTermination"), col("DateofHire")) / 365).cast("int"))

hr_data_cleaned.select("DateofHire", "DateofTermination", "Tenure").show(3, False)

+----------+-----------------+------+
|DateofHire|DateofTermination|Tenure|
+----------+-----------------+------+
|2015-03-30|2016-06-16       |1     |
|2011-07-05|2012-09-24       |1     |
|2011-07-11|2016-09-06       |5     |
+----------+-----------------+------+
only showing top 3 rows



## Save the file

In [93]:
hr_data_cleaned.write.mode("overwrite").csv("new_hr_data.csv", header=True)
# #If you do not wish to over write use df.write.csv("new_hr_data.csv", header=True)

## Aggregation & Grouping

In [27]:
# Show average salary per department
hr_data_cleaned.groupBy("DeptID").agg({"Salary": "avg"}).show()

+------+------------------+
|DeptID|       avg(Salary)|
+------+------------------+
|     1|           62761.0|
|     6|           68917.8|
|     3|          107274.9|
|     5| 59246.55421686747|
|     4|102894.33333333333|
+------+------------------+



In [28]:
# Show min and max salaries per department
hr_data_cleaned.groupBy("DeptID").agg(
    F.min("Salary").alias("min(Salary)"),
    F.max("Salary").alias("max(Salary)")
).show()

+------+-----------+-----------+
|DeptID|min(Salary)|max(Salary)|
+------+-----------+-----------+
|     1|      49920|      83363|
|     6|      59370|      74326|
|     3|      75281|     148999|
|     5|      45115|      83082|
|     4|      99280|     108987|
+------+-----------+-----------+



In [29]:
# Show average salary and average Employee Satisfaction per department
hr_data_cleaned.groupBy("DeptID").agg({"Salary": "avg", "EmpSatisfaction": "avg"}).show()

+------+------------------+--------------------+
|DeptID|       avg(Salary)|avg(EmpSatisfaction)|
+------+------------------+--------------------+
|     1|           62761.0|  3.6666666666666665|
|     6|           68917.8|                 4.8|
|     3|          107274.9|                 3.6|
|     5| 59246.55421686747|   3.855421686746988|
|     4|102894.33333333333|   4.333333333333333|
+------+------------------+--------------------+



In [30]:
# Show count values of Employee Satisfaction
hr_data_cleaned.groupBy("EmpSatisfaction").count().show()

+---------------+-----+
|EmpSatisfaction|count|
+---------------+-----+
|              3|   33|
|              5|   29|
|              4|   38|
|              2|    4|
+---------------+-----+



## Sorting

In [31]:
hr_data_cleaned.orderBy("Tenure").show(3)

+--------------------+-----+---------+---------------+--------+-----------+------+-----------+----------------------+------+-----+----------+--------------------+-----+----+--------+---+-----------+-----------+--------------+--------+----------+-----------------+--------------------+--------------------+-----------------+--------------+---------+-----------------+----------------+----------------+---------------+--------------------+--------------------------+--------------+--------+------+
|       Employee_Name|EmpID|MarriedID|MaritalStatusID|GenderID|EmpStatusID|DeptID|PerfScoreID|FromDiversityJobFairID|Salary|Termd|PositionID|            Position|State| Zip|     DOB|Sex|MaritalDesc|CitizenDesc|HispanicLatino|RaceDesc|DateofHire|DateofTermination|          TermReason|    EmploymentStatus|       Department|   ManagerName|ManagerID|RecruitmentSource|PerformanceScore|EngagementSurvey|EmpSatisfaction|SpecialProjectsCount|LastPerformanceReview_Date|DaysLateLast30|Absences|Tenure|
+-------

## DataFrame Caching

In [32]:
hr_data_cleaned.cache()  # Cache DataFrame in memory

# When you cache a DataFrame, Spark keeps the data in memory,
# which can help speed up subsequent actions and transformations that need to access the data,
# as it avoids re-reading or recomputing the data from the source (e.g., from a file, database, etc.).

DataFrame[Employee_Name: string, EmpID: int, MarriedID: int, MaritalStatusID: int, GenderID: int, EmpStatusID: int, DeptID: int, PerfScoreID: int, FromDiversityJobFairID: int, Salary: int, Termd: int, PositionID: int, Position: string, State: string, Zip: int, DOB: string, Sex: string, MaritalDesc: string, CitizenDesc: string, HispanicLatino: string, RaceDesc: string, DateofHire: date, DateofTermination: date, TermReason: string, EmploymentStatus: string, Department: string, ManagerName: string, ManagerID: int, RecruitmentSource: string, PerformanceScore: string, EngagementSurvey: double, EmpSatisfaction: int, SpecialProjectsCount: int, LastPerformanceReview_Date: string, DaysLateLast30: int, Absences: int, Tenure: int]

## Joins

In [33]:
#df1.join(df2, df1["key"] == df2["key"], "inner").show()
#df1.join(df2, on="key", how="left").show()

## Partitioning and Coalescing:

In [34]:
hr_data_cleaned.repartition(5) # Divides the data into 5 subsets and runs in paralell in multiple nodes, if available

DataFrame[Employee_Name: string, EmpID: int, MarriedID: int, MaritalStatusID: int, GenderID: int, EmpStatusID: int, DeptID: int, PerfScoreID: int, FromDiversityJobFairID: int, Salary: int, Termd: int, PositionID: int, Position: string, State: string, Zip: int, DOB: string, Sex: string, MaritalDesc: string, CitizenDesc: string, HispanicLatino: string, RaceDesc: string, DateofHire: date, DateofTermination: date, TermReason: string, EmploymentStatus: string, Department: string, ManagerName: string, ManagerID: int, RecruitmentSource: string, PerformanceScore: string, EngagementSurvey: double, EmpSatisfaction: int, SpecialProjectsCount: int, LastPerformanceReview_Date: string, DaysLateLast30: int, Absences: int, Tenure: int]

In [35]:
hr_data_cleaned.coalesce(1) #Repartitioning

DataFrame[Employee_Name: string, EmpID: int, MarriedID: int, MaritalStatusID: int, GenderID: int, EmpStatusID: int, DeptID: int, PerfScoreID: int, FromDiversityJobFairID: int, Salary: int, Termd: int, PositionID: int, Position: string, State: string, Zip: int, DOB: string, Sex: string, MaritalDesc: string, CitizenDesc: string, HispanicLatino: string, RaceDesc: string, DateofHire: date, DateofTermination: date, TermReason: string, EmploymentStatus: string, Department: string, ManagerName: string, ManagerID: int, RecruitmentSource: string, PerformanceScore: string, EngagementSurvey: double, EmpSatisfaction: int, SpecialProjectsCount: int, LastPerformanceReview_Date: string, DaysLateLast30: int, Absences: int, Tenure: int]

## Exploding an array into rows

ID	Skills

1	["Python", "Java"]

2	["SQL", "Spark", "R"]

3	["Excel", "Power BI"]

In [36]:
# Exploding an array into rows
#hr_data_cleaned.withColumn("GenderID", explode(col("Language"))).show()

## Group Target Variables

In [37]:
assembler = VectorAssembler(inputCols=["MaritalStatusID","GenderID", "EmpStatusID", "DeptID", "PerfScoreID", "FromDiversityJobFairID", "Termd",
                                       "PositionID", "ManagerID"], outputCol="features")
hr_transformed_data = assembler.transform(hr_data_cleaned)

In [38]:
hr_transformed_data.select("features","Salary").show()

+--------------------+------+
|            features|Salary|
+--------------------+------+
|[1.0,1.0,5.0,3.0,...|104437|
|[1.0,0.0,5.0,5.0,...| 64955|
|[2.0,0.0,5.0,5.0,...| 50825|
|[1.0,0.0,5.0,5.0,...| 54670|
|[1.0,1.0,5.0,5.0,...| 47211|
|[2.0,1.0,5.0,5.0,...| 52505|
|[0.0,1.0,4.0,5.0,...| 57834|
|[0.0,0.0,4.0,3.0,...|110000|
|[0.0,0.0,5.0,5.0,...| 57815|
|[0.0,1.0,4.0,3.0,...|103613|
|[0.0,1.0,5.0,5.0,...| 74312|
|[1.0,0.0,5.0,5.0,...| 53492|
|[1.0,0.0,5.0,6.0,...| 74326|
|[0.0,0.0,5.0,5.0,...| 64786|
|[1.0,0.0,5.0,5.0,...| 64066|
|[1.0,1.0,5.0,5.0,...| 59369|
|[0.0,1.0,4.0,5.0,...| 59144|
|[1.0,1.0,5.0,5.0,...| 55722|
|[3.0,0.0,5.0,5.0,...| 58275|
|[1.0,0.0,5.0,5.0,...| 60070|
+--------------------+------+
only showing top 20 rows



## Data Split

In [39]:
(training_data, testing_data) = hr_transformed_data.randomSplit([0.8, 0.2], seed=42)

## Model Building

In [40]:
lr = LinearRegression(featuresCol="features", labelCol="Salary")
model = lr.fit(training_data)

## Model Evaluation

In [41]:
predictions = model.transform(testing_data)

In [42]:
evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("R Squared =", r2)

R Squared = 0.5789676362279946


In [43]:
evaluator = RegressionEvaluator(labelCol="Salary", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)

RMSE = 11404.800500086089


## Stop the Spark session

In [None]:
spark.stop()