In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, FloatType, IntegerType, DateType
import pyspark.sql.functions as F
import re



spark = SparkSession \
    .builder \
    .appName("final_project") \
    .master("local[*]")\
    .config("spark.executor.memory", "2g")\
    .config("spark.jars", "/usr/lib/jvm/java-11-openjdk-amd64/lib/postgresql-42.5.0.jar") \
    .getOrCreate()

In [27]:
def extract():
        
    #csv path
    csv = "/home/ubuntu/Desktop/final_project/Raw_Data/job_descriptions.csv"
    #Read raw_data
    df = spark.read.csv(csv, header=True, inferSchema=False)
    return df


In [28]:
#udf to calculate avg
def calculate_average(range_str):
    # Use regular expression to extract numbers
    numbers = re.findall(r'\d+', range_str)
    if len(numbers) == 2:
        lower = int(numbers[0])
        upper = int(numbers[1])
        avg = (lower + upper) / 2
        return avg
    else:
        return None


In [29]:
def transform():
    df=extract()
    #List of columns to drop 
    dropped = ["Contact Person", "Contact", "Benefits","Company Profile"]
    #Drop column
    df = df.drop(*dropped)
    #changing to standard datatype
    df = df.withColumn("Job Id", df["Job Id"].cast(LongType()))\
        .withColumn("latitude", df["latitude"].cast(FloatType()))\
         .withColumn("longitude", df["longitude"].cast(FloatType()))\
          .withColumn("Company Size", df["Company Size"].cast(IntegerType()))\
           .withColumn("Job Posting Date", df["Job Posting Date"].cast(DateType()))
    calculate_average_udf = F.udf(calculate_average)

    # Add a new column with the calculated average
    new_df = df.withColumn("Average", calculate_average_udf(df["Salary Range"]))
    new_df.withColumnRenamed("Average","Average Salary").printSchema()
    return new_df

In [30]:
def load():
    new_df=transform()
    ##Load the clean data in postgres
    new_df.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/final_project',driver = 'org.postgresql.Driver', dbtable = 'job_description_clean', user="postgres",password="postgres" ).mode('overwrite').save()
    return new_df

In [36]:
dff=load()
dff.show(3)

23/10/27 16:08:20 INFO InMemoryFileIndex: It took 5 ms to list leaf files for 1 paths.
23/10/27 16:08:20 INFO InMemoryFileIndex: It took 1 ms to list leaf files for 1 paths.
23/10/27 16:08:20 INFO PythonUDFRunner: Times: total = 2668, boot = 77, init = 187, finish = 2404
23/10/27 16:08:20 INFO PythonUDFRunner: Times: total = 3141, boot = 210, init = 302, finish = 2629
23/10/27 16:08:20 INFO PythonUDFRunner: Times: total = 3006, boot = 105, init = 133, finish = 2768
23/10/27 16:08:20 INFO Executor: Finished task 7.0 in stage 16.0 (TID 80). 2216 bytes result sent to driver
23/10/27 16:08:20 INFO TaskSetManager: Finished task 7.0 in stage 16.0 (TID 80) in 16596 ms on 172.16.5.112 (executor driver) (1/16)
23/10/27 16:08:20 INFO Executor: Finished task 1.0 in stage 16.0 (TID 74). 2216 bytes result sent to driver
23/10/27 16:08:20 INFO TaskSetManager: Finished task 1.0 in stage 16.0 (TID 74) in 16608 ms on 172.16.5.112 (executor driver) (2/16)
23/10/27 16:08:20 INFO Executor: Finished task 3

root
 |-- Job Id: long (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Qualifications: string (nullable = true)
 |-- Salary Range: string (nullable = true)
 |-- location: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- Work Type: string (nullable = true)
 |-- Company Size: integer (nullable = true)
 |-- Job Posting Date: date (nullable = true)
 |-- Preference: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Role: string (nullable = true)
 |-- Job Portal: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- skills: string (nullable = true)
 |-- Responsibilities: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Average Salary: string (nullable = true)



23/10/27 16:08:21 INFO PythonUDFRunner: Times: total = 2565, boot = 12, init = 153, finish = 2400
23/10/27 16:08:21 INFO Executor: Finished task 6.0 in stage 16.0 (TID 79). 2216 bytes result sent to driver
23/10/27 16:08:21 INFO TaskSetManager: Finished task 6.0 in stage 16.0 (TID 79) in 17540 ms on 172.16.5.112 (executor driver) (16/16)
23/10/27 16:08:21 INFO TaskSchedulerImpl: Removed TaskSet 16.0, whose tasks have all completed, from pool 
23/10/27 16:08:21 INFO DAGScheduler: ResultStage 16 (save at NativeMethodAccessorImpl.java:0) finished in 17.595 s
23/10/27 16:08:21 INFO DAGScheduler: Job 13 is finished. Cancelling potential speculative or zombie tasks for this job
23/10/27 16:08:21 INFO TaskSchedulerImpl: Killing all running tasks in stage 16: Stage finished
23/10/27 16:08:21 INFO DAGScheduler: Job 13 finished: save at NativeMethodAccessorImpl.java:0, took 17.600803 s
23/10/27 16:08:22 INFO SparkContext: Starting job: save at NativeMethodAccessorImpl.java:0
23/10/27 16:08:22 IN

+----------------+-------------+--------------+------------+--------+----------------+--------+---------+---------+------------+----------------+----------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-------+
|          Job Id|   Experience|Qualifications|Salary Range|location|         Country|latitude|longitude|Work Type|Company Size|Job Posting Date|Preference|           Job Title|                Role|  Job Portal|     Job Description|              skills|    Responsibilities|             Company|Average|
+----------------+-------------+--------------+------------+--------+----------------+--------+---------+---------+------------+----------------+----------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-------+
|1089843540111562|5 to 15 Years|        M.Tech|   $59K-$99K| Douglas|     Isle of Man| 5

23/10/27 16:08:39 INFO Executor: 1 block locks were not released by task 0.0 in stage 19.0 (TID 106)
[rdd_13_0]
23/10/27 16:08:39 INFO Executor: Finished task 0.0 in stage 19.0 (TID 106). 4959 bytes result sent to driver
23/10/27 16:08:39 INFO TaskSetManager: Finished task 0.0 in stage 19.0 (TID 106) in 87 ms on 172.16.5.112 (executor driver) (1/1)
23/10/27 16:08:39 INFO TaskSchedulerImpl: Removed TaskSet 19.0, whose tasks have all completed, from pool 
23/10/27 16:08:39 INFO DAGScheduler: ResultStage 19 (showString at NativeMethodAccessorImpl.java:0) finished in 0.099 s
23/10/27 16:08:39 INFO DAGScheduler: Job 16 is finished. Cancelling potential speculative or zombie tasks for this job
23/10/27 16:08:39 INFO TaskSchedulerImpl: Killing all running tasks in stage 19: Stage finished
23/10/27 16:08:39 INFO DAGScheduler: Job 16 finished: showString at NativeMethodAccessorImpl.java:0, took 0.104658 s


In [32]:
df.persist()

23/10/27 16:06:15 WARN CacheManager: Asked to cache already cached data.


DataFrame[Job Id: string, Experience: string, Qualifications: string, Salary Range: string, location: string, Country: string, latitude: string, longitude: string, Work Type: string, Company Size: string, Job Posting Date: string, Preference: string, Contact Person: string, Contact: string, Job Title: string, Role: string, Job Portal: string, Job Description: string, Benefits: string, skills: string, Responsibilities: string, Company: string, Company Profile: string]