In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, FloatType, IntegerType, DateType
import pyspark.sql.functions as F
import re
import yaml

In [2]:
# Define the path to your YAML file
yaml_file_path = 'config.yaml'

# Read the YAML file and parse it into a Python dictionary
with open(yaml_file_path, 'r') as yaml_file:
    config = yaml.safe_load(yaml_file)

In [3]:
spark = SparkSession \
    .builder \
    .appName("final_project") \
    .config("spark.jars", config['spark']["path"]) \
    .getOrCreate()

23/10/31 11:20:25 WARN Utils: Your hostname, kushal-Latitude-E5440 resolves to a loopback address: 127.0.1.1; using 172.16.5.134 instead (on interface wlp2s0)
23/10/31 11:20:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/31 11:20:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
def extract():
        
    #csv path
    csv = config['csv']["path"]
    #Read raw_data
    df = spark.read.csv(csv, header=True, inferSchema=False)
    return df

In [5]:
#udf to calculate avg
def calculate_average(range_str):
    # Use regular expression to extract numbers
    numbers = re.findall(r'\d+', range_str)
    if len(numbers) == 2:
        lower = int(numbers[0])
        upper = int(numbers[1])
        avg = (lower + upper) / 2
        return avg
    else:
        return None


In [6]:
def transform():
    df=extract()
    #List of columns to drop 
    dropped = ["Contact Person", "Contact", "Benefits","Company Profile"]
    #Drop column
    df = df.drop(*dropped)
    #changing to standard datatype
    df = df.withColumn("Job Id", df["Job Id"].cast(LongType()))\
        .withColumn("latitude", df["latitude"].cast(FloatType()))\
         .withColumn("longitude", df["longitude"].cast(FloatType()))\
          .withColumn("Company Size", df["Company Size"].cast(IntegerType()))\
           .withColumn("Job Posting Date", df["Job Posting Date"].cast(DateType()))
    calculate_average_udf = F.udf(calculate_average)

    # Add a new column with the calculated average
    new_df = df.withColumn("Average", calculate_average_udf(df["Salary Range"]))
    new_df.withColumnRenamed("Average","Average Salary").printSchema()
    return new_df

In [7]:
def load():
    new_df=transform()
    ##Load the clean data in postgres
    new_df.write.format('jdbc').options(url=config['postgres']["url"],driver = config['postgres']["driver"], dbtable = config['postgres']["dbtable"], user=config['postgres']["user"],password=config['postgres']["password"]).mode('overwrite').save()
    return new_df

In [8]:
dff=load()
dff.show(3)

                                                                                

root
 |-- Job Id: long (nullable = true)
 |-- experience: string (nullable = true)
 |-- qualifications: string (nullable = true)
 |-- Salary Range: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- Work Type: string (nullable = true)
 |-- Company Size: integer (nullable = true)
 |-- Job Posting Date: date (nullable = true)
 |-- preference: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Role: string (nullable = true)
 |-- Job Portal: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- skills: string (nullable = true)
 |-- responsibilities: string (nullable = true)
 |-- company: string (nullable = true)
 |-- Average Salary: string (nullable = true)



                                                                                

+---------------+-------------+--------------+------------+--------+---------------+--------+---------+---------+------------+----------------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------+
|         Job Id|   experience|qualifications|Salary Range|location|        country|latitude|longitude|Work Type|Company Size|Job Posting Date|preference|           Job Title|                Role|         Job Portal|     Job Description|              skills|    responsibilities|             company|Average|
+---------------+-------------+--------------+------------+--------+---------------+--------+---------+---------+------------+----------------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------+
|750556001849337|1 to 15 Years|           MBA|   $61K-$85K|    Suva|     

In [32]:
dff.persist()

23/10/27 16:06:15 WARN CacheManager: Asked to cache already cached data.


DataFrame[Job Id: string, Experience: string, Qualifications: string, Salary Range: string, location: string, Country: string, latitude: string, longitude: string, Work Type: string, Company Size: string, Job Posting Date: string, Preference: string, Contact Person: string, Contact: string, Job Title: string, Role: string, Job Portal: string, Job Description: string, Benefits: string, skills: string, Responsibilities: string, Company: string, Company Profile: string]