In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, FloatType, IntegerType, DateType
import pyspark.sql.functions as F
import re
import yaml

# Path to the YAML file that holds the Spark, CSV and Postgres settings
yaml_file_path = 'config.yaml'

# Parse the settings once; the resulting dict is read by the ETL functions below
with open(yaml_file_path, 'r') as yaml_file:
    config = yaml.safe_load(yaml_file)

# Create (or reuse) the local Spark session, with the jar(s) listed in the
# config added through spark.jars
spark = (
    SparkSession.builder
    .appName("final_project")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.jars", config['spark']['path'])
    .getOrCreate()
)

23/11/01 12:17:53 WARN Utils: Your hostname, FM-PC-LT-176 resolves to a loopback address: 127.0.1.1; using 172.16.5.4 instead (on interface wlo1)
23/11/01 12:17:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/11/01 12:17:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
def extract():
    """Read the raw CSV named in the YAML config into a Spark DataFrame.

    The path comes from ``config['csv']['path']``. The first row is used as
    the header and ``inferSchema=False`` is passed, so no column types are
    inferred at read time.

    Returns:
        DataFrame: the data exactly as returned by ``spark.read.csv``.

    Raises:
        Exception: if the config lookup or the read fails. The underlying
            error is attached as ``__cause__`` so its traceback is kept.
    """
    try:
        csv_path = config['csv']['path']
        return spark.read.csv(csv_path, header=True, inferSchema=False)
    except Exception as e:
        # The shared Spark session is not stopped here, so the cell can be
        # re-run after the problem is fixed. `from e` keeps the root cause.
        raise Exception(f"An error occurred during data extraction: {str(e)}") from e

In [3]:
# Load the raw data and preview its first three rows
df = extract()
df.show(n=3)

+---------------+-------------+--------------+------------+--------+---------------+--------+---------+---------+------------+----------------+----------+--------------+----------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         Job Id|   experience|qualifications|Salary Range|location|        country|latitude|longitude|Work Type|Company Size|Job Posting Date|preference|Contact Person|         contact|           Job Title|                Role|         Job Portal|     Job Description|            benefits|              skills|    responsibilities|             company|     Company Profile|
+---------------+-------------+--------------+------------+--------+---------------+--------+---------+---------+------------+----------------+----------+--------------+----------------+--------------------+--------------------+-------------------+--

In [2]:
def clean():
    """Extract the raw data, drop unused columns and apply typed columns.

    Steps:
        1. Drop the contact, benefits and company-profile columns.
        2. Cast ``Job Id`` to long, ``latitude``/``longitude`` to float,
           ``Company Size`` to integer and ``Job Posting Date`` to date.
        3. Replace negative company sizes with their absolute value.

    Returns:
        DataFrame: the cleaned data.

    Raises:
        Exception: if extraction or any cleaning step fails. The underlying
            error is attached as ``__cause__``.
    """
    try:
        raw_df = extract()

        # Columns that are not needed downstream; names match the CSV header
        # exactly so the drop does not depend on case-insensitive resolution.
        dropped = ["Contact Person", "contact", "benefits", "Company Profile"]
        trimmed_df = raw_df.drop(*dropped)

        cleaned_df = (
            trimmed_df
            .withColumn("Job Id", F.col("Job Id").cast(LongType()))
            .withColumn("latitude", F.col("latitude").cast(FloatType()))
            .withColumn("longitude", F.col("longitude").cast(FloatType()))
            # Cast first, then take the absolute value: the column is read
            # without schema inference, so the sign fix must not be done on
            # the raw text value.
            .withColumn("Company Size", F.abs(F.col("Company Size").cast(IntegerType())))
            .withColumn("Job Posting Date", F.col("Job Posting Date").cast(DateType()))
        )

        return cleaned_df
    except Exception as e:
        # The shared Spark session is left running; `from e` keeps the cause.
        raise Exception(f"An error occurred during data cleaning: {str(e)}") from e

In [21]:
#udf to calculate avg
def calculate_average(range_str):
    """Return the midpoint of a two-number range written as text.

    Every run of digits in ``range_str`` is extracted; the text counts as a
    range only when exactly two runs are found (e.g. ``"$59K-$99K"`` gives
    59 and 99).

    Args:
        range_str: text containing the range, or ``None`` for a missing value
            (a Spark UDF receives ``None`` for null cells).

    Returns:
        float | None: ``(lower + upper) / 2`` when exactly two numbers are
        present, otherwise ``None``.
    """
    # A null cell must yield null rather than make re.findall raise TypeError
    if range_str is None:
        return None

    numbers = re.findall(r'\d+', range_str)
    if len(numbers) != 2:
        return None

    lower, upper = (int(value) for value in numbers)
    return (lower + upper) / 2


In [3]:
def _label_by_quartiles(df, value_col, label_col, labels):
    """Add ``label_col`` classifying ``value_col`` against its quartiles.

    Args:
        df: DataFrame containing the numeric column ``value_col``.
        value_col: name of the numeric column to classify.
        label_col: name of the column to add.
        labels: three labels, used for values ``<= q1``, values in
            ``(q1, q3]`` and values ``> q3`` respectively.

    Returns:
        DataFrame: ``df`` plus ``label_col``. Rows matching none of the three
        conditions (e.g. a null value) are labelled ``"Uncategorized"``.
    """
    quartiles = df.stat.approxQuantile(value_col, [0.25, 0.75], 0.0)
    if len(quartiles) != 2:
        # No quartiles could be computed (e.g. the column holds no non-null
        # values), so there is nothing to compare against.
        return df.withColumn(label_col, F.lit("Uncategorized"))

    q1, q3 = quartiles
    low_label, mid_label, high_label = labels
    value = F.col(value_col)
    return df.withColumn(
        label_col,
        F.when(value <= q1, low_label)
        .when((value > q1) & (value <= q3), mid_label)
        .when(value > q3, high_label)
        .otherwise("Uncategorized"),
    )


def transform():
    """Clean the data and derive the analytical columns.

    Added or changed columns:
        * ``Average Salary``: midpoint of ``Salary Range`` (float, null when
          the range cannot be parsed).
        * ``Preference``: the value ``"Both"`` becomes ``"Male or Female"``.
        * ``CompanyTier``: ``Company Size`` bucketed by its quartiles.
        * ``SalaryLevel``: ``Average Salary`` bucketed by its quartiles.
        * ``QualificationCategory``: derived from the first letter of
          ``qualifications``.

    Returns:
        DataFrame: the cleaned data with the columns above.

    Raises:
        Exception: if cleaning or any transformation step fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        cleaned_df = clean()

        # A numeric return type is declared explicitly: F.udf defaults to a
        # string column, which approxQuantile cannot process.
        calculate_average_udf = F.udf(calculate_average, FloatType())
        new_df = cleaned_df.withColumn(
            "Average Salary", calculate_average_udf(F.col("Salary Range"))
        )

        # Changing gender preference from "both" to "Male or Female"
        new_df = new_df.withColumn(
            "Preference",
            F.when(F.col("Preference") == "Both", "Male or Female")
            .otherwise(F.col("Preference")),
        )

        # Dividing companies into tiers according to the company size
        new_df = _label_by_quartiles(
            new_df,
            "Company Size",
            "CompanyTier",
            ("Tier-1 (Low)", "Tier-2 (Medium)", "Tier-3 (High)"),
        )

        # Dividing jobs into three categories according to their salary
        new_df = _label_by_quartiles(
            new_df,
            "Average Salary",
            "SalaryLevel",
            ("Low Pay", "Average Pay", "High Pay"),
        )

        # Differentiate qualifications according to their initials
        qualification = F.col("qualifications")
        new_df = new_df.withColumn(
            "QualificationCategory",
            F.when(qualification.startswith("M"), "Masters")
            .when(qualification.startswith("B"), "Bachelors")
            .when(qualification.startswith("P"), "PhD")
            .otherwise("Uncategorized"),
        )

        return new_df

    except Exception as e:
        # The shared Spark session is left running; `from e` keeps the cause.
        raise Exception(f"An error occurred during data transformation: {str(e)}") from e

In [4]:
def load():
    """Run the transformation and write the result to Postgres over JDBC.

    Connection settings (``url``, ``driver``, ``dbtable``, ``user``,
    ``password``) are read from ``config['postgres']``. The write uses
    ``overwrite`` mode.

    Returns:
        DataFrame: the transformed data that was written.

    Raises:
        Exception: if the transformation or the write fails. The underlying
            error is attached as ``__cause__``.
    """
    try:
        new_df = transform()

        # Load the clean data in postgres
        pg = config['postgres']
        (
            new_df.write.format('jdbc')
            .options(
                url=pg['url'],
                driver=pg['driver'],
                dbtable=pg['dbtable'],
                user=pg['user'],
                password=pg['password'],
            )
            .mode('overwrite')
            .save()
        )
        return new_df
    except Exception as e:
        # The shared Spark session is left running; `from e` keeps the cause.
        raise Exception(f"An error occurred during loading the data: {str(e)}") from e
