In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, FloatType, IntegerType, DateType
import pyspark.sql.functions as F
import re
import yaml

# Path to the YAML settings file that drives the rest of the notebook.
yaml_file_path = 'config.yaml'

# Parse the settings once; later cells read from the resulting `config` mapping.
with open(yaml_file_path, 'r') as yaml_file:
    config = yaml.safe_load(yaml_file)

# Create (or reuse) the Spark session. The extra jar listed under
# config['spark']['path'] is handed to Spark through "spark.jars".
spark = (
    SparkSession.builder
    .appName("final_project")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.jars", config['spark']['path'])
    .getOrCreate()
)

In [29]:
def extract(spark):
    """Read the raw CSV named in the configuration into a DataFrame.

    Args:
        spark: The SparkSession used to perform the read.

    Returns:
        The DataFrame produced by ``spark.read.csv`` for the file at
        ``config['csv']['path']``, read with ``header=True`` and
        ``inferSchema=False``.
    """
    # Location of the raw data, taken from the module-level config mapping.
    source_path = config['csv']['path']
    # Schema inference is switched off here; typing is applied later in clean().
    return spark.read.csv(source_path, header=True, inferSchema=False)


In [31]:
def clean(spark):
    """Extract the raw data and apply the basic cleaning steps.

    Steps, in order:
      1. Negative "Company Size" values are replaced by their negation.
      2. The contact/benefit/profile columns are dropped.
      3. Id, coordinate, size and posting-date columns are cast to fixed types.

    Args:
        spark: The SparkSession passed through to ``extract``.

    Returns:
        The cleaned DataFrame.
    """
    raw = extract(spark)

    # Flip negative company sizes to positive; other values pass through.
    size = F.col("Company Size")
    positive_sizes = raw.withColumn(
        "Company Size",
        F.when(size < 0, -size).otherwise(size),
    )

    # Columns that are not carried forward.
    unwanted = ("Contact Person", "Contact", "Benefits", "Company Profile")
    trimmed = positive_sizes.drop(*unwanted)

    # Target type for each column that needs a standard datatype.
    target_types = {
        "Job Id": LongType(),
        "latitude": FloatType(),
        "longitude": FloatType(),
        "Company Size": IntegerType(),
        "Job Posting Date": DateType(),
    }
    typed = trimmed
    for column_name, spark_type in target_types.items():
        # Each cast reads the column from the pre-cast frame, so casts are independent.
        typed = typed.withColumn(column_name, trimmed[column_name].cast(spark_type))
    return typed


In [21]:
#udf to calculate avg
def calculate_average(range_str):
    """Return the midpoint of a two-number range embedded in a string.

    Every run of digits in ``range_str`` is extracted; when exactly two runs
    are found they are treated as the lower and upper bound and their
    arithmetic mean is returned (e.g. ``"$59K-$99K"`` -> ``79.0``).

    Args:
        range_str: Text containing the range, or ``None``.

    Returns:
        The mean of the two numbers as a float, or ``None`` when the input is
        ``None`` or does not contain exactly two digit runs.
    """
    # A Spark UDF is handed None for null cells; re.findall would raise on it.
    if range_str is None:
        return None

    # Use a regular expression to pull out every run of digits.
    numbers = re.findall(r'\d+', range_str)
    if len(numbers) != 2:
        return None

    lower, upper = (int(text) for text in numbers)
    return (lower + upper) / 2


In [33]:
def transform():
    """Add an "Average Salary" column derived from "Salary Range".

    Uses the module-level ``spark`` session to obtain the cleaned data from
    ``clean`` and applies ``calculate_average`` to each "Salary Range" value
    through a UDF. The schema of the result is printed.

    Returns:
        The cleaned DataFrame with an extra float column "Average Salary"
        (null where ``calculate_average`` returns ``None``).
    """
    # extract()/clean() require the session argument; calling them without it
    # raises TypeError. clean() is used so the salary column is added on top of
    # the cleaned data rather than the raw CSV.
    df = clean(spark)

    # Declare the return type explicitly: without it F.udf produces a string column.
    calculate_average_udf = F.udf(calculate_average, FloatType())

    # Create the column under its final name and keep the result
    # (DataFrames are immutable, so a discarded rename has no effect).
    new_df = df.withColumn("Average Salary", calculate_average_udf(df["Salary Range"]))
    new_df.printSchema()
    return new_df

In [17]:
def load():
    """Write the transformed data to the configured Postgres table over JDBC.

    The connection settings (url, driver, dbtable, user, password) are read
    from ``config['postgres']``. The write uses mode 'overwrite'.

    Returns:
        The DataFrame returned by ``transform`` that was written.
    """
    result = transform()

    # Connection settings come from the module-level config mapping.
    pg = config['postgres']
    jdbc_writer = result.write.format('jdbc').options(
        url=pg['url'],
        driver=pg['driver'],
        dbtable=pg['dbtable'],
        user=pg['user'],
        password=pg['password'],
    )
    # Load the clean data in postgres.
    jdbc_writer.mode('overwrite').save()
    return result

In [34]:
# Build the cleaned frame explicitly so this cell does not rely on a `df`
# left over in the kernel from an earlier, no-longer-present cell.
df = clean(spark)
df.persist()

23/10/31 15:09:47 INFO FileSourceStrategy: Pushed Filters: 
23/10/31 15:09:47 INFO FileSourceStrategy: Post-Scan Filters: 


DataFrame[Job Id: bigint, experience: string, qualifications: string, Salary Range: string, location: string, country: string, latitude: float, longitude: float, Work Type: string, Company Size: int, Job Posting Date: date, preference: string, Job Title: string, Role: string, Job Portal: string, Job Description: string, skills: string, responsibilities: string, company: string]