## NYC Job Analysis Program

In [None]:
import findspark

# Let findspark locate the local Spark installation so the `pyspark`
# imports in the following cell can be resolved.
findspark.init()

### All Imports Needed for the Program

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, year, avg, when, max as spark_max, regexp_replace, explode, split, lower, trim, count,length, to_json, from_json,array_distinct,countDistinct, sum as Fsum
from pyspark.sql.types import StructType, StructField,DoubleType, StringType, TimestampType, ArrayType, IntegerType, LongType, FloatType, DateType
import pandas as pd
import matplotlib.pyplot as plt
import logging
import os
import re
import unittest

### Logging Setup

In [4]:
# Configure the root logger at INFO level and create the module-level logger
# that the classes below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

### Spark Session Factory

In [5]:
class SparkFactory:
    """
    Builds the SparkSession used by the rest of the pipeline.

    The session is always created with adaptive query execution enabled and
    Kryo configured as the serializer.
    """

    @staticmethod
    def create(app_name="NYC Jobs Analysis"):
        """
        Build (or reuse) a SparkSession with the pipeline's settings.

        Parameters:
        ----------
        app_name : str, optional
            Application name given to Spark (default is "NYC Jobs Analysis").

        Returns:
        -------
        pyspark.sql.SparkSession
            The session returned by ``getOrCreate()``.
        """
        # Settings are applied in this order, after the application name.
        settings = (
            ("spark.sql.adaptive.enabled", "true"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        )

        builder = SparkSession.builder.appName(app_name)
        for key, value in settings:
            builder = builder.config(key, value)

        return builder.getOrCreate()


### Data Extraction

In [6]:
# Logger used by every class defined after this cell; basicConfig is only
# invoked when this logger has no handlers of its own attached.
logger = logging.getLogger("JobDataExtractor")
if not logger.handlers:
    logging.basicConfig(level=logging.INFO)


class JobDataExtractor:
    """
    Loads the job postings CSV into a Spark DataFrame.

    Attributes:
    ----------
    spark : SparkSession
        Session used to read the file.
    input_path : str
        Location of the CSV file to load.
    """

    def __init__(self, spark, input_path):
        """
        Store the session and the file location for a later ``read()``.

        Parameters:
        ----------
        spark : SparkSession
            The Spark session to use for reading data.
        input_path : str
            The path to the CSV input file.
        """
        self.input_path = input_path
        self.spark = spark

    def read(self):
        """
        Load the CSV at ``input_path``.

        A zero-byte file is reported with a warning and yields ``None``
        without touching Spark. ``os.stat`` is used for that check, so a
        missing path raises the usual ``OSError`` subclass.

        Returns:
        -------
        pyspark.sql.DataFrame or None
            The loaded DataFrame, or None when the file is empty.
        """
        if os.stat(self.input_path).st_size == 0:
            logger.warning("Input file is empty.")
            return None

        # Reader options, applied one by one in this order.
        reader_options = (
            ("header", "true"),          # first row holds column names
            ("inferSchema", "true"),     # let Spark infer column types
            ("multiLine", "true"),       # records may span several lines
            ("quote", "\""),             # quote character for fields
            ("escape", "\""),            # a doubled quote escapes a quote
            ("mode", "PERMISSIVE"),      # keep going on malformed rows
            ("encoding", "ISO-8859-1"),  # file is decoded as Latin-1
        )

        reader = self.spark.read
        for key, value in reader_options:
            reader = reader.option(key, value)
        df = reader.csv(self.input_path)

        logger.info("Raw data loaded successfully.")
        return df


### Data Wrangling

In [7]:
class JobDataWrangler:
    """
    Renames the columns of a Spark DataFrame to camelCase identifiers.

    Attributes:
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame being renamed; updated in place by
        ``sanitize_column_names``.
    """

    def __init__(self, df):
        """
        Keep a reference to the DataFrame whose columns will be renamed.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame whose column names need sanitization.
        """
        self.df = df

    @staticmethod
    def sanitize_column_name(name):
        """
        Turn an arbitrary column header into camelCase.

        Every character that is not a word character (letter, digit or
        underscore) becomes a space, the result is split on whitespace, the
        first word is lower-cased and each following word is capitalized
        (e.g. "Job Title" -> "jobTitle"). When no word remains, the
        space-substituted string is returned as is.

        Parameters:
        ----------
        name : str
            The original column name.

        Returns:
        -------
        str
            The sanitized column name in camelCase.
        """
        cleaned = re.sub(r'[^\w]', ' ', name)
        words = cleaned.split()
        if not words:
            return cleaned

        head, *tail = words
        return head.lower() + ''.join(word.capitalize() for word in tail)

    def sanitize_column_names(self):
        """
        Apply ``sanitize_column_name`` to every column of the DataFrame.

        Returns:
        -------
        pyspark.sql.DataFrame
            The DataFrame with sanitized column names (also stored on
            ``self.df``).
        """
        for current in self.df.columns:
            target = self.sanitize_column_name(current)
            if target == current:
                # Already in the desired form; skip the no-op rename.
                continue
            self.df = self.df.withColumnRenamed(current, target)
        return self.df


### Data Transformation

In [8]:
class JobDataTransformer:
    """
    Transforms raw job data by engineering new features like:
    - Average salary
    - Posting year
    - Classified degree levels
    - Parsed skills list
    """

    def __init__(self, df):
        """
        Initialize with a Spark DataFrame to be transformed.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The job data DataFrame with camelCase column names.
        """
        self.df = df

    @staticmethod
    def classify_degree(text):
        """
        Classifies education level from text into standard categories.

        The text is lower-cased and checked against the degree levels from
        highest to lowest; the first level that matches wins. Every keyword
        must stand alone as a token (not preceded or followed by an ASCII
        letter or digit), so "ma" inside "diploma" or "management" is not
        mistaken for "M.A.". The abbreviation for an Associate in Science is
        only recognised with its dots ("a.s.") because the plain word "as"
        is ordinary English.

        Parameters:
        ----------
        text : str
            The minimum qualification requirement text. None or an empty
            string yields "Other".

        Returns:
        -------
        str
            One of ["PhD", "Masters", "Bachelors", "Associate", "High School", "Other"]
        """
        if not text:
            return "Other"

        text = text.lower()

        # Kept local (and as plain strings) so the function stays
        # self-contained when Spark serializes it as a UDF; `re` caches the
        # compiled patterns between calls.
        # `[^a-z0-9\s]{0,3}` tolerates a straight, curly or mis-decoded
        # apostrophe between the word and its possessive "s".
        patterns = [
            (r"ph\.?d\.?|doctorate|doctoral", "PhD"),
            (r"master[^a-z0-9\s]{0,3}s|master\s+degree|m\.?a\.?|m\.?s\.?|mba", "Masters"),
            (r"bachelor[^a-z0-9\s]{0,3}s|bachelor\s+degree|b\.?a\.?|b\.?s\.?|baccalaureate", "Bachelors"),
            (r"associate[^a-z0-9\s]{0,3}s|associate\s+degree|a\.?a\.?|a\.s\.", "Associate"),
            (r"high school|h\.?s\.?|diploma|ged", "High School"),
        ]

        for alternatives, label in patterns:
            # Lookarounds enforce token boundaries; `\b` is not used because
            # it does not behave as wanted next to the trailing dots.
            bounded = r"(?<![a-z0-9])(?:" + alternatives + r")(?![a-z0-9])"
            if re.search(bounded, text):
                return label

        return "Other"

    @staticmethod
    def extract_skills(text):
        """
        Extracts a list of possible skills from unstructured text.

        Bullet characters are turned into delimiters first, remaining
        non-ASCII runs, tabs and newlines become spaces, and the text is then
        split on "." and ";" and further on "," and "-". Phrases are
        lower-cased and trimmed; phrases of two characters or fewer are
        discarded.

        Parameters:
        ----------
        text : str
            The preferred skills text. None or an empty string yields [].

        Returns:
        -------
        list[str]
            Unique lowercase skill phrases in order of first appearance, so
            the result is reproducible from run to run.
        """
        if not text:
            return []

        # Bullets must become delimiters BEFORE non-ASCII characters are
        # blanked out, otherwise bulleted items merge into one phrase.
        # Besides the Unicode bullets, the pattern covers the forms a bullet
        # takes when the file is decoded as ISO-8859-1 (as the extractor in
        # this file does): the UTF-8 bytes E2 80 A2 and the cp1252 byte 95.
        text = re.sub(r'\u00e2\u0080\u00a2|[\u2022\u2023\u25E6\u2043\u2219\u0095]', ';', text)
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        text = text.replace('\t', ' ').replace('\n', ' ')

        # dict keys give de-duplication with a stable, first-seen order.
        skills = {}
        for sentence in re.split(r'[.;]', text):
            for clause in re.split(r'[,-]', sentence.lower()):
                token = clause.strip()
                if len(token) > 2:
                    skills.setdefault(token, None)

        return list(skills)

    def transform(self):
        """
        Applies transformations:
        - Computes average salary as the midpoint of salaryRangeFrom/To
        - Extracts posting year from postingDate
        - Classifies degree level from minimumQualRequirements
        - Parses preferredSkills into a JSON array string

        Rows whose average salary is null are dropped, and only the columns
        listed in ``selected_columns`` that actually exist are kept. The
        result is cached.

        Returns:
        -------
        pyspark.sql.DataFrame
            Transformed DataFrame with derived features.
        """
        df = self.df

        # Wrap the pure-Python helpers as Spark UDFs
        degree_udf = udf(self.classify_degree, StringType())
        skills_udf = udf(self.extract_skills, ArrayType(StringType()))

        # Derived columns
        # NOTE(review): salaries are averaged as given; if the data mixes
        # hourly/daily/annual figures they are not normalized here - confirm.
        df = df.withColumn("avgSalary", (col("salaryRangeFrom") + col("salaryRangeTo")) / 2)
        df = df.withColumn("postingYear", year(col("postingDate")))
        df = df.withColumn("degreeLevel", degree_udf(col("minimumQualRequirements")))
        # The skills array is stored as a JSON string so it survives CSV output
        df = df.withColumn("skillsJson", to_json(skills_udf(col("preferredSkills"))))

        # Drop records with missing average salary
        df = df.dropna(subset=["avgSalary"])

        # Select relevant columns only (if they exist)
        selected_columns = [
            "agency", "businessTitle", "civilServiceTitle", "jobCategory", "postingDate",
            "salaryRangeFrom", "salaryRangeTo", "avgSalary", "postingYear", "degreeLevel", "skillsJson"
        ]
        df = df.select(*[c for c in selected_columns if c in df.columns])

        df.cache()
        logger.info("Data transformed successfully.")
        return df


### Data Validation

In [9]:
class JobDataValidator:
    """
    Keeps only the rows of a job DataFrame whose key fields are populated.

    Attributes:
    ----------
    df : pyspark.sql.DataFrame
        The Spark DataFrame to validate.
    """

    def __init__(self, df):
        """
        Remember the DataFrame that ``validate`` will filter.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to run validation checks on.
        """
        self.df = df

    def validate(self):
        """
        Drop rows with a null in any required field.

        Returns:
        -------
        pyspark.sql.DataFrame
            A validated DataFrame with non-null 'avgSalary' and 'agency'.
        """
        # Checked in this order: salary first, then agency.
        required_fields = ("avgSalary", "agency")

        checked = self.df
        for field_name in required_fields:
            checked = checked.filter(col(field_name).isNotNull())
        return checked


### Data Loader

In [10]:
class JobDataLoader:
    """
    Persists the refined job data as CSV.

    Attributes:
    ----------
    df : pyspark.sql.DataFrame
        The Spark DataFrame to be saved.
    """

    def __init__(self, df):
        """
        Remember the DataFrame that ``write`` will save.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to output.
        """
        self.df = df

    def write(self, path):
        """
        Save the DataFrame under ``path`` as CSV with a header row,
        replacing whatever is already there.

        Parameters:
        ----------
        path : str
            Output directory path for the CSV file.
        """
        # One partition -> one part file; only sensible for small datasets.
        single_partition = self.df.coalesce(1)

        writer = single_partition.write.mode("overwrite").option("header", "true")
        writer.csv(path)

        logger.info(f"Refined data written to {path}")

### Data Profiler

In [11]:
class JobDataProfiler:
    """
    Profiles a Spark DataFrame for:
    - Schema printing
    - Null value analysis
    - Unique value counts
    - Data type classification

    Attributes:
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame to profile.
    numerical_cols : list
        Columns classified as numerical by the last ``detect_column_types`` call.
    categorical_cols : list
        Columns classified as categorical by the last ``detect_column_types`` call.
    date_cols : list
        Columns classified as dates by the last ``detect_column_types`` call.
    """

    def __init__(self, df):
        """
        Initialize the profiler with a Spark DataFrame.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to analyze.
        """
        self.df = df
        self.numerical_cols = []
        self.categorical_cols = []
        self.date_cols = []

    def print_schema(self):
        """
        Prints the schema of the DataFrame.
        """
        print("Schema:")
        self.df.printSchema()

    def count_nulls(self):
        """
        Prints the count of null values for each column.
        """
        print("Null Counts:")
        # Each null contributes 1 to the sum, so one pass covers all columns.
        null_counts = self.df.select([
            Fsum(col(c).isNull().cast("int")).alias(c) for c in self.df.columns
        ])
        null_counts.show(truncate=False)

    def count_unique(self):
        """
        Prints the count of unique values for each column.

        One Spark job is run per column.
        """
        print("Unique Value Counts:")
        for c in self.df.columns:
            distinct_count = self.df.select(countDistinct(col(c))).collect()[0][0]
            print(f"{c}: {distinct_count} unique values")

    def detect_column_types(self):
        """
        Classifies columns into numerical, categorical, and date types.

        Double, integer, long and float columns are numerical; date and
        timestamp columns are dates; everything else is categorical. The
        three attribute lists are rebuilt from scratch on every call, so
        calling this method repeatedly does not duplicate entries.

        Outputs:
        -------
        Lists of column names by type are printed.
        """
        print("Column Type Classification:")

        # Start from empty lists so repeated calls do not accumulate.
        self.numerical_cols = []
        self.categorical_cols = []
        self.date_cols = []

        for field in self.df.schema.fields:
            if isinstance(field.dataType, (DoubleType, IntegerType, LongType, FloatType)):
                self.numerical_cols.append(field.name)
            elif isinstance(field.dataType, (DateType, TimestampType)):
                self.date_cols.append(field.name)
            else:
                self.categorical_cols.append(field.name)

        print(f"Categorical Columns: {self.categorical_cols}")
        print(f"Numerical Columns: {self.numerical_cols}")
        print(f"Date Columns: {self.date_cols}")

    def run_all(self):
        """
        Runs all profiling steps in sequence.
        """
        self.print_schema()
        self.count_nulls()
        self.count_unique()
        self.detect_column_types()


### KPI Generator

In [12]:
class JobKPI:
    """
    Computes key performance indicators (KPIs) from a transformed Spark DataFrame
    containing job listings and derived fields like avgSalary, degreeLevel, skillsJson, etc.

    Attributes:
    ----------
    df : pyspark.sql.DataFrame
        The Spark DataFrame on which KPI calculations will be run.
    """

    def __init__(self, df):
        """
        Initialize with a Spark DataFrame.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The DataFrame to compute KPIs from.
        """
        self.df = df

    def jobs_per_category(self):
        """
        Returns the top 10 job categories by job posting count.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``jobCategory`` and ``count``, largest count first.
        """
        return (
            self.df.groupBy("jobCategory")
            .count()
            .orderBy(col("count").desc())
            .limit(10)
        )

    def salary_distribution(self):
        """
        Computes average salary per job category.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``jobCategory`` and ``avgSalary``, highest average first.
        """
        return (
            self.df.groupBy("jobCategory")
            .agg(avg("avgSalary").alias("avgSalary"))
            .orderBy(col("avgSalary").desc())
        )

    def degree_vs_salary(self):
        """
        Computes average salary by required degree level.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``degreeLevel`` and ``avgSalary``, highest average first.
        """
        return (
            self.df.groupBy("degreeLevel")
            .agg(avg("avgSalary").alias("avgSalary"))
            .orderBy(col("avgSalary").desc())
        )

    def highest_salary_per_agency(self):
        """
        Finds the highest average salary offered per agency.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``agency`` and ``maxSalary``, highest maximum first.
        """
        return (
            # Cast first so the maximum is taken over numbers, not strings.
            self.df.withColumn("avgSalary", col("avgSalary").cast(DoubleType()))
            .groupBy("agency")
            .agg(spark_max("avgSalary").alias("maxSalary"))
            .orderBy(col("maxSalary").desc())
        )

    def avg_salary_last_2_years(self):
        """
        Calculates the average salary per agency over the last 2 posting years.

        "Last 2 years" means the latest ``postingYear`` present in the data
        and the year before it. When no posting year is available (empty
        DataFrame or all years null) an empty result with the usual columns
        is returned instead of failing.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``agency`` and ``avgSalary``, highest average first.
        """
        # Find latest year in the dataset
        latest_year = self.df.select(spark_max("postingYear").alias("latest_year")).collect()[0]["latest_year"]

        if latest_year is None:
            # max() over no values is null; `None - 1` would raise TypeError.
            recent = self.df.limit(0)
        else:
            recent = self.df.filter(col("postingYear") >= (latest_year - 1))

        return (
            recent.groupBy("agency")
            .agg(avg("avgSalary").alias("avgSalary"))
            .orderBy(col("avgSalary").desc())
        )

    def highest_paid_skills(self):
        """
        Identifies top 10 skills with highest average salary.

        Returns:
        -------
        pyspark.sql.DataFrame
            Columns ``skill``, ``avgSalary`` and ``skillCount`` (number of
            postings mentioning the skill), highest average first.
        """
        # Parse the skills JSON array and normalize
        df_skills = (
            self.df.withColumn("skillsArray", from_json(col("skillsJson"), ArrayType(StringType())))
            .filter(col("skillsArray").isNotNull())
            .withColumn("skill", explode(col("skillsArray")))  # one row per skill
            .withColumn("skill", trim(lower(col("skill"))))
            .filter((col("skill") != "") & (length(col("skill")) > 2))
        )

        # Compute average salary and usage count per skill
        top_skills = (
            df_skills.groupBy("skill")
            .agg(
                avg("avgSalary").alias("avgSalary"),
                count("skill").alias("skillCount")
            )
            .orderBy(col("avgSalary").desc())
            .limit(10)
        )

        return top_skills


### Visualization

In [13]:
class JobVisualizer:
    """
    Generates bar chart visualizations from Spark DataFrames using Matplotlib.

    Attributes:
    ----------
    output_dir : str
        Directory where the charts will be saved.
    """

    def __init__(self, output_dir="/dataset/output/charts/"):
        """
        Initializes the visualizer and ensures the output directory exists.

        Parameters:
        ----------
        output_dir : str, optional
            Directory path for saving plots. Defaults to '/dataset/output/charts/'.
        """
        os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir

    def plot_bar(self, df, x_col, y_col, title, filename):
        """
        Generates a horizontal bar plot from a Spark DataFrame and saves it.

        The DataFrame is collected to pandas, so it should already be reduced
        to chart size. Rows whose value cannot be read as a number are
        skipped; if nothing is left a warning is logged and no file is
        written.

        Parameters:
        ----------
        df : pyspark.sql.DataFrame
            The input DataFrame to visualize.
        x_col : str
            Column holding the category labels (one bar per row, shown along
            the vertical axis).
        y_col : str
            Column holding the numeric values (bar lengths, shown along the
            horizontal axis).
        title : str
            Title of the chart.
        filename : str
            Output file name for saving the chart (PNG format), relative to
            ``output_dir``.
        """

        # Convert Spark DataFrame to Pandas for visualization
        pd_df = df.toPandas()

        # Ensure y_col is numeric and drop missing values
        pd_df[y_col] = pd.to_numeric(pd_df[y_col], errors='coerce')
        pd_df = pd_df.dropna(subset=[y_col])

        if pd_df.empty:
            logger.warning(f"No data to plot for {title}")
            return

        # Dynamic sizing: grow the figure with the number of categories
        num_bars = len(pd_df[x_col].unique())
        fig_height = max(6, num_bars * 0.5)
        fig_width = 20

        fig, ax = plt.subplots(figsize=(fig_width, fig_height))

        # The figure is closed even when plotting or saving fails, so a
        # failed chart does not leave an open figure behind.
        try:
            # Create horizontal bar chart
            pd_df.plot.barh(x=x_col, y=y_col, ax=ax, legend=False, color='skyblue')

            ax.set_title(title, fontsize=16)
            ax.tick_params(axis='y', labelsize=12)

            # Leave room on the left for long category labels
            fig.subplots_adjust(left=0.3, right=0.95)

            # X-axis limit: 10% headroom for the value labels. Skipped when
            # the largest value is not positive, where (0, xmax * 1.1) would
            # be an empty or inverted range.
            xmax = pd_df[y_col].max()
            if xmax > 0:
                ax.set_xlim(0, xmax * 1.1)

            # Annotate bars with values, slightly right of each bar end
            label_offset = abs(xmax) * 0.01
            for i, v in enumerate(pd_df[y_col]):
                ax.text(v + label_offset, i, f"{v:.2f}", color='black', va='center', fontsize=10)

            # Save chart to file
            path = os.path.join(self.output_dir, filename)
            fig.savefig(path, bbox_inches='tight')
        finally:
            plt.close(fig)

        logger.info(f"Chart saved: {path}")


### Raw Data Extractor

In [14]:
# Locations of the raw CSV and of the refined output written later on.
input_path = "/dataset/nyc-jobs.csv"
refined_path =  "/dataset/output/refineddata"

# Create the Spark session and load the raw postings.
# raw_df is None when the input file is empty (see JobDataExtractor.read).
spark = SparkFactory.create()
extractor = JobDataExtractor(spark, input_path)
raw_df = extractor.read()

INFO:JobDataExtractor:Raw data loaded successfully.


### Data Profiling

In [15]:
# Profile the raw data only when the extractor actually returned a DataFrame.
if raw_df is not None:
    profiler = JobDataProfiler(raw_df)
    # Prints schema, null counts, distinct counts and the column type split.
    profiler.run_all()

Schema:
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Wo

### Data Wrangling

In [16]:
    # Rename the raw column headers to camelCase (e.g. "Job Category" -> "jobCategory").
    wrangler = JobDataWrangler(raw_df)
    df_cleaned = wrangler.sanitize_column_names()

### Calling Data Enrichment Utility

In [17]:
    # Derive avgSalary, postingYear, degreeLevel and skillsJson from the cleaned data.
    transformer = JobDataTransformer(df_cleaned)
    df_transformed = transformer.transform()

INFO:JobDataExtractor:Data transformed successfully.


### Calling Data Validator Utility

In [19]:
    # Keep only rows that have both an average salary and an agency.
    validator = JobDataValidator(df_transformed)
    validated_df = validator.validate()

### Calling Data Loader Utility

In [20]:
    # Write the validated data to refined_path as CSV (overwrites earlier output).
    loader = JobDataLoader(validated_df)
    loader.write(refined_path)

INFO:JobDataExtractor:Refined data written to /dataset/output/refineddata


### Calling Data Visualizer Utility

In [21]:
    # KPI queries run on the validated data; charts go to the visualizer's
    # default output directory.
    kpi = JobKPI(validated_df)
    visualizer = JobVisualizer()

### Generate charts

In [22]:
    # Chart: posting counts for the ten most frequent job categories.
    visualizer.plot_bar(kpi.jobs_per_category(), "jobCategory", "count", "Top 10 Job Categories", "jobs_per_category.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/jobs_per_category.png


In [23]:
    # Chart: average salary for every job category.
    visualizer.plot_bar(kpi.salary_distribution(), "jobCategory", "avgSalary", "Avg Salary by Job Category", "salary_distribution.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/salary_distribution.png


In [24]:
    # Chart: average salary for each classified degree level.
    visualizer.plot_bar(kpi.degree_vs_salary(), "degreeLevel", "avgSalary", "Avg Salary by Degree Level", "degree_vs_salary.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/degree_vs_salary.png


In [25]:
    # Chart: highest avgSalary value found in each agency.
    visualizer.plot_bar(kpi.highest_salary_per_agency(), "agency", "maxSalary", "Max Salary per Agency", "max_salary.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/max_salary.png


In [26]:
    # Chart: average salary per agency over the latest posting year and the one before it.
    visualizer.plot_bar(kpi.avg_salary_last_2_years(), "agency", "avgSalary", "Avg Salary (Last 2 Years)", "avg_salary_2y.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/avg_salary_2y.png


In [27]:
    # Chart: skills with the highest average salary. highest_paid_skills()
    # returns at most 10 rows, so the title says "Top 10".
    visualizer.plot_bar(kpi.highest_paid_skills(), "skill", "avgSalary", "Top 10 Highest Paid Skills", "highest_paid_skills.png")

INFO:JobDataExtractor:Chart saved: /dataset/output/charts/highest_paid_skills.png


## Unit Test Cases

In [30]:
# Create dummy DataFrame
# Single-row fixture shared by the unit tests below. Column names are already
# in the camelCase form the transformer expects.
data = [(
    "Agency A",
    "Software Engineer",
    "Civil Title",
    "Tech",
    "2023-06-01",
    60000,
    80000,
    "Bachelor's degree required",
    "Python; SQL",
)]

# (column name, Spark type) pairs in column order; every field is nullable.
_fixture_fields = [
    ("agency", StringType()),
    ("businessTitle", StringType()),
    ("civilServiceTitle", StringType()),
    ("jobCategory", StringType()),
    ("postingDate", StringType()),
    ("salaryRangeFrom", IntegerType()),
    ("salaryRangeTo", IntegerType()),
    ("minimumQualRequirements", StringType()),
    ("preferredSkills", StringType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in _fixture_fields
])

df = spark.createDataFrame(data, schema)

In [37]:

class TestJobPipelineComponents(unittest.TestCase):
    """
    Smoke tests for each pipeline stage, run against the one-row fixture
    DataFrame ``df`` and the notebook's ``spark`` session.
    """

    def test_spark_factory(self):
        """The factory returns a SparkSession."""
        session = SparkFactory.create("TestApp")
        self.assertIsInstance(session, SparkSession)

    def test_job_data_extractor(self):
        """A CSV written from the fixture can be read back with columns."""
        # Write the sample to its own file: the test must never overwrite
        # the real input dataset at /dataset/nyc-jobs.csv.
        sample_dir = "/dataset/output/test_input"
        os.makedirs(sample_dir, exist_ok=True)
        path = os.path.join(sample_dir, "nyc-jobs-sample.csv")
        df.toPandas().to_csv(path, index=False)

        extractor = JobDataExtractor(spark, path)
        result = extractor.read()
        self.assertIsNotNone(result)
        self.assertGreater(len(result.columns), 0)

    def test_job_data_wrangler(self):
        """Sanitized column names contain word characters only."""
        wrangler = JobDataWrangler(df)
        result = wrangler.sanitize_column_names()
        for col_name in result.columns:
            self.assertIsNone(re.search(r"[^\w]", col_name))

    def test_job_data_transformer(self):
        """The transformer adds the derived feature columns."""
        transformer = JobDataTransformer(df)
        result = transformer.transform()
        self.assertIn("avgSalary", result.columns)
        self.assertIn("degreeLevel", result.columns)
        self.assertIn("skillsJson", result.columns)

    def test_job_data_validator(self):
        """The complete fixture row survives validation."""
        transformer = JobDataTransformer(df)
        transformed = transformer.transform()
        validator = JobDataValidator(transformed)
        validated = validator.validate()
        self.assertTrue(validated.count() > 0)

    def test_job_data_loader(self):
        """Writing creates the output location."""
        transformer = JobDataTransformer(df)
        transformed = transformer.transform()
        loader = JobDataLoader(transformed)
        output_path = "/dataset/output/testdata"
        loader.write(output_path)
        self.assertTrue(os.path.exists(output_path))

    def test_job_data_profiler(self):
        """Type detection classifies at least one column."""
        profiler = JobDataProfiler(df)
        profiler.detect_column_types()
        self.assertGreaterEqual(len(profiler.categorical_cols + profiler.numerical_cols + profiler.date_cols), 1)


In [38]:
class TestJobKPI(unittest.TestCase):
    """
    Checks that every KPI query exposes the expected output columns when run
    on the transformed fixture DataFrame.
    """

    def setUp(self):
        """Transform the fixture and wrap it in a JobKPI for each test."""
        self.transformed_df = JobDataTransformer(df).transform()
        self.kpi = JobKPI(self.transformed_df)

    def _assert_has_columns(self, result, *expected):
        """Assert, in order, that each expected column is in ``result``."""
        for column_name in expected:
            self.assertIn(column_name, result.columns)

    def test_jobs_per_category(self):
        self._assert_has_columns(self.kpi.jobs_per_category(), "jobCategory", "count")

    def test_salary_distribution(self):
        self._assert_has_columns(self.kpi.salary_distribution(), "jobCategory", "avgSalary")

    def test_degree_vs_salary(self):
        self._assert_has_columns(self.kpi.degree_vs_salary(), "degreeLevel", "avgSalary")

    def test_highest_salary_per_agency(self):
        self._assert_has_columns(self.kpi.highest_salary_per_agency(), "agency", "maxSalary")

    def test_avg_salary_last_2_years(self):
        self._assert_has_columns(self.kpi.avg_salary_last_2_years(), "agency", "avgSalary")

    def test_highest_paid_skills(self):
        self._assert_has_columns(
            self.kpi.highest_paid_skills(), "skill", "avgSalary", "skillCount"
        )

In [39]:
# Collect the pipeline component tests and run them with the text runner;
# the run result is the cell's final expression.
pipeline_suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestJobPipelineComponents)
unittest.TextTestRunner().run(pipeline_suite)

INFO:JobDataExtractor:Raw data loaded successfully.
.INFO:JobDataExtractor:Data transformed successfully.
INFO:JobDataExtractor:Refined data written to /dataset/output/testdata
..

Column Type Classification:
Categorical Columns: ['agency', 'businessTitle', 'civilServiceTitle', 'jobCategory', 'postingDate', 'minimumQualRequirements', 'preferredSkills']
Numerical Columns: ['salaryRangeFrom', 'salaryRangeTo']
Date Columns: []


INFO:JobDataExtractor:Data transformed successfully.
.INFO:JobDataExtractor:Data transformed successfully.
...
----------------------------------------------------------------------
Ran 7 tests in 23.721s

OK


<unittest.runner.TextTestResult run=7 errors=0 failures=0>