In [2]:
import findspark

# Must run before the pyspark imports in the following cell.
# NOTE(review): findspark.init() presumably locates the local Spark install and
# makes pyspark importable — confirm against the findspark documentation.
findspark.init()

In [None]:
import json
import os
import re
import tempfile
import unittest
from unittest.mock import MagicMock, patch

import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_json, udf, explode, avg, count, trim, lower, length, max as spark_max
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DoubleType, ArrayType,
    LongType, FloatType, DateType, TimestampType
)

# Local Spark session (all available cores) used by the fixture and test cells
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TestSession")
    .getOrCreate()
)


In [None]:
# Single-row fixture DataFrame: every column is nullable; the two salary
# bounds are integers and everything else (including postingDate) is a string.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("agency", StringType),
        ("businessTitle", StringType),
        ("civilServiceTitle", StringType),
        ("jobCategory", StringType),
        ("postingDate", StringType),
        ("salaryRangeFrom", IntegerType),
        ("salaryRangeTo", IntegerType),
        ("minimumQualRequirements", StringType),
        ("preferredSkills", StringType),
    )
])
data = [
    (
        "Agency A",
        "Software Engineer",
        "Civil Title",
        "Tech",
        "2023-06-01",
        60000,
        80000,
        "Bachelor's degree required",
        "Python; SQL",
    ),
]
df = spark.createDataFrame(data, schema)


In [None]:

class TestJobPipelineComponents(unittest.TestCase):

    def test_spark_factory(self):
        session = SparkFactory.create("TestApp")
        self.assertIsInstance(session, SparkSession)

    def test_job_data_extractor(self):
        path = "/tmp/test_input.csv"
        df.toPandas().to_csv(path, index=False)
        extractor = JobDataExtractor(spark, path)
        result = extractor.read()
        self.assertIsNotNone(result)
        self.assertGreater(len(result.columns), 0)

    def test_job_data_wrangler(self):
        wrangler = JobDataWrangler(df)
        result = wrangler.sanitize_column_names()
        for col_name in result.columns:
            self.assertFalse(re.search(r"[^\w]", col_name))

    def test_job_data_transformer(self):
        transformer = JobDataTransformer(df)
        result = transformer.transform()
        self.assertIn("avgSalary", result.columns)
        self.assertIn("degreeLevel", result.columns)
        self.assertIn("skillsJson", result.columns)

    def test_job_data_validator(self):
        transformer = JobDataTransformer(df)
        transformed = transformer.transform()
        validator = JobDataValidator(transformed)
        validated = validator.validate()
        self.assertTrue(validated.count() > 0)

    def test_job_data_loader(self):
        transformer = JobDataTransformer(df)
        transformed = transformer.transform()
        loader = JobDataLoader(transformed)
        output_path = "/tmp/test_output"
        loader.write(output_path)
        self.assertTrue(os.path.exists(output_path))

    def test_job_data_profiler(self):
        profiler = JobDataProfiler(df)
        profiler.detect_column_types()
        self.assertGreaterEqual(len(profiler.categorical_cols + profiler.numerical_cols + profiler.date_cols), 1)


In [None]:

# Run the suite in-process; the runner's result object is the cell's output.
unittest.TextTestRunner().run(
    unittest.defaultTestLoader.loadTestsFromTestCase(TestJobPipelineComponents)
)
