In [1]:
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, regexp_extract
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
import json

In [2]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ResumeParsing") \
    .master("spark://dbms-spark-master:7077") \
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0,org.apache.hadoop:hadoop-aws:3.3.4") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://dbms-minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minio_user") \
    .config("spark.hadoop.fs.s3a.secret.key", "minio_password") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

In [3]:
# Function Definitions
def clean_text(text):
    """Remove unwanted characters and trim the string."""
    if text:
        text = str(text)
        return re.sub(r'[^a-zA-Z0-9\s,.-]', '', text).strip()
    return None

def extract_education(input_string):
    """Extract the Education Details section."""
    return input_string
    if not input_string:
        return None
    education_section = re.findall(r"Education Details\s*(.*?)\s*(?=Skill Details|$)", input_string, re.S)
    return clean_text(education_section[0]) if education_section else None

def extract_skills_details(text):
    """Extract skill details as a list."""
    if not text:
        return []
    text = str(text)
    skills_section = re.findall(r"Skill Details\s+([\s\S]+?)Company Details", text)
    if skills_section:
        skills = re.findall(r"([a-zA-Z]+)- Exprience", skills_section[0])
        return [skill.lower() for skill in skills]
    return []

def extract_company_details(input_string):
    """Extract company details."""
    if not input_string:
        return None
    input_string = str(input_string)
    company_section = re.search(r"Company Details\s+(.*?)(?=TECHNOLOGY ASSISTED REVIEW|$)", input_string, re.S)
    if company_section:
        company_details = {}
        for line in company_section.group(1).splitlines():
            if " - " in line:
                key, value = line.split(" - ", 1)
                company_details[key.strip().lower()] = value.strip()
        return company_details.get('company')
    return None

def extract_projects(input_string):
    """Extract project details."""
    if not input_string:
        return []
    input_string = str(input_string)
    projects = re.findall(r"(?<=\n)\* (.*?)\nTools & Technologies: (.*?)\n", input_string, re.S)
    return [{"Project": clean_text(project.strip()), "Tools": clean_text(tools.strip())} for project, tools in projects]

In [4]:
# Register UDFs
clean_text_udf = udf(clean_text, StringType())
extract_education_udf = udf(extract_education, StringType())
extract_skills_udf = udf(extract_skills_details, ArrayType(StringType()))
extract_company_udf = udf(extract_company_details, StringType())
extract_projects_udf = udf(extract_projects, ArrayType(StructType([
    StructField("Project", StringType(), True),
    StructField("Tools", StringType(), True)
])))

In [5]:
# Read CSV into DataFrame
input_file = "text_cv/ResumeDataSet.csv"
minio_bucket = "raw-bucket"
csv_path = f"s3a://{minio_bucket}/{input_file}"
df = spark.read\
    .option("delimiter", ",")\
    .option("encoding", "UTF-8")\
    .option("escape", "\"") \
    .option("multiline", "true") \
    .csv(csv_path, header=True, inferSchema=True)

df.count()

962

In [6]:
# Apply transformations
transformed_df = df \
    .withColumn("Skill", extract_skills_udf("Resume")) \
    .withColumn("Company", extract_company_udf("Resume")) \
    .withColumn("Project", extract_projects_udf("Resume"))

# Select specific columns
# cleaned_df = transformed_df.select("Category", "Skill", "Company", "Project")
transformed_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
delta_table = "unified_cv"
cleaned_minio_bucket='cleaned-bucket'
table_location = f"s3a://{cleaned_minio_bucket}/unified/{delta_table}"
transformed_df.write.format("delta").mode("append").save(table_location)

In [9]:
delta_df = spark.read.format("delta").load(table_location)
delta_df.show()

+--------------------+--------------------+--------------------+--------------------+
|              Resume|               Skill|             Company|             Project|
+--------------------+--------------------+--------------------+--------------------+
|Skills * Programm...|[javascript, jque...|   Ernst & Young LLP|[{Core member of ...|
|Education Details...|[python, statsmod...|            Matelabs|                  []|
|Areas of Interest...|[analysis, excel,...|      THEMATHCOMPANY|                  []|
|Skills â?¢ R â?¢ ...|[programming, ret...|        Deloitte USI|                  []|
|Education Details...|[structure, c, an...|          Itechpower|                  []|
|SKILLS C Basics, ...|[office, c, learn...|                    |                  []|
|Skills â?¢ Python...|[learning, python...|Heretic Solutions...|                  []|
|Education Details...|[numpy, learning,...|  Wipro Technologies|                  []|
|Personal Skills â...|                  []|Life Insura