In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, try_to_timestamp, row_number, sum as spark_sum
from pyspark.sql.window import Window

# Create (or reuse) the notebook's Spark session, requesting 12g of driver memory.
spark = (
    SparkSession.builder
    .appName("Apache Jira Issues")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

# Clear anything cached in the session before the pipeline starts.
spark.catalog.clearCache()

In [3]:
# Load the raw issues export. The CSV parser's column limit is raised to
# 100000 (presumably because the export is very wide -- confirm against the file).
issues = spark.read.option("maxColumns", 100000).csv("./apache/issues.csv", header=True, inferSchema=True)

# Replace every dot (.) in a column name with an underscore (_).
# All columns are renamed in a single toDF() call rather than one
# withColumnRenamed() per column: each of those calls produces a new
# DataFrame and a new projection, which gets slow on a wide frame.
# Names without a dot pass through unchanged.
issues = issues.toDF(*[column.replace('.', '_') for column in issues.columns])

issues.printSchema()

root
 |-- id: string (nullable = true)
 |-- key: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- resolution_id: string (nullable = true)
 |-- resolution_description: string (nullable = true)
 |-- resolution_name: string (nullable = true)
 |-- priority_id: string (nullable = true)
 |-- priority_name: string (nullable = true)
 |-- labels: string (nullable = true)
 |-- assignee: string (nullable = true)
 |-- status_id: string (nullable = true)
 |-- status_description: string (nullable = true)
 |-- status_name: string (nullable = true)
 |-- statusCategory_id: string (nullable = true)
 |-- statusCategory_key: string (nullable = true)
 |-- statusCategory_colorName: string (nullable = true)
 |-- statusCategory_name: string (nullable = true)
 |-- customfield_12310921: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- subtasks: string (nullable = true)
 |-- reporter: string (nullable = true)
 |-- votes_votes: string (nullable = true)
 |-- issuetype_id: st

In [4]:
# Keep only rows that have both 'project_id' and 'project_key':
# how='any' drops a row as soon as either of the two is null.
issues_cleaned = issues.na.drop(how='any', subset=['project_id', 'project_key'])

In [5]:
# Columns to be parsed into timestamps.
datetime_columns = ['resolutiondate', 'created', 'updated']

# Columns to be converted to integers.
numeric_columns = ["votes_votes", "watches_watchCount"]

# Timestamp parsing: values that cannot be parsed become null.
timestamp_conversions = {
    name: try_to_timestamp(col(name))
    for name in datetime_columns
}

# Integer conversion: a null input becomes 0; any other value goes through
# try_cast("integer").
integer_conversions = {
    name: when(col(name).isNull(), 0).otherwise(col(name).try_cast("integer"))
    for name in numeric_columns
}

# Replace the original columns with their converted versions in one step.
issues_cleaned = issues_cleaned.withColumns({**timestamp_conversions, **integer_conversions})

In [None]:
def _flatten_groupings(groupings):
    """Turn {group: [raw names]} into the lookup form {raw name: group}."""
    lookup = {}
    for group_label, raw_names in groupings.items():
        for raw_name in raw_names:
            lookup[raw_name] = group_label
    return lookup


# Raw resolution names, bucketed into coarser resolution groups.
resolution_groupings = dict([
    ("Fixed", ["Fixed", "Implemented", "Delivered", "Resolved", "Done"]),
    ("Invalid", ["Invalid", "Not A Problem", "Not A Bug", "Cannot Reproduce"]),
    ("Won't Fix", ["Won't Fix", "Abandoned", "Later"]),
    ("Duplicate", ["Duplicate"]),
    ("Incomplete", ["Incomplete", "Pending Closed"]),
    ("Auto Closed", ["Auto Closed"]),
    ("Workaround", ["Workaround", "Works For Me"]),
    ("Feedback", ["Feedback Received", "Information Provided"]),
])
resolution_name_mapping = _flatten_groupings(resolution_groupings)

# Status categories map to themselves; only these three names are listed.
status_category_mapping = {
    category: category for category in ('Done', 'In Progress', 'To Do')
}

# Raw priority names, bucketed into High / Medium / Low.
priority_groupings = dict([
    ("High", ["Blocker", "Critical", "High", "Urgent"]),
    ("Medium", ["Major", "Normal"]),
    ("Low", ["Minor", "Trivial", "Low"]),
])
priority_name_mapping = _flatten_groupings(priority_groupings)

# Raw issue type names, bucketed into four broad issue types.
issue_type_groupings = dict([
    ("Bug", ["Bug"]),
    ("Feature", ["New Feature", "Story", "Epic", "Wish"]),
    ("Improvement", ["Improvement", "Technical Debt", "Technical task"]),
    ("Task", ["Task", "Sub-task", "Test", "Dependency upgrade", "Dependency", "Umbrella"]),
])
issue_type_mapping = _flatten_groupings(issue_type_groupings)

In [7]:
def _group_column(source_column, name_to_group, default="Other"):
    """Build a single CASE WHEN expression mapping raw values to group labels.

    Args:
        source_column: name of the column holding the raw value.
        name_to_group: dict of {raw value: group label}.
        default: label used when the raw value is null or not in the dict.

    Returns:
        A Column that evaluates to the matching group label, or `default`.
    """
    # Collect the raw names belonging to each group so each group needs
    # only one branch in the CASE WHEN.
    names_by_group = {}
    for raw_name, group_label in name_to_group.items():
        names_by_group.setdefault(group_label, []).append(raw_name)

    expression = None
    for group_label, raw_names in names_by_group.items():
        matches = col(source_column).isin(raw_names)
        if expression is None:
            expression = when(matches, group_label)
        else:
            expression = expression.when(matches, group_label)

    # An empty mapping means every row gets the default label.
    if expression is None:
        return lit(default)
    return expression.otherwise(lit(default))


# Add the four grouped columns in one projection. Calling withColumn once
# per raw name wraps the previous CASE WHEN in a new one each time and grows
# the query plan with every call; a flat expression per column avoids that
# while producing the same values ("Other" for null or unmapped inputs).
issues_mapped = issues_cleaned.withColumns({
    "resolution_group": _group_column("resolution_name", resolution_name_mapping),
    "status_category": _group_column("statusCategory_name", status_category_mapping),
    "priority_group": _group_column("priority_name", priority_name_mapping),
    "issue_type_group": _group_column("issuetype_name", issue_type_mapping),
})

In [8]:
# (source column, output column) pairs. The order here is the order in which
# the mapping is iterated when the final frame is selected.
column_rename_mapping = dict([
    # identifiers
    ('id', 'id'),
    ('key', 'key'),
    # project
    ('project_id', 'project_id'),
    ('project_key', 'project_key'),
    ('project_name', 'project_name'),
    ('projectCategory_name', 'project_category'),
    # timestamps
    ('created', 'created_at'),
    ('updated', 'updated_at'),
    ('resolutiondate', 'resolved_at'),
    # people
    ('creator', 'creator'),
    ('assignee', 'assignee'),
    ('reporter', 'reporter'),
    # classification
    ('issue_type_group', 'issue_type'),
    ('status_name', 'status_name'),
    ('status_category', 'status_category'),
    ('priority_group', 'priority'),
    ('resolution_group', 'resolution'),
    # counters
    ('votes_votes', 'votes'),
    ('watches_watchCount', 'watches'),
])

# Select the mapped columns in the mapping's order, each aliased to its new name.
renamed_columns = []
for source_name, target_name in column_rename_mapping.items():
    renamed_columns.append(col(source_name).alias(target_name))

issues_new = issues_mapped.select(*renamed_columns)
issues_new.printSchema()

root
 |-- id: string (nullable = true)
 |-- key: string (nullable = true)
 |-- project_id: string (nullable = true)
 |-- project_key: string (nullable = true)
 |-- project_name: string (nullable = true)
 |-- project_category: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)
 |-- resolved_at: timestamp (nullable = true)
 |-- creator: string (nullable = true)
 |-- assignee: string (nullable = true)
 |-- reporter: string (nullable = true)
 |-- issue_type: string (nullable = false)
 |-- status_name: string (nullable = true)
 |-- status_category: string (nullable = false)
 |-- priority: string (nullable = false)
 |-- resolution: string (nullable = false)
 |-- votes: integer (nullable = true)
 |-- watches: integer (nullable = true)



In [10]:
# Export issues_new to a CSV file by way of a pandas DataFrame.

# Timestamp columns are cast to string first, to avoid an OSError on toPandas.
timestamp_columns = ['created_at', 'updated_at', 'resolved_at']
issues_new = issues_new.withColumns(
    {name: col(name).cast("string") for name in timestamp_columns}
)

# NOTE: toPandas() collects the whole frame onto the driver before writing.
issues_pandas = issues_new.toPandas()
issues_pandas.to_csv("./apache/issues_cleaned.csv", index=False)

In [11]:
# Teardown: clear the session's cache, then stop the Spark session.
spark.catalog.clearCache()
spark.stop()