In [0]:
# Load the complete silver logging table; later cells add derived columns to
# this DataFrame.
silver_query = """
          select * from motar_asml.logging.cpdt_logging_silver
          """
df = spark.sql(silver_query)

In [0]:
# Determine the features (feature_name column)
from pyspark.sql.functions import when, col, lit

# Each entry maps a feature label to [logger, *message fragments]. A log line is
# tagged with the feature when its `logger` column equals the first item and
# its `message` column contains every remaining item. Entries are applied in
# order; when several entries match the same line, the last one wins.
feature_dict = {
    "Classic" : ['MATLAB'],
    "Vcraft" : ['vcraft'],
    "Python remote debugging": ['devbenchExtension', 'Debugger attached successfully'],
    "Python remote debugging old": ['cidtDebugSession', 'Cidt debug session initialized'],
    "Devbench sync": ['devbenchExtension', 'Adding filewatcher for'],
    "Devbench sync old": ['devbenchExtension', 'Performing post-copy actions'],
    "Devbench integration": ['devbenchExtension', 'Devbench','successfully created'],
    "Report preview": ['reportPreviewExtension'],
    "Report editor": ['reportEditor'],
    "DDF syntax checking": ['ddfCheckerExtension', "Checker"], # "Checker found an error"
    "DDF jump around": ['ddfDefinitionProvider'],   # no extra filter needed: Definition found for ...
    "CPD aspects overview": ['aspectsExtension'],
    "Required interfaces viewer": ['ddfFileTreeExtension'],
    "Requirements editor flow": ['flowEditor'],
    "Requirements editor step": ['stepEditor'],
    "ER eventlog viewer": ['devbenchExtension', 'Spawning ssh tail process for', 'on ER/ER_event_log'],
    "Live sync": ['devbenchExtension', 'Live sync enabled for'],
    "Swipe integration": ['swipeExtension']
}

# Rule labels whose matches are stored under another feature name.
feature_aliases = {
    "Devbench sync old": "Devbench sync",
    "Python remote debugging old": "Python remote debugging",
}

# Build ONE conditional expression covering every rule instead of calling
# withColumn() once per rule: each withColumn() adds a projection to the plan,
# and stacking them in a loop produces a needlessly large plan.
# Each new rule wraps the previous expression, so a later rule is evaluated
# first and overrides earlier ones; lines matching no rule get "".
feature_name_expr = lit("")
reported_names = []  # distinct stored feature names, in first-seen order

for feature, filters in feature_dict.items():
    print(f'Handling feature {feature}')
    extension, *message_fragments = filters

    feature_name = feature_aliases.get(feature, feature)
    if feature_name not in reported_names:
        reported_names.append(feature_name)

    # The logger must match exactly and the message must contain every
    # fragment; this works for any number of fragments.
    rule = col("logger") == extension
    for fragment in message_fragments:
        rule = rule & col("message").contains(fragment)

    feature_name_expr = when(rule, lit(feature_name)).otherwise(feature_name_expr)

df = df.withColumn("feature_name", feature_name_expr)

# One aggregation gives the final number of lines per feature, replacing a
# separate count() job per rule. Features without any matching line report 0.
feature_counts = {
    row["feature_name"]: row["count"]
    for row in df.groupBy("feature_name").count().collect()
}
for feature_name in reported_names:
    print(f"Count of {feature_name}", feature_counts.get(feature_name, 0))

display(df)

In [0]:
# Count the number of unique_days for a feature (feature_unique_days_used)
# Will be 0 if no feature is determined for the log line.
from pyspark.sql.functions import date_trunc, countDistinct

# For every (feature, CPD, user) combination, count on how many distinct
# calendar days at least one log line was recorded.
log_day = date_trunc("day", col("time").cast("timestamp"))

unique_days = df.groupBy("feature_name", "cpd_name", "user").agg(
    countDistinct(log_day).alias("feature_unique_days_used")
)

# Prefix the grouping columns with "ud_" so they stay distinguishable from the
# identically named columns of df.
ud_column_names = {
    "feature_name": "ud_feature_name",
    "user": "ud_user",
    "cpd_name": "ud_cpd_name",
}
for current_name, prefixed_name in ud_column_names.items():
    unique_days = unique_days.withColumnRenamed(current_name, prefixed_name)

display(unique_days)
display(df)

In [0]:
# Remove any feature_unique_days_used column already present on df, so the
# column of the same name that is joined in from unique_days further down does
# not clash with it.
df = df.drop('feature_unique_days_used')

In [0]:
# Render df for inspection after the column drop above.
display(df)

In [0]:
# Join the per-(feature, CPD, user) day counts back onto every log line.

# Drop a pre-existing result column first so the joined-in column of the same
# name is unambiguous; drop() does nothing when the column is absent, which
# keeps this cell safe to re-run.
df = df.drop("feature_unique_days_used")

# unique_days was built with groupBy(), which keeps NULL cpd_name / user values
# as their own group. A plain `==` never matches NULL to NULL, so those rows
# would lose their count and end up as 0; eqNullSafe() makes them match.
# feature_name keeps `==`: a line without a feature must not pick up a count.
join_condition = (
    (df["feature_name"] == unique_days["ud_feature_name"])
    & df["cpd_name"].eqNullSafe(unique_days["ud_cpd_name"])
    & df["user"].eqNullSafe(unique_days["ud_user"])
)

df = (
    df.join(unique_days, join_condition, how="left")
    .withColumn(
        "feature_unique_days_used",
        # Lines without a determined feature (no match, or the empty feature
        # name) get 0 instead of a day count.
        when(
            col("ud_feature_name").isNull() | (col("ud_feature_name") == ""),
            0,
        ).otherwise(col("feature_unique_days_used")),
    )
    # The prefixed join keys were only needed for the join itself.
    .drop("ud_feature_name", "ud_cpd_name", "ud_user")
)

display(df)

In [0]:
# Overwrite cpdt_logging_silver with the enriched DataFrame. mergeSchema is
# enabled so that columns added in this notebook are also added to the target
# table.
silver_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable("motar_asml.logging.cpdt_logging_silver")