# Question 17: On which date was the latest fatal kernel error resulting in an rts panic?

The basic steps followed to answer this question were taken from the PySpark DataFrame quickstart guide:
https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html

In [1]:
#importing necessary libraries and datatypes

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, TimestampType, IntegerType
from pyspark.sql.functions import *
import time

#setting up a spark session

session = SparkSession.builder.appName("BGLlog").getOrCreate()

23/08/05 23:04:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# start time of the program
start_program_time= time.time()

# Schema Design

In [3]:
# defined the schema for the BGL log data

BGL_schema = StructType([

    StructField("AlertFlag", StringType(),True),
    
    # first Timestamp is assigned to IntegerType datatype and later to TimestampType datatype, 
    # as initializing Timestamp to "TimestampType" datatype in the beginning was not possible
    StructField("Timestamp", IntegerType(),True), 

    StructField("Date", StringType(),True),

    StructField("Node", StringType(),True),

    StructField("date_and_time", TimestampType(),True),

    StructField("Node_Rep", StringType(),True),

    StructField("Message_Type", StringType(),True),

    StructField("SysComp", StringType(),True),

    StructField("Level", StringType(),True),

    StructField("Message_Content", StringType(),True)

])

In [4]:
# read the BGL log data from the CSV file with the specified schema (i.e. BGL_schema) 
# and date/timestamp formats are specified

BGL_df=session.read.csv(
    "BGLnew.log",
    schema = BGL_schema,
    dateFormat = "yyyy.MM.dd", 
    timestampFormat = "yyyy-MM-dd-HH.mm.ss.SSSSSS", 
    )

In [5]:
# cast the Timestamp coloumn to proper TimestampType 

BGL_df = BGL_df.withColumn("Timestamp", BGL_df["Timestamp"].cast(TimestampType()))

In [6]:
# Print the DataFrame's schema to confirm that the Timestamp column is now a timestamp after the cast

BGL_df.printSchema()

root
 |-- AlertFlag: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Date: string (nullable = true)
 |-- Node: string (nullable = true)
 |-- date_and_time: timestamp (nullable = true)
 |-- Node_Rep: string (nullable = true)
 |-- Message_Type: string (nullable = true)
 |-- SysComp: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Message_Content: string (nullable = true)



# Transformation on BGL dataframe

In [7]:
# filter the BGL dataframe to get rows with FATAL log from coloumn Level, 
# KERNEL from coloumn System Component, and containing "rts panic" in the coloumn Message Content

rts_panic_errors = BGL_df.filter((col("Level") == "FATAL") & 
                             (col("SysComp").contains("KERNEL")) & 
                             (col("Message_Content").contains("rts panic")))

# Action on the filtered dataframe

In [8]:
# select the "Date" column from the filtered DataFrame, sort it in descending order, and get the latest date (top row)

latest_rts_panic_date = rts_panic_errors.select("Date").orderBy(col("Date").desc()).limit(1)

In [9]:
# Display the (at most one-row) DataFrame holding the latest date on which a matching "rts panic" entry was logged

latest_rts_panic_date.show()



+----------+
|      Date|
+----------+
|2006.01.03|
+----------+



                                                                                

# Execution Time of the Program

In [10]:
# end time of the program
end_program_time= time.time()

# calculate executation time
end_program_time-start_program_time

5.566801309585571