<a href="https://colab.research.google.com/github/sko9370/rootCauseAnalysis/blob/main/Individual_Logs_Starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# download and uncompress the logs
!wget https://github.com/sko9370/rootCauseAnalysis/raw/main/logs.zip
!unzip logs.zip
!rm logs.zip

In [None]:
# install package to be able to load in json logs
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.caseSensitive", "true")

In [6]:
# ingest sysmon.json log
sysmon_df = spark.read.json("logs/sysmon.json")
# register the dataframe under the name "sysmon" so it can be queried with sql
# createOrReplaceTempView keeps this cell re-runnable; createTempView raises an error
# when a view with the same name already exists in the session
sysmon_df.createOrReplaceTempView("sysmon")

# ingest powershell_operational.json log
psop_df = spark.read.json("logs/powershell_operational.json")
psop_df.createOrReplaceTempView("psop")

In [None]:
# run a sql query on the previously ingested powershell operational log
# the table name in the query is the name given when the temporary view was created ("psop")
# DESCRIBE is a sql command that lists all the columns and the type of data they hold
psop = spark.sql("""
DESCRIBE psop
""")
# .show(<number of rows to show>, <truncate: cut off long values so the columns fit on the screen>)
psop.show(n=100, truncate=False)

In [None]:
# same DESCRIBE query, this time on the sysmon view
sysmon = spark.sql("""
DESCRIBE sysmon
""")
# new option here: vertical=True prints the fields of each row as a vertical list
# instead of the horizontal table used above
# this is useful for fitting log entries with MANY columns on one screen
sysmon.show(n=100, truncate=False, vertical=True)

In [None]:
# a more complex query: count how many times each EventID appears in the sysmon log
# and list the results with the highest count first (DESC = descending order)
sysmon = spark.sql("""
SELECT EventID, COUNT(EventID) AS MOST_FREQUENT
FROM sysmon
GROUP BY EventID
ORDER BY COUNT(EventID) DESC
""")
sysmon.show(n=100, truncate=False)

In [None]:
# for every column, build a dataframe that counts the occurrences of each unique value,
# ordered from most to least frequent
# notice this does not use the temporary view (sql) and instead references the data directly through the dataframe (sysmon_df)
most_frequent = [
    sysmon_df.groupBy(column_name).count().orderBy('count', ascending=False)
    for column_name in sysmon_df.columns
]

In [None]:
# this actually prints out the top 5 most frequent entries for each column,
# skipping the first 5 columns (those were mostly unique or uninteresting)
# this will help you determine which columns are useful to actually query on or investigate
for value_counts in most_frequent[5:]:
    value_counts.show(n=5, truncate=False)

In [None]:
# yet another method to query using a more code-like syntax if you are unfamiliar with sql. very similar keywords however
# also note that this method uses the dataframe (sysmon_df) directly instead of the temporary view (sql)
# the query result is stored first and shown in a separate step: .show() only prints and returns None,
# so chaining it onto the assignment would leave sysmonEid1 set to None
sysmonEid1 = (
    sysmon_df
    .filter("EventID == 1")
    .groupBy('Image')
    .count()
    .orderBy('count', ascending=False)
)
sysmonEid1.show(30, truncate=False)