In [1]:

import pandas as pd
import numpy as np
import altair as alt
from tqdm import tqdm
from general_consts import *
# Use Altair's 'csv' data transformer for charts built in this notebook; the
# PNG-saving code further down switches back to 'default' inside `with` blocks.
alt.data_transformers.enable('csv')
# alt.renderers.enable('altair_saver', fmts=['png'])

DataTransformerRegistry.enable('csv')

In [2]:
def get_df(path, table_name, hour_offset=2):
    """Load one measurement table from CSV and convert its time column to datetimes.

    The column named by ``CPUColumns.TIME`` is parsed as seconds since the Unix
    epoch and then shifted by ``hour_offset`` hours.

    Parameters
    ----------
    path : str
        Directory that contains the CSV file.
    table_name : str
        File name without the ``.csv`` extension.
    hour_offset : int or float, default 2
        Hours added to every parsed timestamp. The default keeps the shift this
        notebook has always applied.
        NOTE(review): presumably this moves the timestamps to the local time of
        the measured machine so they line up with the Splunk logs - confirm.

    Returns
    -------
    pandas.DataFrame
        The table with its time column converted to (shifted) datetimes.
    """
    df = pd.read_csv(fr"{path}/{table_name}.csv")
    parsed_time = pd.to_datetime(df[CPUColumns.TIME], unit='s')
    df[CPUColumns.TIME] = parsed_time + pd.Timedelta(hours=hour_offset)
    return df

def load_tables(path):
    """Read every table of a single measurement directory.

    The four monitoring tables are loaded through ``get_df`` (so their time
    column is converted); the Splunk export ``output.csv`` is read as-is.

    Returns
    -------
    tuple
        ``(cpu_df, memory_df, all_processes_df, disk_df, splunk_logs_df)``.
    """
    monitoring_tables = (
        TableNames.CPU,
        TableNames.MEMORY,
        TableNames.ALL_PROCESSES,
        TableNames.DISK,
    )
    monitoring_frames = tuple(get_df(path, name) for name in monitoring_tables)
    splunk_logs = pd.read_csv(fr"{path}/output.csv")
    return (*monitoring_frames, splunk_logs)
    
def draw_plot(df, y, path, plot_name, save=True):
    """Build a line chart of column ``y`` over time and optionally save it as PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing column ``y`` and the time column named by
        ``CPUColumns.TIME``.
    y : str
        Name of the column plotted on the y axis.
    path : str
        Measurement directory; the image goes to ``<path>/graphs/<plot_name>.png``.
    plot_name : str
        File name of the image, without extension.
    save : bool, default True
        When false the chart is only built and returned.

    Returns
    -------
    altair.Chart
        The line chart, with its y axis limited to the observed value range.

    Raises
    ------
    ValueError
        If column ``y`` holds no non-missing values, so no range can be computed.
    """
    import os  # local import so this cell does not depend on a later imports cell

    y_values = df[y]
    if y_values.dropna().empty:
        raise ValueError(f"column {y!r} has no values to plot")

    # Series.min()/max() skip NaN; the builtin min()/max() give order-dependent
    # results as soon as a NaN is present. float() turns the numpy scalars into
    # plain Python numbers for the chart specification.
    y_domain = [float(y_values.min()), float(y_values.max())]

    chart = alt.Chart(df).mark_line(interpolate='linear').encode(
        alt.Y(f"{y}:Q", scale=alt.Scale(domain=y_domain)),
        alt.X(f"{CPUColumns.TIME}:T"),
    ).properties(
        width=2100,
        height=300
    )
    if save:
        graphs_dir = f"{path}/graphs"
        # A fresh measurement directory has no graphs folder yet.
        os.makedirs(graphs_dir, exist_ok=True)
        # Create PNG image data and then write to a file
        with alt.data_transformers.enable('default'):
            chart.save(f"{graphs_dir}/{plot_name}.png", engine="altair_saver")
    return chart

In [37]:
# For each of the six measurements: plot the CPU and disk series, build a
# histogram of the Splunk log timestamps, and save every series overlaid on it.
for measurement_no in range(1, 7):
    path = fr"Repositories/GreenSecurity-FirstExperiment/Dell Inc. Linux 5.15.0-67-generic/Splunk Enterprise SIEM/Power Saver Plan/One Scan/Measurement {measurement_no}"
    cpu_df, memory_df, all_processes_df, disk_df, splunk_logs_df = load_tables(path)

    # (prefix of the merged image, source table, column, name of the stand-alone image)
    line_specs = [
        ("cpu", cpu_df, CPUColumns.USED_PERCENT, "cpu"),
        ("disc_r_c", disk_df, DiskIOColumns.READ_COUNT, "disk_read_count"),
        ("disc_w_c", disk_df, DiskIOColumns.WRITE_COUNT, "disk_write_count"),
        ("disc_r_b", disk_df, DiskIOColumns.READ_BYTES, "disk_read_bytes"),
        ("disc_w_b", disk_df, DiskIOColumns.WRITE_BYTES, "disk_write_bytes"),
    ]
    # draw_plot also writes each stand-alone PNG (save defaults to True).
    line_charts = [
        (prefix, draw_plot(table, column, path, image_name))
        for prefix, table, column, image_name in line_specs
    ]

    # Parse the log timestamps and drop their timezone information.
    splunk_logs_df['_time'] = pd.to_datetime(splunk_logs_df['_time']).dt.tz_localize(None)

    # Histogram of log events over time.
    n_bins = 200
    log_bars = alt.Chart(splunk_logs_df).mark_bar(opacity=0.3, color='green')
    logs = log_bars.encode(
        alt.Y("count():Q"),
        alt.X("_time:T", bin=alt.Bin(maxbins=n_bins)),
    ).properties(width=2100, height=300)

    # Create PNG image data and then write to a file
    with alt.data_transformers.enable('default'):
        logs.save(f"{path}/graphs/logs_dist.png", engine="altair_saver")
        for prefix, line_chart in line_charts:
            overlay = alt.layer(line_chart, logs).resolve_scale(y='independent')
            overlay.save(f"{path}/graphs/{prefix}_logs_merge.png", engine="altair_saver")
        

In [33]:
# Overlay the CPU usage line on a histogram of Splunk log events and save it.
# Number of histogram bins, and an opacity range that is currently unused.
n_bins = 200
opacity_range = [0.2, 0.8]

log_bars = alt.Chart(splunk_logs_df).mark_bar(opacity=0.3, color='green')
chart = log_bars.encode(
    alt.Y("count():Q"),
    alt.X("_time:T", bin=alt.Bin(maxbins=n_bins)),
).properties(width=2100, height=300)

with alt.data_transformers.enable('default'):
    # draw_plot also re-saves the stand-alone CPU image.
    cpu_line = draw_plot(cpu_df, CPUColumns.USED_PERCENT, path, 'cpu')
    merged = alt.layer(cpu_line, chart).resolve_scale(y='independent')
    merged.save(f"{path}/graphs/merge.png", engine="altair_saver")


In [3]:
# Measurement directory inspected by the following cells, and its per-process table.
path = r"C:\Users\Administrator\Repositories\GreenSecurity-FirstExperiment\Dell Inc. Linux 5.15.0-67-generic\Splunk Enterprise SIEM\Power Saver Plan\One Scan\Measurement 1"
df = get_df(path, table_name="processes_data")

In [4]:
# Show the loaded per-process samples (the rendered output elides the middle rows).
df

Unnamed: 0,Time(sec),PID,PNAME,CPU(%),NUM THREADS,MEMORY(MB),MEMORY(%),READ_IO(#),WRITE_IO(#),READ_IO(KB),WRITE_IO(KB)
0,2023-03-05 16:39:58.594451200,16224,splunk,0.0,1,2.477,0.02,0,0,0.0,0.0
1,2023-03-05 16:39:58.693570304,16224,splunk,0.0,1,2.559,0.02,230,2,0.0,0.0
2,2023-03-05 16:39:58.794338304,16224,splunk,0.0,1,2.559,0.02,0,0,0.0,0.0
3,2023-03-05 16:39:58.894585344,16229,splunkd,12.6,1,52.832,0.33,137,0,0.0,0.0
4,2023-03-05 16:39:58.894585344,16224,splunk,0.0,1,2.559,0.02,1,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
40178,2023-03-05 17:11:00.625762560,22097,splunk,0.0,1,2.473,0.02,0,0,0.0,0.0
40179,2023-03-05 17:11:00.741190400,22097,splunk,0.0,1,2.473,0.02,0,0,0.0,0.0
40180,2023-03-05 17:11:00.855374080,22097,splunk,0.0,1,2.473,0.02,0,0,0.0,0.0
40181,2023-03-05 17:11:00.970234624,22097,splunk,0.0,1,2.473,0.02,0,0,0.0,0.0


In [5]:
# Count the non-null entries of every column per PID and export the result
# to a CSV file in the current working directory.
per_pid_counts = df.groupby('PID').count()
per_pid_counts.to_csv('processes_data_grouped_by.csv')

In [17]:
# Load all tables of the measurement directory currently held in `path`.
cpu_df, memory_df, all_processes_df, disk_df, splunk_logs_df = load_tables(path)

In [14]:
# Keep only the samples of one process and list the name(s) it was recorded under.
pid = 16667
df_pid = df.loc[df['PID'].eq(pid)]
df_pid['PNAME'].unique()

array(['splunkd'], dtype=object)

In [15]:
# Column plotted on the y axis: the second space-separated token of
# CPUColumns.USED_PERCENT.
cpu_col = CPUColumns.USED_PERCENT.split(' ')[1]
# NOTE(review): the y range comes from the full `df`, not from `df_pid` -
# confirm that sharing one scale across all processes is intended.
cpu_domain = [min(df[cpu_col]), max(df[cpu_col])]

pid_cpu_line = alt.Chart(df_pid).mark_line(interpolate='linear').encode(
    alt.Y(f"{cpu_col}:Q", scale=alt.Scale(domain=cpu_domain)),
    alt.X(f"{CPUColumns.TIME}:T"),
)
pid_cpu_line.properties(width=2100, height=300)


  for col_name, dtype in df.dtypes.iteritems():


In [16]:
# List the distinct process names present in the per-process samples.
df.PNAME.unique()

array(['splunk', 'splunkd'], dtype=object)

In [22]:
# Overlay the CPU line of the selected process on the Splunk log histogram.
# Number of histogram bins, and an opacity range that is currently unused.
n_bins = 200
opacity_range = [0.2, 0.8]

# Column plotted on the y axis: the second space-separated token of
# CPUColumns.USED_PERCENT. Its range is taken from the full `df`.
cpu_col = CPUColumns.USED_PERCENT.split(' ')[1]
cpu_domain = [min(df[cpu_col]), max(df[cpu_col])]

pid_cpu_chart = alt.Chart(df_pid).mark_line(interpolate='linear').encode(
    alt.Y(f"{cpu_col}:Q", scale=alt.Scale(domain=cpu_domain)),
    alt.X(f"{CPUColumns.TIME}:T"),
).properties(width=2100, height=300)

log_histogram = alt.Chart(splunk_logs_df).mark_bar(opacity=0.3, color='green').encode(
    alt.Y("count():Q"),
    alt.X("_time:T", bin=alt.Bin(maxbins=n_bins)),
).properties(width=2100, height=300)

# Each layer keeps its own y scale.
alt.layer(pid_cpu_chart, log_histogram).resolve_scale(y='independent')


In [23]:
# Histogram of Splunk log events over time, split into at most `n_bins` bins.
log_bars = alt.Chart(splunk_logs_df).mark_bar(opacity=0.3, color='green')
log_histogram = log_bars.encode(
    alt.Y("count():Q"),
    alt.X("_time:T", bin=alt.Bin(maxbins=n_bins)),
)
log_histogram.properties(width=2100, height=300)

In [13]:
import os
import gzip
import json
import datetime

# Root of the raw log archive; the .gz files sit two folder levels below it.
RAW_LOGS_ROOT = "C:/Users/Administrator/Downloads/RAWLOGS_august/RAWLOGS"
# Prefix of the merged output files written by the next cell.
output_file_path = "C:/Users/Administrator/Downloads/merged_file"


def _subdirectories(parent):
    """Return the paths of the immediate sub-directories of `parent`, sorted by name.

    Plain files are skipped so that a stray file among the folders does not
    make the traversal fail.
    """
    candidates = (os.path.join(parent, name) for name in sorted(os.listdir(parent)))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]


def _read_records(gz_path):
    """Load one gzip-compressed JSON file and return its "Records" list.

    Every record's `eventTime` is rewritten from 'YYYY-MM-DDTHH:MM:SSZ' to
    'YYYY-MM-DD HH:MM:SS,mmm'.
    """
    with gzip.open(gz_path, 'rt', encoding='utf-8') as f:
        records = json.load(f)["Records"]
    for log in records:
        dt = datetime.datetime.strptime(log['eventTime'], '%Y-%m-%dT%H:%M:%SZ')
        # %f yields six digits; dropping the last three leaves milliseconds.
        log['eventTime'] = dt.strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
    return records


# Traverse through all the subfolders and extract the compressed files.
# Names are visited in sorted order so that the position-based split into
# output files in the next cell is reproducible.
extracted_files = []
for subfolder_path in _subdirectories(RAW_LOGS_ROOT):
    for subfolder2_path in _subdirectories(subfolder_path):
        for file_name in sorted(os.listdir(subfolder2_path)):
            if file_name.endswith(".gz"):
                extracted_files.extend(_read_records(os.path.join(subfolder2_path, file_name)))




In [16]:
# Write the merged JSON file with the independent entries structure
# Split the collected records into eight consecutive slices and write each
# slice to "<output_file_path><index>.json", one record per line with the
# records separated by ",\n".
n_chunks = 8
n_records = len(extracted_files)
for j in range(n_chunks):
    chunk_start = int(j / n_chunks * n_records)
    chunk_end = int((j + 1) / n_chunks * n_records)
    curr = extracted_files[chunk_start:chunk_end]
    with open(output_file_path + str(j) + ".json", 'w') as f:
        f.write(",\n".join(json.dumps(record) for record in curr))

In [11]:
import json
import datetime

# Source sample (one JSON object per line) and the rewritten copy next to it.
file_path  = r"C:\Program Files\Splunk\etc\apps\SA-Eventgen\samples\cloudtrail_behavioural_detections.json"
rewritten_path = r"C:\Program Files\Splunk\etc\apps\SA-Eventgen\samples\cloudtrail_behavioural_detections_2.json"

logs = []
# Open the input in a `with` block so the handle is closed after reading.
with open(file_path, 'r') as source:
    for line in source:
        if not line.strip():
            # A blank line (e.g. at the end of the file) is not a record.
            continue
        log = json.loads(line)
        dt = datetime.datetime.strptime(log['eventTime'], '%Y-%m-%dT%H:%M:%S.%fZ')
        # Convert the datetime object to the desired format:
        # 'YYYY-MM-DD HH:MM:SS,mmm' (the slice trims microseconds to milliseconds).
        log['eventTime'] = dt.strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
        logs.append(log)

# One record per line, records separated by ",\n", no trailing separator.
with open(rewritten_path, 'w') as f:
    f.write(",\n".join(json.dumps(record) for record in logs))