In [1]:

import pandas as pd
from tensorflow.python.summary.summary_iterator import summary_iterator
from collections import defaultdict
import glob
import os

def extract_security_runs(base_dir):
    """
    Extract TensorBoard data from security-related runs into a pandas DataFrame.
    
    Args:
        base_dir (str): Base directory containing run directories
        
    Returns:
        pd.DataFrame: DataFrame containing steps and values for each run
    """
    # Dictionary to store data
    data = defaultdict(lambda: defaultdict(list))
    
    # Get all subdirectories that match the pattern
    run_dirs = glob.glob(os.path.join(base_dir, "train_*"))
    run_dirs.append(base_dir)
    
    for run_dir in run_dirs:
        run_name = os.path.basename(run_dir)
        
        # Find event files in this directory
        event_files = glob.glob(os.path.join(run_dir, "events.out.tfevents.*"))
        
        for event_file in event_files:
            try:
                for event in summary_iterator(event_file):
                    if not event.summary.value:
                        continue
                    
                    step = event.step
                    wall_time = event.wall_time
                    
                    for value in event.summary.value:
                        # Extract the tag and handle different value types
                        tag = value.tag
                        if hasattr(value, 'simple_value'):
                            val = value.simple_value
                        else:
                            continue
                        
                        # Store data
                        data[run_name]['step'].append(step)
                        data[run_name]['value'].append(val)
                        data[run_name]['wall_time'].append(wall_time)
                        data[run_name]['tag'].append(tag)
                        
            except Exception as e:
                print(f"Error reading {event_file}: {e}")
                continue
    
    # Convert to DataFrame
    dfs = []
    for run_name, values in data.items():
        if values['step']:  # Check if we have any data
            df = pd.DataFrame({
                'step': values['step'],
                'value': values['value'],
                'wall_time': values['wall_time'],
                'tag': values['tag']
            })
            df['run_name'] = run_name
            
            # Extract metadata from run_name
            if 'episodic_policy' in run_name:
                # Parse episodic policy runs
                parts = run_name.split('_')
                df['run_type'] = 'episodic_policy'
                df['log_type'] = parts[3] if len(parts) > 3 else ''
                df['event_id'] = parts[4] if len(parts) > 4 else ''
                df['sub_id'] = parts[5] if len(parts) > 5 else ''
            elif 'rules' in run_name:
                # Parse rules-related runs
                df['run_type'] = 'rules'
                # Extract rule type (alert, duration, etc.)
                if 'alert' in run_name:
                    df['rule_type'] = 'alert'
                elif 'duration' in run_name:
                    df['rule_type'] = 'duration'
                else:
                    df['rule_type'] = 'other'
                # Clean and store the rule description
                rule_desc = '_'.join(run_name.split('_')[3:])
                df['rule_description'] = rule_desc
            else:
                df['run_type'] = 'other'
                df['log_type'] = ''
            
            dfs.append(df)
    
    if not dfs:
        return pd.DataFrame()
    
    # Combine all data
    final_df = pd.concat(dfs, ignore_index=True)
    
    # Add timestamp column in readable format
    final_df['timestamp'] = pd.to_datetime(final_df['wall_time'], unit='s')
    
    # Reorder columns for better readability
    columns = ['run_name', 'run_type', 'log_type', 'event_id', 'sub_id', 
              'rule_type', 'rule_description', 'step', 'value', 
              'timestamp', 'wall_time', 'tag']
    
    final_df = final_df[columns]
    return final_df

def analyze_security_runs(df):
    """
    Generate summary analysis of security runs.
    
    Args:
        df (pd.DataFrame): DataFrame from extract_security_runs
        
    Returns:
        dict: Summary statistics and analysis
    """
    summary = {
        'total_runs': df['run_name'].nunique(),
        'episodic_policy_runs': df[df['run_type'] == 'episodic_policy']['run_name'].nunique(),
        'rules_runs': df[df['run_type'] == 'rules']['run_name'].nunique(),
        'unique_log_types': df['log_type'].unique().tolist(),
        'unique_event_ids': df['event_id'].unique().tolist(),
        'time_range': {
            'start': df['timestamp'].min(),
            'end': df['timestamp'].max()
        },
        'max_steps': df['step'].max()
    }
    
    return summary

def plot_security_runs(df, run_type=None, metric=None, smooth_factor=0.6):
    """
    Plot security run data with TensorBoard-like visualization.
    
    Args:
        df (pd.DataFrame): DataFrame from extract_security_runs
        run_type (str, optional): Filter by run type ('episodic_policy' or 'rules')
        metric (str, optional): Filter by specific metric/tag
        smooth_factor (float): Smoothing factor for the plot (0 to 1)
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Filter data
    plot_df = df.copy()
    if run_type:
        plot_df = plot_df[plot_df['run_type'] == run_type]
    if metric:
        plot_df = plot_df[plot_df['tag'] == metric]
    
    # Create figure
    plt.figure(figsize=(15, 8))
    
    # Plot each run
    for run_name in plot_df['run_name'].unique():
        run_data = plot_df[plot_df['run_name'] == run_name].sort_values('step')
        
        # Apply smoothing
        if smooth_factor > 0:
            smooth_value = run_data['value'].ewm(alpha=(1 - smooth_factor)).mean()
        else:
            smooth_value = run_data['value']
        
        plt.plot(run_data['step'], smooth_value, label=run_name, alpha=0.8)
    
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(f'Security Runs: {run_type if run_type else "All"} - {metric if metric else "All Metrics"}')
    plt.xlabel('Step')
    plt.ylabel('Value')
    plt.tight_layout()
    plt.show()


# Replace with your TensorBoard log directory
base_dir = "/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/experiments_____/train/tensorboard/train_20241201_142030_1"

# Extract data
df = extract_security_runs(base_dir)
df.to_csv('metrics.csv')
# Get summary statistics
summary = analyze_security_runs(df)
print("\nSummary:")
for key, value in summary.items():
    print(f"{key}: {value}")

# # Plot different views of the data
# plot_security_runs(df, run_type='episodic_policy')

# plot_security_runs(df, run_type='rules')

2024-12-02 17:15:29.346015: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-02 17:15:31.344986: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-02 17:15:31.346438: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`

Summary:
total_runs: 191
episodic_policy_runs: 15
rules_runs: 175
unique_log_types: ["('wineventlog:security', '4663')", "('wineventlog:security', '4732')", "('wineventlog:security', '4769')", "('wineventlog:security', '5140')", "('wineventlog:system', '7036')", "('wineventlog:system', '7040')", "('wineventlog:system', '7045')", "('wineventlog:security', '4624')", nan, '']
unique_event_ids: ['0', '1', nan]
time_range: {'start': Timestamp('2024-12-01 12:22:55.118367232'), 'end': Timestamp('2024-12-02 07:24:21.137732096')}
max_steps: 43110


In [1]:
path = "/home/shouei/GreenSecurity-FirstExperiment/experiments/logs/evaluations.npz"
import numpy as np
data = np.load(path)
data.files

['timesteps', 'results', 'ep_lengths']

In [3]:
data['results'].shape

(94, 5)

In [2]:
episodic_policy = df[df['run_type'] == 'episodic_policy']
duration_gap = df[df['tag'] == 'train/duration_gap']

In [3]:
duration_gap

Unnamed: 0,run_name,run_type,log_type,event_id,sub_id,rule_type,rule_description,step,value,timestamp,wall_time,tag
657301,train_20241201_142030_1,other,,,,,,1944,11.183286,2024-12-01 12:29:52.663003904,1.733056e+09,train/duration_gap
657332,train_20241201_142030_1,other,,,,,,1945,11.183286,2024-12-01 12:29:52.849863424,1.733056e+09,train/duration_gap
657363,train_20241201_142030_1,other,,,,,,1946,11.183286,2024-12-01 12:29:52.904667648,1.733056e+09,train/duration_gap
657394,train_20241201_142030_1,other,,,,,,1947,11.183286,2024-12-01 12:29:52.959293184,1.733056e+09,train/duration_gap
657425,train_20241201_142030_1,other,,,,,,1948,11.183286,2024-12-01 12:29:53.019957248,1.733056e+09,train/duration_gap
...,...,...,...,...,...,...,...,...,...,...,...,...
797772,train_20241201_142030_1,other,,,,,,43064,-0.132429,2024-12-02 07:20:22.583154176,1.733124e+09,train/duration_gap
797804,train_20241201_142030_1,other,,,,,,43066,-0.387286,2024-12-02 07:24:20.463991040,1.733124e+09,train/duration_gap
797841,train_20241201_142030_1,other,,,,,,43073,-0.387286,2024-12-02 07:24:20.588971776,1.733124e+09,train/duration_gap
797894,train_20241201_142030_1,other,,,,,,43096,-0.387286,2024-12-02 07:24:20.901679104,1.733124e+09,train/duration_gap


In [4]:
pivot = episodic_policy.pivot(index='step', columns='run_name', values='value')
# pivot.reset_index(inplace=True)
# pivot = pivot.drop(columns='run_name')

In [5]:
new_df = pd.merge(pivot.reset_index(), duration_gap[['step','value']], on='step', how='left')
new_df = new_df.rename(columns={'value':'duration_gap'})
new_df = new_df.dropna()
new_df

Unnamed: 0,step,"train_episodic_policy_('wineventlog:security', '4624')_0","train_episodic_policy_('wineventlog:security', '4663')_0","train_episodic_policy_('wineventlog:security', '4663')_1","train_episodic_policy_('wineventlog:security', '4732')_0","train_episodic_policy_('wineventlog:security', '4732')_1","train_episodic_policy_('wineventlog:security', '4769')_0","train_episodic_policy_('wineventlog:security', '4769')_1","train_episodic_policy_('wineventlog:security', '5140')_0","train_episodic_policy_('wineventlog:security', '5140')_1","train_episodic_policy_('wineventlog:system', '7036')_0","train_episodic_policy_('wineventlog:system', '7036')_1","train_episodic_policy_('wineventlog:system', '7040')_0","train_episodic_policy_('wineventlog:system', '7040')_1","train_episodic_policy_('wineventlog:system', '7045')_0","train_episodic_policy_('wineventlog:system', '7045')_1",duration_gap
1912,1944,0.0,0.0,0.0,0.0,0.0,43200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.183286
1913,1945,0.0,43200.0,37070.0,0.0,41447.0,43200.0,5687.0,0.0,11257.0,0.0,0.0,0.0,0.0,27093.0,0.0,11.183286
1914,1946,0.0,25237.0,23863.0,0.0,0.0,43200.0,0.0,0.0,0.0,69.0,0.0,0.0,0.0,0.0,0.0,11.183286
1915,1947,35996.0,0.0,0.0,0.0,0.0,37547.0,0.0,19658.0,7759.0,0.0,35469.0,0.0,0.0,0.0,15674.0,11.183286
1916,1948,0.0,5424.0,2114.0,0.0,0.0,0.0,39758.0,0.0,23652.0,22669.0,0.0,9195.0,0.0,0.0,18991.0,11.183286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,43064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51170.0,0.0,0.0,0.0,-0.132429
5196,43066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43200.0,0.0,0.0,0.0,-0.387286
5197,43073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12350.0,0.0,32099.0,0.0,-0.387286
5198,43096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70271.0,0.0,0.0,0.0,-0.387286


In [21]:
new_df[new_df['duration_gap'] < 0][['step','duration_gap']]

Unnamed: 0,step,duration_gap
2269,2346,-0.677000
2270,2347,-0.677000
2271,2348,-0.677000
2272,2350,-0.677000
2273,2351,-0.677000
...,...,...
5195,43064,-0.132429
5196,43066,-0.387286
5197,43073,-0.387286
5198,43096,-0.387286


In [25]:
final_df = new_df.melt(id_vars='step', value_vars=new_df.columns[1:-1]).assign(duration_gap=lambda x: x['step'].map(new_df['duration_gap']))
final_df = final_df[final_df['duration_gap'] < 0]

In [31]:
final_df.groupby('variable').mean().sort_values('value')

Unnamed: 0_level_0,step,value,duration_gap
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"train_episodic_policy_('wineventlog:system', '7045')_1",3809.238739,1174.157658,-0.448653
"train_episodic_policy_('wineventlog:security', '5140')_1",3809.238739,1653.184685,-0.448653
"train_episodic_policy_('wineventlog:security', '4663')_1",3809.238739,1883.315315,-0.448653
"train_episodic_policy_('wineventlog:security', '4769')_1",3809.238739,2274.932432,-0.448653
"train_episodic_policy_('wineventlog:system', '7040')_1",3809.238739,2303.954955,-0.448653
"train_episodic_policy_('wineventlog:security', '4769')_0",3809.238739,2350.842342,-0.448653
"train_episodic_policy_('wineventlog:security', '4624')_0",3809.238739,2558.301802,-0.448653
"train_episodic_policy_('wineventlog:security', '5140')_0",3809.238739,2700.806306,-0.448653
"train_episodic_policy_('wineventlog:system', '7045')_0",3809.238739,3235.099099,-0.448653
"train_episodic_policy_('wineventlog:system', '7036')_0",3809.238739,3249.747748,-0.448653


In [34]:
# plot using express
import plotly.express as px
fig = px.line(final_df.groupby(['step','variable']).mean().reset_index(), x='step', y='value', color='variable', title='Episodic Policy Runs with Negative Duration Gap')

In [35]:
fig

In [3]:
sub_df = df[['rule_description', 'log_type', 'tag', 'value', 'event_id','step']]
sub_df = sub_df[sub_df['tag'].isin(['train/episodic_policy','train/rules_cpu_gap','train/rules_duration_gap'])]
sub_df['rule_description'] = sub_df['rule_description'].str.replace('train_rules_cpu_gap_','')
sub_df['rule_description'] = sub_df['rule_description'].str.replace('train_rules_duration_gap_','')
sub_df['rule_description'] = sub_df['rule_description'].str.replace('gap_','')
policy_df = sub_df[sub_df['tag'] == 'train/episodic_policy'].copy()
policy_df.drop(columns=['rule_description', 'tag'], inplace=True)
policy_df.rename(columns={'value': 'policy_value'}, inplace=True)
metrics_df = sub_df[sub_df['tag'] != 'train/episodic_policy'].copy()
metrics_df.drop(columns=['event_id'], inplace=True)
del sub_df

logtypes_mapping = {"Kerberoasting spn request with RC4 encryption": "('wineventlog:security', '4769')",
                    "Clop Ransomware Known Service Name": "('wineventlog:system', '7045')",
                    'ESCU Network Share Discovery Via Dir Command Rule':"('wineventlog:security', '5140')"}
metrics_df['log_type'] = metrics_df['rule_description'].map(logtypes_mapping)
final_df = pd.merge(metrics_df, policy_df, on=['log_type', 'step'])
final_df

Unnamed: 0,rule_description,log_type,tag,value,step,policy_value,event_id
0,Clop Ransomware Known Service Name,"('wineventlog:system', '7045')",train/rules_cpu_gap,3.088571,1944,0.0,0
1,Clop Ransomware Known Service Name,"('wineventlog:system', '7045')",train/rules_cpu_gap,3.088571,1944,0.0,1
2,Clop Ransomware Known Service Name,"('wineventlog:system', '7045')",train/rules_duration_gap,1.036714,1944,0.0,0
3,Clop Ransomware Known Service Name,"('wineventlog:system', '7045')",train/rules_duration_gap,1.036714,1944,0.0,1
4,Clop Ransomware Known Service Name,"('wineventlog:system', '7045')",train/rules_cpu_gap,3.088571,1945,27093.0,0
...,...,...,...,...,...,...,...
39451,Kerberoasting spn request with RC4 encryption,"('wineventlog:security', '4769')",train/rules_duration_gap,-0.205571,43096,0.0,1
39452,Kerberoasting spn request with RC4 encryption,"('wineventlog:security', '4769')",train/rules_cpu_gap,1.969286,43109,0.0,0
39453,Kerberoasting spn request with RC4 encryption,"('wineventlog:security', '4769')",train/rules_cpu_gap,1.969286,43109,0.0,1
39454,Kerberoasting spn request with RC4 encryption,"('wineventlog:security', '4769')",train/rules_duration_gap,-0.205571,43109,0.0,0


In [4]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def analyze_log_type_influence(df):
    """
    Analyze how the amount of each log type influences its rule metrics
    """
    # Group unique rule descriptions
    rule_descriptions = df['rule_description'].unique()
    
    # Create figure with subplots - one for each rule
    fig = make_subplots(
        rows=len(rule_descriptions),
        cols=1,
        subplot_titles=[f"{desc}" for desc in rule_descriptions],
        vertical_spacing=0.3
    )
    
    # Define colors for different metrics
    colors = {
        'train/rules_cpu_gap': '#1f77b4',      # Blue for CPU gap
        'train/rules_duration_gap': '#ff7f0e'   # Orange for duration gap
    }
    
    # Process each rule
    for idx, rule in enumerate(rule_descriptions, 1):
        rule_data = df[df['rule_description'] == rule].copy()
        
        # Add cumulative count for this rule
        rule_data['cumulative_count'] = range(len(rule_data))
        
        # Plot each metric type
        for tag in rule_data['tag'].unique():
            metric_data = rule_data[rule_data['tag'] == tag].copy()
            
            # Add the trace
            fig.add_trace(
                go.Scatter(
                    x=metric_data['cumulative_count'],
                    y=metric_data['value'],
                    mode='markers+lines',
                    name=tag.split('/')[-1],
                    marker=dict(
                        symbol='circle' if metric_data['event_id'].iloc[0] == 0 else 'x',
                        size=8,
                        color=colors[tag]
                    ),
                    line=dict(color=colors[tag]),
                    legendgroup=tag,
                    showlegend=(idx == 1),
                    hovertemplate=(
                        "Count: %{x}<br>" +
                        "Value: %{y:.4f}<br>" +
                        "Event ID: %{customdata[0]}<br>" +
                        "Policy Value: %{customdata[1]}<br>" +
                        "<extra></extra>"
                    ),
                    customdata=list(zip(metric_data['event_id'], metric_data['policy_value']))
                ),
                row=idx,
                col=1
            )
            
            # Add trend line
            z = np.polyfit(metric_data['cumulative_count'], metric_data['value'], 1)
            p = np.poly1d(z)
            
            fig.add_trace(
                go.Scatter(
                    x=metric_data['cumulative_count'],
                    y=p(metric_data['cumulative_count']),
                    mode='lines',
                    line=dict(dash='dash', color=colors[tag]),
                    name=f'Trend {tag.split("/")[-1]}',
                    showlegend=False
                ),
                row=idx,
                col=1
            )

        # Calculate correlation between count and metrics
        correlations = []
        for tag in rule_data['tag'].unique():
            metric_data = rule_data[rule_data['tag'] == tag]
            corr = np.corrcoef(metric_data['cumulative_count'], metric_data['value'])[0,1]
            correlations.append(f"{tag.split('/')[-1]}: {corr:.3f}")

        # Add correlation annotation
        fig.add_annotation(
            text=f"Correlations:<br>" + "<br>".join(correlations),
            xref="x domain",
            yref="y domain",
            x=1.1,
            y=0.9,
            showarrow=False,
            row=idx,
            col=1
        )

        # Update axes labels
        fig.update_xaxes(
            title_text="Cumulative Count of Log Type",
            row=idx,
            col=1,
            showgrid=True
        )
        fig.update_yaxes(
            title_text="Metric Value",
            row=idx,
            col=1,
            showgrid=True
        )

    # Update layout
    fig.update_layout(
        height=400 * len(rule_descriptions),
        title="Impact of Log Type Amount on Rule Metrics",
        showlegend=True,
        legend=dict(
            title="Metrics",
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.05
        ),
        margin=dict(r=250, t=100, b=50)  # Increased right margin for correlations
    )

    return fig

def print_metric_statistics(df):
    """Print detailed statistics about how log amounts affect metrics"""
    print("\nDetailed Metric Analysis:")
    print("-" * 50)
    
    for rule in df['rule_description'].unique():
        print(f"\nRule: {rule}")
        rule_data = df[df['rule_description'] == rule]
        
        print("\nMetric Statistics:")
        for tag in rule_data['tag'].unique():
            metric_data = rule_data[rule_data['tag'] == tag]
            print(f"\n{tag.split('/')[-1]}:")
            print(f"  Average value: {metric_data['value'].mean():.4f}")
            print(f"  Standard deviation: {metric_data['value'].std():.4f}")
            print(f"  Min value: {metric_data['value'].min():.4f}")
            print(f"  Max value: {metric_data['value'].max():.4f}")
            
            # Calculate rate of change
            if len(metric_data) > 1:
                value_changes = metric_data['value'].diff()
                print(f"  Average rate of change: {value_changes.mean():.4f}")
                print(f"  Median rate of change: {value_changes.median():.4f}")

if __name__ == "__main__":
    # Your DataFrame is already loaded as df
    
    # Create visualization
    fig = analyze_log_type_influence(final_df)
    
    # Save visualization
    fig.write_html('log_type_influence.html')
    
    # Print statistical analysis
    print_metric_statistics(final_df)




Detailed Metric Analysis:
--------------------------------------------------

Rule: Clop Ransomware Known Service Name

Metric Statistics:

rules_cpu_gap:
  Average value: -0.6520
  Standard deviation: 9.8235
  Min value: -21.5114
  Max value: 120.5543
  Average rate of change: -0.0015
  Median rate of change: 0.0000

rules_duration_gap:
  Average value: 0.1769
  Standard deviation: 0.4164
  Min value: -0.4944
  Max value: 2.6926
  Average rate of change: -0.0002
  Median rate of change: 0.0000

Rule: ESCU Network Share Discovery Via Dir Command Rule

Metric Statistics:

rules_cpu_gap:
  Average value: 2.1199
  Standard deviation: 14.0521
  Min value: -33.0443
  Max value: 125.7514
  Average rate of change: -0.0002
  Median rate of change: 0.0000

rules_duration_gap:
  Average value: 0.2105
  Standard deviation: 0.4789
  Min value: -1.0383
  Max value: 2.8616
  Average rate of change: -0.0002
  Median rate of change: 0.0000

Rule: Kerberoasting spn request with RC4 encryption

Metric 

In [7]:
df


Unnamed: 0,run_name,run_type,log_type,event_id,sub_id,rule_type,rule_description,step,value,timestamp,wall_time,tag
0,"train_episodic_policy_('wineventlog:security',...",episodic_policy,"('wineventlog:security', '4663')",0,,,,1,0.0,2024-12-01 12:22:55.118574848,1.733056e+09,train/episodic_policy
1,"train_episodic_policy_('wineventlog:security',...",episodic_policy,"('wineventlog:security', '4663')",0,,,,2,22830.0,2024-12-01 12:22:55.146232064,1.733056e+09,train/episodic_policy
2,"train_episodic_policy_('wineventlog:security',...",episodic_policy,"('wineventlog:security', '4663')",0,,,,3,29143.0,2024-12-01 12:22:55.163619840,1.733056e+09,train/episodic_policy
3,"train_episodic_policy_('wineventlog:security',...",episodic_policy,"('wineventlog:security', '4663')",0,,,,4,43200.0,2024-12-01 12:22:55.182142976,1.733056e+09,train/episodic_policy
4,"train_episodic_policy_('wineventlog:security',...",episodic_policy,"('wineventlog:security', '4663')",0,,,,5,0.0,2024-12-01 12:22:55.202255872,1.733056e+09,train/episodic_policy
...,...,...,...,...,...,...,...,...,...,...,...,...
653395,train_rules_write_count_gap_Windows Event For ...,rules,,,,other,count_gap_Windows Event For Service Disabled,43064,911.0,2024-12-02 07:20:22.598116096,1.733124e+09,train/rules_write_count_gap
653396,train_rules_write_count_gap_Windows Event For ...,rules,,,,other,count_gap_Windows Event For Service Disabled,43066,-989.0,2024-12-02 07:24:20.473052160,1.733124e+09,train/rules_write_count_gap
653397,train_rules_write_count_gap_Windows Event For ...,rules,,,,other,count_gap_Windows Event For Service Disabled,43073,-989.0,2024-12-02 07:24:20.597249024,1.733124e+09,train/rules_write_count_gap
653398,train_rules_write_count_gap_Windows Event For ...,rules,,,,other,count_gap_Windows Event For Service Disabled,43096,-989.0,2024-12-02 07:24:20.915096064,1.733124e+09,train/rules_write_count_gap


In [60]:
gap_df.groupby('num_rules').mean(numeric_only=True).reset_index()

Unnamed: 0,num_rules,wall_time,step,value,pct
0,3,1730699000.0,1000.0,-0.251,11.320137
1,4,1730829000.0,951.49,0.37337,6.332573
2,5,1730910000.0,951.49,0.084277,6.382544
3,6,1730954000.0,1379.990596,1.129935,16.825529


In [61]:
gap_df = pd.read_csv(f'/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/best_experiments/processed_tesnorboard/duration_gap.csv')
after_duration_df = pd.read_csv(f'/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/best_experiments/processed_tesnorboard/duration.csv')
env_name_to_rules_num = {"splunk_train-v1":3, "splunk_train-v5":4, "splunk_train-v7":5,"splunk_train-v8":6}
# compute the before duration
before_duration_df = after_duration_df.copy()
before_duration_df['value'] = after_duration_df['value'] - gap_df['value']
# chart of the gap in percantage   
gap_df['pct'] = (gap_df['value'] / before_duration_df['value']) * 100
gap_df['pct'] = gap_df['pct'].replace([np.inf, -np.inf], np.nan)
gap_df = gap_df[gap_df['step']>900]
gap_df['num_rules'] = gap_df['env_name'].map(env_name_to_rules_num)
# smooth the data
gap_df['pct'] = gap_df['pct'].rolling(window=100).mean()
gap_df.dropna(inplace=True)
agg_gap_df = gap_df.groupby('num_rules').mean(numeric_only=True).reset_index()
# drow the chart
fig = px.bar(agg_gap_df, y='pct', x='num_rules', title='Gap in percentage', labels={'pct': 'Gap in percentage', 'env_name': 'Environment name'})
fig.show()

In [73]:
# make the line chart of the duration of the before and after for each environment
before_duration_df['num_rules'] = before_duration_df['env_name'].map(env_name_to_rules_num)
after_duration_df['num_rules'] = after_duration_df['env_name'].map(env_name_to_rules_num)
before_duration_df['type'] = 'before'
after_duration_df['type'] = 'after'
df = pd.concat([before_duration_df, after_duration_df])
# df = df[df['step']>900]
for num_rules in df['num_rules'].unique():
    # df.loc[df['num_rules'] == num_rules, 'value'] = df.loc[df['num_rules'] == num_rules, 'value'].rolling(window=100).mean()
    fig = px.line(df[df['num_rules']==num_rules], x='step', y='value', color='type', title=f'Duration of the before and after for {num_rules} rules', labels={'value': 'Duration', 'step': 'Step'})
    fig.show()

In [6]:
for additional_percentage in [0.2]:
    df = pd.read_csv(f'/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/experiments_____/processed_tesnorboard/duration_gap.csv')
    df = df[df['additional_percentage'] == additional_percentage]
    # df.loc[df['policy'] == 'mlp','policy'] = df.loc[df['policy'] == 'mlp','reward_calculator_version'].apply(lambda x: f"{x}_agent")
    df = df.drop(columns=['run_name']).groupby('policy').mean().reset_index()
    df[['event_code', 'is_triggered']] = np.zeros((len(df), 2))
    df[['event_code', 'is_triggered']] = df['policy'].str.split('_', expand=True)
    bar_fig = px.bar(df, x='policy', y='value', title=f'Duration gap by policy (additional percentage: {additional_percentage})', color='event_code', color_discrete_sequence=px.colors.qualitative.Alphabet)
    bar_fig.show()
    bar_fig = px.bar(df, x='policy', y='value', title='Duration gap by policy', color='is_triggered', color_discrete_sequence=px.colors.qualitative.Alphabet)
    bar_fig.show()


In [15]:
df.sort_values('value', ascending=False)

Unnamed: 0,policy,wall_time,step,value,additional_percentage,reward_calculator_version,event_code,is_triggered
17,equal_1,1728437000.0,20.0,2.595394,0.2,16.0,equal,1
6,4769_0,1728422000.0,20.0,2.478072,0.2,16.0,4769,0
2,4663_0,1728417000.0,20.0,2.356892,0.2,16.0,4663,0
5,4732_1,1728421000.0,20.0,2.346982,0.2,16.0,4732,1
7,4769_1,1728423000.0,20.0,2.304006,0.2,16.0,4769,1
3,4663_1,1728418000.0,20.0,1.949312,0.2,16.0,4663,1
16,equal_0,1728436000.0,20.0,1.901548,0.2,16.0,equal,0
9,5140_1,1728426000.0,20.0,1.832197,0.2,16.0,5140,1
15,7045_1,1728433000.0,20.0,1.592042,0.2,16.0,7045,1
8,5140_0,1728424000.0,20.0,1.358805,0.2,16.0,5140,0


In [7]:
for additional_percentage in [0.2, 1]:
    df = pd.read_csv('/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/experiments_____/processed_tesnorboard/p_values.csv')
    df = df[df['additional_percentage'] == additional_percentage]
    df = df.drop(columns=['run_name']).groupby('policy').mean().reset_index()
    df[['event_code', 'is_triggered']] = df['policy'].str.split('_', expand=True)
    bar_fig = px.bar(df, x='policy', y='value', title=f'P values by policy (additional percentage: {additional_percentage})', color='event_code', color_discrete_sequence=px.colors.qualitative.Alphabet)
    bar_fig.show()
    bar_fig = px.bar(df, x='policy', y='value', title='P values by policy', color='is_triggered', color_discrete_sequence=px.colors.qualitative.Alphabet)
    bar_fig.show()

In [60]:
import os
import splunklib.client as client
import splunklib.results as results
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
import psutil

def monitor_search_resources(host, port, username, password, search_query, duration_minutes):
    # Connect to Splunk
    service = client.connect(
        host=host,
        port=port,
        username=username,
        password=password
    )

    # Start the search job
    job = service.jobs.create(search_query)

    start_time = datetime.now()
    end_time = start_time + timedelta(minutes=duration_minutes)

    while datetime.now() < end_time:
        # Get the latest statistics
        job.refresh()
        stats = job.content
        
        # Extract relevant resource metrics
        scan_count = stats.get('scanCount', 0)
        event_count = stats.get('eventCount', 0)
        result_count = stats.get('resultCount', 0)
        disk_usage = stats.get('diskUsage', 0)
        run_duration = stats.get('runDuration', 0)
        pid = int(stats.get('pid', 0))
        print('pid: ', pid)
        if pid != 0:
            try:
                process = psutil.Process(pid)
                with process.oneshot():
                    cpu_percent = process.cpu_percent(interval=1)
                    cpu_num = psutil.cpu_count()
                    cpu_times = process.cpu_times()
                    memory_info = process.memory_info()
                    io_counters = process.io_counters()
            except psutil.NoSuchProcess:
                print(f"Process with PID {pid} not found.")
                return None
            except psutil.AccessDenied:
                print(f"Access denied to process with PID {pid}.")
                return None
        else:
            cpu_percent = 0
            cpu_num = 0
            cpu_times = 0
            memory_info = 0
            io_counters = 0

        
        # Print or store the metrics as needed
        print(f"Timestamp: {datetime.now()}")
        print(f"Scan Count: {scan_count}")
        print(f"Event Count: {event_count}")
        print(f"Result Count: {result_count}")
        print(f"Disk Usage: {disk_usage} bytes")
        print(f"Run Duration: {run_duration} seconds")
        print(f"CPU Usage: {cpu_percent} %")
        print(f"CPU Num: {cpu_num}")
        print(f"CPU Times: {cpu_times}")
        print(f"Memory Info: {memory_info}")
        print(f"IO Counters: {io_counters}")
        # print(f"Network In: {network_bytes_in} bytes")
        # print(f"Network Out: {network_bytes_out} bytes")
        print("---")
        
        time.sleep(0.01)

    # Cancel the job when done
    job.cancel()

# Example usage
if __name__ == "__main__":
    load_dotenv('/home/shouei/GreenSecurity-FirstExperiment/SplunkResearch/src/.env')
    monitor_search_resources(
        host=os.getenv("SPLUNK_HOST"),
        port=os.getenv("SPLUNK_PORT"),
        username=os.getenv("SPLUNK_USERNAME"),
        password=os.getenv("SPLUNK_PASSWORD"),
        search_query='search `wineventlog_security` EventCode=5140 ShareName IN("\\\\*\\ADMIN$","\\\\*\\C$","*\\\\*\\IPC$") AccessMask= 0x1  earliest=1717437600.0 latest=1717452000.0| stats min(_time) as firstTime max(_time) as lastTime count by ShareName SourceAddress ObjectType AccountName AccountDomain SourcePort AccessMask Computer | `security_content_ctime(firstTime)` | `security_content_ctime(lastTime)` | `network_share_discovery_via_dir_command_filter`',
        # search_query='search `wineventlog_system` EventCode=7040  Message = "*service was changed from demand start to disabled."  earliest=1717408800.0 latest=1717416000.0| stats count min(_time) as firstTime max(_time) as lastTime by ComputerName EventCode Message User Sid service service_name | `security_content_ctime(firstTime)` | `security_content_ctime(lastTime)` | `windows_event_for_service_disabled_filter`',
        duration_minutes=3
    )

pid:  0
Timestamp: 2024-10-08 12:40:27.444310
Scan Count: 0
Event Count: 0
Result Count: 0
Disk Usage: 28672 bytes
Run Duration: 0 seconds
CPU Usage: 0 %
CPU Num: 0
CPU Times: 0
Memory Info: 0
IO Counters: 0
---
pid:  0
Timestamp: 2024-10-08 12:40:27.464351
Scan Count: 0
Event Count: 0
Result Count: 0
Disk Usage: 28672 bytes
Run Duration: 0 seconds
CPU Usage: 0 %
CPU Num: 0
CPU Times: 0
Memory Info: 0
IO Counters: 0
---
pid:  0
Timestamp: 2024-10-08 12:40:27.486769
Scan Count: 0
Event Count: 0
Result Count: 0
Disk Usage: 28672 bytes
Run Duration: 0 seconds
CPU Usage: 0 %
CPU Num: 0
CPU Times: 0
Memory Info: 0
IO Counters: 0
---
pid:  0
Timestamp: 2024-10-08 12:40:27.506717
Scan Count: 0
Event Count: 0
Result Count: 0
Disk Usage: 36864 bytes
Run Duration: 0 seconds
CPU Usage: 0 %
CPU Num: 0
CPU Times: 0
Memory Info: 0
IO Counters: 0
---
pid:  28617
Access denied to process with PID 28617.
