<a href="https://colab.research.google.com/github/slrico/Log-clickstream-Analysis/blob/main/Apache_Spark_Log_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv('./Apache_2k.log_structured.csv')
df.head()


Unnamed: 0,LineId,Time,Level,Content,EventId,EventTemplate
0,1,Sun Dec 04 04:47:44 2005,notice,workerEnv.init() ok /etc/httpd/conf/workers2.p...,E2,workerEnv.init() ok <*>
1,2,Sun Dec 04 04:47:44 2005,error,mod_jk child workerEnv in error state 6,E3,mod_jk child workerEnv in error state <*>
2,3,Sun Dec 04 04:51:08 2005,notice,jk2_init() Found child 6725 in scoreboard slot 10,E1,jk2_init() Found child <*> in scoreboard slot <*>
3,4,Sun Dec 04 04:51:09 2005,notice,jk2_init() Found child 6726 in scoreboard slot 8,E1,jk2_init() Found child <*> in scoreboard slot <*>
4,5,Sun Dec 04 04:51:09 2005,notice,jk2_init() Found child 6728 in scoreboard slot 6,E1,jk2_init() Found child <*> in scoreboard slot <*>


In [15]:
# Assuming your DataFrame is df, and 'Content' contains your log lines

# Define your extract_template() as above
import re

def extract_template(content):
    # Replace file paths
    content = re.sub(r'/[^ ]+', '<PATH>', content)
    # Replace IP addresses
    content = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '<IP>', content)
    # Replace all numbers
    content = re.sub(r'\b\d+\b', '<ID>', content)
    # Replace dates/times (example pattern)
    content = re.sub(r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}', '<DATETIME>', content)
    # Replace placeholders
    content = re.sub(r'<\*>', '<PLACEHOLDER>', content)
    # Keep function names, remove parentheses
    content = re.sub(r'(\b[A-Za-z_][A-Za-z0-9_]*)(\(\))', r'\1', content)
    # Normalize spaces
    content = re.sub(r'\s+', ' ', content).strip()
    return content

# Apply to create templates
df['Template'] = df['Content'].apply(extract_template)

# Now you can group by templates
template_counts = df['Template'].value_counts()

# Or get all logs belonging to a specific template
grouped = df.groupby('Template')

In [9]:
import difflib

# Store original content before cleaning
df['Original_Content'] = df['Content']

# Reapply your cleaning function
df['Cleaned_Content'] = df['Content'].apply(clean_content)

# Function to generate a detailed diff
def generate_diff(orig, cleaned):
    diff = difflib.ndiff(orig.splitlines(), cleaned.splitlines())
    return '\n'.join(diff)

# Create a column with the diff
df['Difference'] = df.apply(
    lambda row: generate_diff(row['Original_Content'], row['Cleaned_Content']), axis=1
)

# Show a sample of differences
print(df[['LineId', 'Difference']].head(10))

   LineId                                         Difference
0       1  - workerEnv.init() ok /etc/httpd/conf/workers2...
1       2  - mod_jk child workerEnv in error state 6\n?  ...
2       3  - jk2_init() Found child 6725 in scoreboard sl...
3       4  - jk2_init() Found child 6726 in scoreboard sl...
4       5  - jk2_init() Found child 6728 in scoreboard sl...
5       6  - workerEnv.init() ok /etc/httpd/conf/workers2...
6       7  - workerEnv.init() ok /etc/httpd/conf/workers2...
7       8  - workerEnv.init() ok /etc/httpd/conf/workers2...
8       9  - mod_jk child workerEnv in error state 6\n?  ...
9      10  - mod_jk child workerEnv in error state 6\n?  ...


In [10]:
import re

def extract_variables(content):
    variables = {}

    # Extract file paths
    paths = re.findall(r'/[^ ]+', content)
    variables['paths'] = paths if paths else []

    # Extract IP addresses
    ips = re.findall(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', content)
    variables['ips'] = ips if ips else []

    # Extract numeric IDs
    ids = re.findall(r'\b\d+\b', content)
    variables['ids'] = ids if ids else []

    # Extract timestamps (example pattern)
    timestamps = re.findall(r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}', content)
    variables['timestamps'] = timestamps if timestamps else []

    # You can add more regexes for other variable parts as needed

    return variables

# Apply this to your DataFrame:
df['Extracted_Variables'] = df['Content'].apply(extract_variables)

In [11]:
from collections import Counter

# Count most common file paths
all_paths = sum(df['Extracted_Variables'].apply(lambda x: x['paths']), [])
common_paths = Counter(all_paths).most_common(10)
print("Most common paths:", common_paths)

Most common paths: [('/etc/httpd/conf/workers2.properties', 569), ('/var/www/html/', 32)]


In [16]:
import re
import pandas as pd
from collections import Counter

# 1. Define your extract_template() function
def extract_template(content):
    # Replace file paths
    content = re.sub(r'/[^ ]+', '<PATH>', content)
    # Replace IP addresses
    content = re.sub(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', '<IP>', content)
    # Replace all numbers
    content = re.sub(r'\b\d+\b', '<ID>', content)
    # Replace dates/times
    content = re.sub(r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}', '<DATETIME>', content)
    # Replace placeholders
    content = re.sub(r'<\*>', '<PLACEHOLDER>', content)
    # Keep function names
    content = re.sub(r'(\b[A-Za-z_][A-Za-z0-9_]*)(\(\))', r'\1', content)
    # Normalize spaces
    content = re.sub(r'\s+', ' ', content).strip()
    return content

# 2. Apply extract_template() to create 'Template' column
df['Template'] = df['Content'].apply(extract_template)

# 3. Now, extract variables for each content
def extract_variables(content):
    variables = {}
    variables['paths'] = re.findall(r'/[^ ]+', content)
    variables['ips'] = re.findall(r'\b\d{1,3}(?:\.\d{1,3}){3}\b', content)
    variables['ids'] = re.findall(r'\b\d+\b', content)
    variables['timestamps'] = re.findall(r'(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}', content)
    return variables

# 4. Apply variable extraction on original content
df['Variables'] = df['Content'].apply(extract_variables)

# 5. Group by 'Template'
grouped = df.groupby('Template')

# 6. Analyze each group
for template, group in grouped:
    print(f"\nTemplate: {template}")
    print(f"Number of messages: {len(group)}")

    # Aggregate all variable values within group
    all_paths = sum(group['Variables'].apply(lambda x: x['paths']).tolist(), [])
    all_ips = sum(group['Variables'].apply(lambda x: x['ips']).tolist(), [])
    all_ids = sum(group['Variables'].apply(lambda x: x['ids']).tolist(), [])
    all_timestamps = sum(group['Variables'].apply(lambda x: x['timestamps']).tolist(), [])

    # Show most common variable values
    print("Most common paths:", Counter(all_paths).most_common(5))
    print("Most common IPs:", Counter(all_ips).most_common(5))
    print("Most common IDs:", Counter(all_ids).most_common(5))
    print("Most common Timestamps:", Counter(all_timestamps).most_common(3))


Template: [client <IP>] Directory index forbidden by rule: <PATH>
Number of messages: 32
Most common paths: [('/var/www/html/', 32)]
Most common IPs: [('222.166.160.184', 1), ('63.13.186.196', 1), ('147.31.138.75', 1), ('207.203.80.15', 1), ('218.76.139.20', 1)]
Most common IDs: [('218', 6), ('216', 4), ('61', 4), ('207', 3), ('15', 3)]
Most common Timestamps: []

Template: jk2_init Can't find child <ID> in scoreboard
Number of messages: 12
Most common paths: []
Most common IPs: []
Most common IDs: [('1566', 1), ('1567', 1), ('2082', 1), ('2085', 1), ('2086', 1)]
Most common Timestamps: []

Template: jk2_init Found child <ID> in scoreboard slot <ID>
Number of messages: 836
Most common paths: []
Most common IPs: []
Most common IDs: [('8', 194), ('7', 194), ('6', 189), ('9', 160), ('10', 75)]
Most common Timestamps: []

Template: mod_jk child init <ID> -<ID>
Number of messages: 12
Most common paths: []
Most common IPs: []
Most common IDs: [('1', 12), ('2', 12)]
Most common Timestamps: [

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Cleaned_Content_Encoded'] = label_encoder.fit_transform(df['Cleaned_Content'])
#print(df[['LineId', 'Cleaned_Content', 'Cleaned_Content_Encoded']])


In [None]:
def clean_event_template(template):
    # Remove placeholders like <*>
    cleaned = re.sub(r'<\*\>', '', template)

    # Replace numeric identifiers with <ID>
    cleaned = re.sub(r'\b\d+\b', '<ID>', cleaned)

    # Remove specific unwanted keywords while preserving function/command patterns
    cleaned = re.sub(r'\b(state)\b', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()  # Normalize whitespace

    return cleaned

# Apply the cleaning function to the EventTemplate column
df['Cleaned_EventTemplate'] = df['EventTemplate'].apply(clean_event_template)

# Show the resulting DataFrame
print(df[['LineId', 'EventTemplate', 'Cleaned_EventTemplate']])

      LineId                                      EventTemplate  \
0          1                            workerEnv.init() ok <*>   
1          2          mod_jk child workerEnv in error state <*>   
2          3  jk2_init() Found child <*> in scoreboard slot <*>   
3          4  jk2_init() Found child <*> in scoreboard slot <*>   
4          5  jk2_init() Found child <*> in scoreboard slot <*>   
...      ...                                                ...   
1995    1996          mod_jk child workerEnv in error state <*>   
1996    1997  jk2_init() Found child <*> in scoreboard slot <*>   
1997    1998  jk2_init() Found child <*> in scoreboard slot <*>   
1998    1999                            workerEnv.init() ok <*>   
1999    2000          mod_jk child workerEnv in error state <*>   

                          Cleaned_EventTemplate  
0                           workerEnv.init() ok  
1               mod_jk child workerEnv in error  
2     jk2_init() Found child in scoreboard sl

In [None]:
# Apply the cleaning function to the EventTemplate column
df['Cleaned_EventTemplate'] = df['EventTemplate'].apply(clean_event_template)

# Create LabelEncoder instance
label_encoder = LabelEncoder()

# Label encode the Cleaned_EventTemplate
df['Cleaned_EventTemplate_Encoded'] = label_encoder.fit_transform(df['Cleaned_EventTemplate'])

# Show the resulting DataFrame
print(df[['LineId', 'EventTemplate', 'Cleaned_EventTemplate', 'Cleaned_EventTemplate_Encoded']])

      LineId                                      EventTemplate  \
0          1                            workerEnv.init() ok <*>   
1          2          mod_jk child workerEnv in error state <*>   
2          3  jk2_init() Found child <*> in scoreboard slot <*>   
3          4  jk2_init() Found child <*> in scoreboard slot <*>   
4          5  jk2_init() Found child <*> in scoreboard slot <*>   
...      ...                                                ...   
1995    1996          mod_jk child workerEnv in error state <*>   
1996    1997  jk2_init() Found child <*> in scoreboard slot <*>   
1997    1998  jk2_init() Found child <*> in scoreboard slot <*>   
1998    1999                            workerEnv.init() ok <*>   
1999    2000          mod_jk child workerEnv in error state <*>   

                          Cleaned_EventTemplate  Cleaned_EventTemplate_Encoded  
0                           workerEnv.init() ok                              5  
1               mod_jk child work

In [None]:
# have a timeseries plot
# the target is the level
# find large models for classification task

In [None]:
!pip install transformers==4.31.0

from transformers import XLNetForSequenceClassification, XLNetTokenizer

model_name = "xlnet-base-cased"  # Choose an XLNet variant
tokenizer = XLNetTokenizer.from_pretrained(model_name)
model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=your_num_labels)


In [None]:
!pip install transformers==4.31.0

from transformers import ElectraForSequenceClassification, ElectraTokenizer

model_name = "google/electra-base-discriminator"  # Choose an ELECTRA variant
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=your_num_labels)

In [None]:
!pip install transformers==4.31.0

from transformers import DebertaForSequenceClassification, DebertaTokenizer

model_name = "microsoft/deberta-base"  # Choose a DeBERTa variant
tokenizer = DebertaTokenizer.from_pretrained(model_name)
model = DebertaForSequenceClassification.from_pretrained(model_name, num_labels=your_num_labels)

In [None]:
!pip install transformers==4.31.0

from transformers import LongformerForSequenceClassification, LongformerTokenizer

model_name = "allenai/longformer-base-4096"  # Choose a Longformer variant
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerForSequenceClassification.from_pretrained(model_name, num_labels=your_num_labels)


In [None]:
# Apache Spark Models
from pyspark.ml.classification import GBTClassifier

# Assuming 'df' is your Spark DataFrame with features and label column
gbt = GBTClassifier(featuresCol='features', labelCol='label')
gbt_model = gbt.fit(df)

# Make predictions
predictions = gbt_model.transform(df)


# Better Visualizations and Story telling