In [133]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Load Data

In [134]:
# Read the raw log records and coerce the timestamp column to datetimes
# in one step, so `df` is only ever bound to the parsed frame.
df = pd.read_parquet('../data/training_logs.parquet').assign(
    timestamp=lambda logs: pd.to_datetime(logs['timestamp'])
)

# Quick orientation: size, covered time span, and available columns.
earliest, latest = df['timestamp'].min(), df['timestamp'].max()
print(f"Total logs: {len(df):,}")
print(f"Date range: {earliest} to {latest}")
print(f"\nColumns: {list(df.columns)}")
df.head()

Total logs: 131,935
Date range: 2025-12-17 23:44:21.030517+00:00 to 2025-12-26 23:44:14.821532+00:00

Columns: ['timestamp', 'level', 'service', 'message', 'is_anomaly', 'anomaly_type']


Unnamed: 0,timestamp,level,service,message,is_anomaly,anomaly_type
0,2025-12-17 23:44:21.030517+00:00,DEBUG,auth-service,Cache miss for key: cache_70,0,
1,2025-12-17 23:44:26.617160+00:00,WARN,api-gateway,Slow query detected - duration: 184ms,0,
2,2025-12-17 23:44:31.353882+00:00,INFO,notification-service,"Database query completed - rows: 297, duration...",0,
3,2025-12-17 23:44:38.295143+00:00,DEBUG,user-service,Cache miss for key: cache_89,0,
4,2025-12-17 23:44:44.889385+00:00,INFO,order-service,User session created - session_id: sess_665158,0,


## 1. Missing Values Check

In [135]:
# Count nulls per column and express each count as a percentage of all rows.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})

# Only columns that actually contain nulls are listed.
print("Missing Values Summary:")
print(missing_df[missing_df['Missing Count'] > 0])

if missing.sum() == 0:
    # Check mark (U+2713) written as an escape so the message survives
    # encoding round trips of the notebook file.
    print("\n\u2713 No missing values found!")


# Nulls in anomaly_type are expected: that column is not populated for the
# non-anomaly data points.

Missing Values Summary:
              Missing Count  Percentage
anomaly_type         131164   99.415621


## 2. Distribution Analysis - Log Severity Levels

In [136]:
# Severity distribution.
severity_counts = df['level'].value_counts()
severity_pct = (severity_counts / len(df)) * 100

print("Severity Level Distribution:")
for severity, count in severity_counts.items():
    pct = severity_pct[severity]
    print(f"  {severity:10s}: {count:7,} ({pct:5.2f}%)")

Severity Level Distribution:
  INFO      :  85,358 (64.70%)
  DEBUG     :  25,933 (19.66%)
  WARN      :  13,378 (10.14%)
  ERROR     :   6,295 ( 4.77%)
  FATAL     :     971 ( 0.74%)


In [137]:
# Two views of the same severity tallies: absolute counts on the left (bar)
# and relative shares on the right (pie).
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'bar'}, {'type': 'pie'}]],
    subplot_titles=('Log Severity Distribution', 'Severity Level Proportion'),
)

# Build both traces first, then place them into the grid.
severity_bar = go.Bar(
    x=severity_counts.index,
    y=severity_counts.values,
    text=severity_counts.values,
    textposition='auto',
    marker_color='steelblue',
)
severity_pie = go.Pie(
    labels=severity_counts.index,
    values=severity_counts.values,
    textinfo='label+percent',
)
fig.add_trace(severity_bar, row=1, col=1)
fig.add_trace(severity_pie, row=1, col=2)

# Axis titles only apply to the bar subplot; the pie has no axes.
fig.update_xaxes(title_text='Severity Level', row=1, col=1)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_layout(showlegend=False, height=500)
fig.show()

## 3. Temporal Patterns

In [138]:
# Extract temporal features.
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.day_name()
df['date'] = df['timestamp'].dt.date

### Hourly Pattern

In [139]:
# Logs per hour.
hourly = df.groupby('hour').size()

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=hourly.index,
        y=hourly.values,
        marker_color='coral',
        text=hourly.values,
        textposition='auto'
    )
)

fig.update_layout(
    title='Log Frequency by Hour of Day',
    xaxis_title='Hour',
    yaxis_title='Number of Logs',
    height=500
)
fig.show()

print("\nPeak hours:")
print(hourly.nlargest(3))


Peak hours:
hour
13    6864
16    6810
17    6809
dtype: int64


### Daily Pattern

In [140]:
# Logs per day of week.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily = df.groupby('day_of_week').size().reindex(day_order)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=daily.index,
        y=daily.values,
        marker_color='mediumseagreen',
        text=daily.values,
        textposition='auto'
    )
)

fig.update_layout(
    title='Log Frequency by Day of Week',
    xaxis_title='Day of Week',
    yaxis_title='Number of Logs',
    height=500
)
fig.show()

### Time Series View

In [141]:
# Resample to see log volume over time.
time_series = df.set_index('timestamp').resample('1h').size().reset_index(name='count')

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=time_series['timestamp'],
        y=time_series['count'],
        mode='lines',
        line=dict(color='darkblue', width=2),
        fill='tozeroy',
        fillcolor='rgba(70, 130, 180, 0.3)'
    )
)

fig.update_layout(
    title='Log Volume Over Time (Hourly)',
    xaxis_title='Timestamp',
    yaxis_title='Number of Logs',
    height=500,
    hovermode='x unified'
)
fig.show()

## 4. Service Distribution

In [142]:
# Services generating most logs.
service_counts = df['service'].value_counts()
service_pct = (service_counts / len(df)) * 100

print("Service Distribution:")
for service, count in service_counts.items():
    pct = service_pct[service]
    print(f"  {service:20s}: {count:7,} ({pct:5.2f}%)")

Service Distribution:
  api-gateway         :  19,070 (14.45%)
  inventory-service   :  18,966 (14.38%)
  auth-service        :  18,948 (14.36%)
  payment-service     :  18,853 (14.29%)
  order-service       :  18,770 (14.23%)
  user-service        :  18,723 (14.19%)
  notification-service:  18,605 (14.10%)


In [143]:
# Create subplots with horizontal bar and pie charts.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Logs per Service', 'Service Proportion'),
    specs=[[{'type': 'bar'}, {'type': 'pie'}]]
)

# Horizontal bar chart.
fig.add_trace(
    go.Bar(
        y=service_counts.index,
        x=service_counts.values,
        orientation='h',
        marker_color='teal',
        text=service_counts.values,
        textposition='auto'
    ),
    row=1, col=1
)

# Pie chart.
fig.add_trace(
    go.Pie(
        labels=service_counts.index,
        values=service_counts.values,
        textinfo='label+percent'
    ),
    row=1, col=2
)

fig.update_xaxes(title_text='Number of Logs', row=1, col=1)
fig.update_yaxes(title_text='Service Name', row=1, col=1)
fig.update_layout(height=500, showlegend=False)
fig.show()

In [144]:
# Overall summary.
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Total logs: {len(df):,}")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Number of services: {df['service'].nunique()}")
print(f"Severity levels: {', '.join(df['level'].unique())}")
print(f"Missing values: {df.isnull().sum().sum()}")



SUMMARY
Total logs: 131,935
Date range: 2025-12-17 23:44:21.030517+00:00 to 2025-12-26 23:44:14.821532+00:00
Number of services: 7
Severity levels: DEBUG, WARN, INFO, ERROR, FATAL
Missing values: 131164
