# Timeline Visualisation

## Definitions

1. **Crashes:** An event whose status is a timeout or whose end time is NA. In the dataset provided, these two situations occur together.
2. **Retries:** When an event (the same event_id) happens more than once. The repeated attempts are assumed to be allowed to overlap in time as well.
3. **Overlap:** When two or more different workers execute a task with the same event_id such that the time interval from start to end of one execution has a non-zero intersection with that of another such execution.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

def load_data(filepath):
    """Read the worker-log CSV and parse its two timestamp columns.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with ``start_time`` and ``end_time`` converted
        by ``pd.to_datetime``.
    """
    frame = pd.read_csv(filepath)
    for column in ('start_time', 'end_time'):
        # errors='coerce' yields NaT for values that cannot be parsed
        # instead of raising.
        frame[column] = pd.to_datetime(frame[column], errors='coerce')
    return frame

def mark_retries(df):
    """Add a boolean ``is_retry`` column to ``df`` in place.

    A row is flagged when its ``event_id`` appears on more than one row, so
    every attempt of a repeated event (the first one included) is marked.

    Args:
        df: DataFrame with an ``event_id`` column; modified in place.
    """
    # value_counts() skips missing ids by default, so they are never counted
    # as repeated.
    occurrences = df['event_id'].value_counts()
    repeated_ids = occurrences.index[occurrences > 1]
    df['is_retry'] = df['event_id'].isin(repeated_ids)

def mark_overlaps(df):
    """Add a boolean ``is_overlap`` column to ``df`` in place.

    Rows are ordered by ``event_id`` and ``start_time``; an ``event_id`` is
    considered overlapping when any of its rows starts strictly before the
    ``end_time`` of the row preceding it within the same ``event_id``.
    Every row carrying such an ``event_id`` is flagged.

    Args:
        df: DataFrame with ``event_id``, ``start_time`` and ``end_time``
            columns; modified in place.
    """
    ordered = df.sort_values(['event_id', 'start_time'])
    # End time of the previous attempt of the same event_id; missing for the
    # first attempt of each event, which makes the comparison below False.
    previous_end = ordered.groupby('event_id')['end_time'].shift()
    starts_too_early = (ordered['start_time'] < previous_end).to_numpy()
    overlapping_ids = ordered.loc[starts_too_early, 'event_id'].unique()
    df['is_overlap'] = df['event_id'].isin(overlapping_ids)

def mark_failures(df):
    """Add a boolean ``is_failure`` column to ``df`` in place.

    A row is a failure exactly when its ``status`` equals ``'timeout'``.

    Args:
        df: DataFrame with a ``status`` column; modified in place.
    """
    timed_out = df['status'].eq('timeout')
    df['is_failure'] = timed_out

def classify_events(df):
    """Derive the ``retry_category`` and ``overlap_category`` label columns.

    Reads the boolean ``is_retry`` and ``is_overlap`` columns and writes two
    string columns to ``df`` in place:

    * ``retry_category``: 'Retry and Overlap', 'Retry Only' or 'Not a Retry'.
    * ``overlap_category``: 'Overlap and Retry', 'Overlap Only' or
      'Not an Overlap'.

    Args:
        df: DataFrame already carrying ``is_retry`` and ``is_overlap``.
    """
    retried = df['is_retry']
    overlapping = df['is_overlap']
    both = retried & overlapping

    # The combined label takes precedence over the single-flag label.
    df['retry_category'] = np.where(
        both,
        'Retry and Overlap',
        np.where(retried, 'Retry Only', 'Not a Retry'),
    )
    df['overlap_category'] = np.where(
        both,
        'Overlap and Retry',
        np.where(overlapping, 'Overlap Only', 'Not an Overlap'),
    )

def plot_timeline(df):
    """Show a per-worker timeline of events, with failures marked as crosses.

    Rows without an ``end_time`` are left out of the bars. If no row has an
    ``end_time`` nothing is drawn at all. Rows whose ``is_failure`` flag is
    set are overlaid as red markers at their ``start_time``.

    Args:
        df: DataFrame with ``start_time``, ``end_time``, ``worker_id``,
            ``status``, ``event_id`` and ``is_failure`` columns.
    """
    completed = df.dropna(subset=['end_time'])
    if completed.empty:
        return

    status_colours = {'success': 'seagreen', 'error': 'orange'}
    fig = px.timeline(
        completed,
        x_start="start_time",
        x_end="end_time",
        y="worker_id",
        color="status",
        color_discrete_map=status_colours,
        hover_data=["event_id"],
        title="Worker Timeline",
    )

    # Failures are taken from the full frame, not only from the rows that
    # have an end time.
    failed = df.loc[df['is_failure']]
    if not failed.empty:
        cross_marker = {
            'symbol': 'x-thin',
            'size': 12,
            'color': 'red',
            'line': {'width': 3},
        }
        failure_trace = go.Scatter(
            x=failed['start_time'],
            y=failed['worker_id'],
            mode='markers',
            marker=cross_marker,
            name='Failure / Timeout',
            text=[f'Event: {eid}' for eid in failed['event_id']],
        )
        fig.add_trace(failure_trace)

    fig.update_yaxes(categoryorder="category descending")
    fig.show()

def plot_failures(df):
    """Show a scatter plot of the failed events, one marker per failure.

    Nothing is drawn when no row has ``is_failure`` set.

    Args:
        df: DataFrame with ``start_time``, ``worker_id``, ``status``,
            ``event_id`` and ``is_failure`` columns.
    """
    failed = df.loc[df['is_failure']]
    if failed.empty:
        return

    fig = px.scatter(
        failed,
        x="start_time",
        y="worker_id",
        color="worker_id",
        symbol="status",
        title="Timeline of Failures (Timeouts)",
        hover_data=['event_id', 'status'],
    )
    marker_style = {'size': 12, 'line': {'width': 2}}
    fig.update_traces(marker=marker_style)
    fig.update_yaxes(categoryorder="category descending")
    fig.show()


In [2]:
# Load the worker log and annotate it in place. classify_events runs last
# because it reads the is_retry / is_overlap columns added before it.
df = load_data('/home/anshium/internship/apt-assignment/assessment_02/03_worker_logs.csv')
for annotate in (mark_retries, mark_overlaps, mark_failures, classify_events):
    annotate(df)

print("Analysis completed.")
print("Hover over the graph and points to inspect details.")

# Render the overall timeline first, then the failure-only view.
for render in (plot_timeline, plot_failures):
    render(df)

Analysis completed.
Hover over the graph and points to inspect details.


In [3]:
import plotly.express as px

def plot_retries(df):
    """Show a scatter plot of retried events, coloured by ``retry_category``.

    Only rows flagged ``is_retry`` that also have an ``end_time`` are
    plotted; when there are none, a message is printed instead.

    Args:
        df: DataFrame with ``is_retry``, ``retry_category``, ``start_time``,
            ``end_time``, ``worker_id``, ``event_id`` and ``status`` columns.
    """
    has_end = df['end_time'].notna()
    retried = df.loc[df['is_retry'] & has_end]
    if retried.empty:
        print("No retried events with valid end times.")
        return

    category_colours = {
        'Retry Only': 'cornflowerblue',
        'Retry and Overlap': 'mediumpurple',
    }
    fig = px.scatter(
        retried,
        x="start_time",
        y="worker_id",
        color="retry_category",
        title="Retried Events",
        labels={"retry_category": "Event Type"},
        hover_data=["event_id", "status", "end_time"],
        color_discrete_map=category_colours,
    )
    fig.update_yaxes(categoryorder="category descending")
    fig.show()

def plot_overlaps(df):
    """Show a scatter plot of overlapping events, coloured by ``overlap_category``.

    Only rows flagged ``is_overlap`` that also have an ``end_time`` are
    plotted; when there are none, a message is printed instead.

    Args:
        df: DataFrame with ``is_overlap``, ``overlap_category``,
            ``start_time``, ``end_time``, ``worker_id``, ``event_id`` and
            ``status`` columns.
    """
    has_end = df['end_time'].notna()
    overlapping = df.loc[df['is_overlap'] & has_end]
    if overlapping.empty:
        print("No overlapping events with valid end times.")
        return

    category_colours = {
        'Overlap Only': 'lightseagreen',
        'Overlap and Retry': 'mediumpurple',
    }
    fig = px.scatter(
        overlapping,
        x="start_time",
        y="worker_id",
        color="overlap_category",
        title="Overlapping Events",
        labels={"overlap_category": "Event Type"},
        hover_data=["event_id", "status", "end_time"],
        color_discrete_map=category_colours,
    )
    fig.update_yaxes(categoryorder="category descending")
    fig.show()


In [4]:
# Render the retry view first, then the overlap view, from the annotated frame.
for render in (plot_retries, plot_overlaps):
    render(df)

#### We observe that, in this particular dataset of worker logs, every event whose event id occurs more than once also has overlapping executions.