# Survival Analysis Tutorial Part 2


The goal of this tutorial is to extract implicit failure information from a raw event log using [Ibis](ibis-project.org/) and [DuckDB](https://duckdb.org)

In [1]:
import ibis

ibis.options.interactive = True
ibis.__version__

'5.1.0'

In [2]:
import duckdb

duckdb.__version__

'0.7.1'

In [3]:
from urllib.request import urlretrieve
from pathlib import Path

data_filepath = Path("wowah_data_raw.parquet")
data_url = (
    "https://storage.googleapis.com/ibis-tutorial-data/wowah_data/"
    "wowah_data_raw.parquet"
)

if not data_filepath.exists():
    print(f"Downloading {data_url}...")
    urlretrieve(data_url, data_filepath)
else:
    print(f"Reusing downloaded {data_filepath}")

Reusing downloaded wowah_data_raw.parquet


In [4]:
conn = ibis.duckdb.connect()  # in-memory DuckDB
transactions = conn.read_parquet(data_filepath)
transactions

In [5]:
transactions.count().execute() / 1e6

10.826734

In [6]:
from ibis import deferred as c


entity_window = ibis.cumulative_window(
    group_by=c.char, order_by=c.timestamp
)
threshold = ibis.interval(minutes=30)
deadline_date = c.timestamp.lag().over(entity_window) + threshold

(
    transactions
    .select([c.char, c.timestamp])
    .mutate(deadline_date=deadline_date)
)

In [7]:
(
    transactions
    .select([c.char, c.timestamp])
    .mutate(
        is_new_session=(c.timestamp > deadline_date).fillna(False)
    )
)

In [8]:
(
    transactions
    .select([c.char, c.timestamp])
    .mutate(
        is_new_session=(c.timestamp > deadline_date).fillna(False)
    )
    .mutate(session_id=c.is_new_session.sum().over(entity_window))
)

In [9]:
entity_window = ibis.cumulative_window(
    group_by=c.char, order_by=c.timestamp
)
threshold = ibis.interval(minutes=30)
deadline_date = c.timestamp.lag().over(entity_window) + threshold
is_new_session = (c.timestamp > deadline_date).fillna(False)

sessionized = (
    transactions
    .mutate(is_new_session=is_new_session)
    .mutate(session_id=c.is_new_session.sum().over(entity_window))
    .drop("is_new_session")
)
sessions = (
    sessionized
    .group_by([c.char, c.session_id])
    .aggregate(
        session_start_date=c.timestamp.min(),
        session_end_date=c.timestamp.max(),
    )
    .order_by([c.char, c.session_start_date])
)
sessions.count().execute() / 1e6

1.142606

In [10]:
# ibis.show_sql(sessions)

In [11]:
def sessionize(table, threshold, entity_col, date_col):
    entity_window = ibis.cumulative_window(
        group_by=entity_col, order_by=date_col
    )
    deadline_date = date_col.lag().over(entity_window) + threshold
    is_new_session = (date_col > deadline_date).fillna(False)

    return (
        table
        .mutate(is_new_session=is_new_session)
        .mutate(session_id=c.is_new_session.sum().over(entity_window))
        .drop("is_new_session")
    )


def extract_sessions(table, entity_col, date_col, session_col):
    return (
        table
        .group_by([entity_col, session_col])
        .aggregate(
            session_start_date=date_col.min(),
            session_end_date=date_col.max(),
        )
        .order_by([entity_col, c.session_start_date])
    )


def preprocess_transactions(transactions):
    return (
        transactions
        .pipe(
            sessionize,
            threshold=ibis.interval(minutes=30),
            entity_col=c.char,
            date_col=c.timestamp,
        )
        .pipe(
            extract_sessions,
            entity_col=c.char,
            date_col=c.timestamp,
            session_col=c.session_id,
        )
    )

In [12]:
sessions = preprocess_transactions(transactions)
%time sessions.count().execute() / 1e6

CPU times: user 13.7 s, sys: 571 ms, total: 14.3 s
Wall time: 3.94 s


1.142606

In [13]:
%time sessions_df = sessions.to_pandas()
sessions_df

CPU times: user 14.8 s, sys: 575 ms, total: 15.4 s
Wall time: 5.88 s


Unnamed: 0,char,session_id,session_start_date,session_end_date
0,2,0,2008-12-03 10:41:47,2008-12-03 10:41:47
1,7,0,2008-01-15 21:47:09,2008-01-16 00:26:56
2,7,1,2008-01-16 21:57:02,2008-01-17 01:16:49
3,7,2,2008-01-17 18:47:07,2008-01-18 00:07:32
4,7,3,2008-01-18 23:17:13,2008-01-19 01:47:16
...,...,...,...,...
1142601,90576,0,2008-12-31 22:06:58,2008-12-31 23:07:13
1142602,90577,0,2008-12-31 22:17:35,2008-12-31 22:47:54
1142603,90578,0,2008-12-31 22:32:52,2008-12-31 22:32:52
1142604,90579,0,2008-12-31 22:44:45,2008-12-31 22:44:45


In [14]:
# ibis.show_sql(preprocess_transactions(transactions))

In [15]:
import polars as pl


pl.__version__

'0.17.11'

In [16]:
transactions_df = pl.read_parquet(data_filepath)
transactions_df.head(5)

char,level,race,charclass,zone,guild,timestamp
i32,i32,str,str,str,i32,datetime[μs]
59425,1,"""Orc""","""Rogue""","""Orgrimmar""",165,2008-01-01 00:02:04
65494,9,"""Orc""","""Hunter""","""Durotar""",-1,2008-01-01 00:02:04
65325,14,"""Orc""","""Warrior""","""Ghostlands""",-1,2008-01-01 00:02:04
65490,18,"""Orc""","""Hunter""","""Ghostlands""",-1,2008-01-01 00:02:04
2288,60,"""Orc""","""Hunter""","""Hellfire Penin…",-1,2008-01-01 00:02:09


In [17]:
transactions_lazy_df = pl.scan_parquet(data_filepath)
transactions_lazy_df.head(10)

In [18]:
transactions_lazy_df.head(10).collect()

char,level,race,charclass,zone,guild,timestamp
i32,i32,str,str,str,i32,datetime[μs]
59425,1,"""Orc""","""Rogue""","""Orgrimmar""",165,2008-01-01 00:02:04
65494,9,"""Orc""","""Hunter""","""Durotar""",-1,2008-01-01 00:02:04
65325,14,"""Orc""","""Warrior""","""Ghostlands""",-1,2008-01-01 00:02:04
65490,18,"""Orc""","""Hunter""","""Ghostlands""",-1,2008-01-01 00:02:04
2288,60,"""Orc""","""Hunter""","""Hellfire Penin…",-1,2008-01-01 00:02:09
2289,60,"""Orc""","""Hunter""","""Hellfire Penin…",-1,2008-01-01 00:02:09
61239,68,"""Orc""","""Hunter""","""Blade's Edge M…",243,2008-01-01 00:02:14
59772,69,"""Orc""","""Warrior""","""Shadowmoon Val…",35,2008-01-01 00:02:14
22937,69,"""Orc""","""Rogue""","""Warsong Gulch""",243,2008-01-01 00:02:14
23062,69,"""Orc""","""Shaman""","""Shattrath City…",103,2008-01-01 00:02:14


In [19]:
def sessionize_pl(df, entity_col, date_col, threshold):
    sessionized = (
        df.sort([entity_col, date_col])
        .with_columns(
            [
                (pl.col(date_col).diff().over(entity_col).dt.minutes() > threshold)
                .fill_null(False)
                .alias("is_new_session"),
            ]
        )
        .with_columns(
            [
                pl.col("is_new_session").cumsum().over(entity_col).alias("session_id"),
            ]
        )
        .drop(["is_new_session"])
    )
    return sessionized

def extract_sessions_pl(df, entity_col, date_col, session_col):
    sessions = (
        df.groupby([entity_col, session_col])
        .agg(
            [
                pl.col(date_col).min().alias("session_start_date"),
                pl.col(date_col).max().alias("session_end_date"),
            ]
        )
        .sort([entity_col, "session_start_date"])
    )
    return sessions


def preprocess_transactions_pl(df):
    return (
        df
        .pipe(
            sessionize_pl,
            entity_col="char",
            date_col="timestamp",
            threshold=30,
        )
        .pipe(
            extract_sessions_pl,
            entity_col="char",
            date_col="timestamp",
            session_col="session_id",
        )
    )


%time sessions_collected = preprocess_transactions_pl(transactions_lazy_df).collect()
sessions_collected

CPU times: user 4.07 s, sys: 404 ms, total: 4.47 s
Wall time: 1.92 s


char,session_id,session_start_date,session_end_date
i32,u32,datetime[μs],datetime[μs]
2,0,2008-12-03 10:41:47,2008-12-03 10:41:47
7,0,2008-01-15 21:47:09,2008-01-16 00:26:56
7,1,2008-01-16 21:57:02,2008-01-17 01:16:49
7,2,2008-01-17 18:47:07,2008-01-18 00:07:32
7,3,2008-01-18 23:17:13,2008-01-19 01:47:16
7,4,2008-01-19 02:37:29,2008-01-19 02:47:13
7,5,2008-01-19 20:36:15,2008-01-19 23:46:28
7,6,2008-01-20 00:56:02,2008-01-20 04:36:29
7,7,2008-01-20 13:26:12,2008-01-20 15:55:57
7,8,2008-01-21 19:36:13,2008-01-22 00:46:11


In [20]:
sessions_collected.shape

(1123327, 4)