# Access Ticket Prediction

This notebook walks you through a **minimal viable pipeline** to:
1. Load historical access‑request tickets.
2. Engineer features that need *no extra systems*.
3. Train two models:
   * **WHO** will open a ticket in the next 7 days (classification / ranking).
   * **WHEN** that user will open the ticket (survival analysis).
4. Evaluate and inspect outputs.

---

## 0  Install / import dependencies

In [3]:
# Uncomment if running on a fresh environment
#!pip install pandas numpy scikit-learn xgboost lifelines matplotlib seaborn --quiet
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from pathlib import Path
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score
from xgboost import XGBClassifier
from lifelines import CoxPHFitter


## 1  Load data

In [4]:
# Point to your CSV (sample shipped with this repo / environment)
data_path = Path('sample_access_tickets.csv')
df_raw = pd.read_csv(data_path, parse_dates=['open_datetime'])
print(df_raw.shape)
df_raw.head()


(500, 7)


  df_raw = pd.read_csv(data_path, parse_dates=['open_datetime'])


Unnamed: 0,requester_id,open_datetime,ticket_type,application_name,errors_401_last24h,errors_403_last24h,user_active
0,U004,2024-10-30 11:10:00,access_request,App_C,0,1,1
1,U037,2025-02-26 21:10:00,access_request,App_B,0,0,1
2,U003,2025-05-26 03:44:00,access_request,App_C,0,0,1
3,U011,2024-08-26 08:43:00,elevated_privilege,App_A,2,2,1
4,U042,2024-10-30 16:09:00,access_request,App_E,3,0,1


## 2  Basic EDA

In [5]:
print(df_raw['ticket_type'].value_counts(dropna=False))
print(df_raw['requester_id'].nunique(), 'unique users')
df_raw['open_datetime'].min(), df_raw['open_datetime'].max()


ticket_type
elevated_privilege    256
access_request        244
Name: count, dtype: int64
50 unique users


(Timestamp('2024-07-02 08:14:00'), Timestamp('2025-07-21 18:15:00'))

## 3  Feature engineering

In [6]:
df = df_raw.copy()
# Basic temporal features
df['dow'] = df['open_datetime'].dt.dayofweek  # 0=Mon
df['hour'] = df['open_datetime'].dt.hour

# Sort for per‑user calculations
df = df.sort_values(['requester_id', 'open_datetime']).reset_index(drop=True)

# Days since last ticket per user
df['prev_open'] = df.groupby('requester_id')['open_datetime'].shift(1)
df['days_since_prev'] = (df['open_datetime'] - df['prev_open']).dt.total_seconds() / 86400
df['days_since_prev'].fillna(df['days_since_prev'].median(), inplace=True)

# Rolling ticket counts (past 7 and 30 days)
for window in [7, 30]:
    col = f'cnt_{window}d'
    df[col] = (
        df.groupby('requester_id')['open_datetime']
          .transform(lambda s: s.rolling(f'{window}D').count())
    )
    df[col].fillna(0, inplace=True)

feature_cols = ['dow', 'hour', 'days_since_prev', 'cnt_7d', 'cnt_30d',
                'errors_401_last24h', 'errors_403_last24h', 'user_active']
df[feature_cols].head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['days_since_prev'].fillna(df['days_since_prev'].median(), inplace=True)


ValueError: window must be an integer 0 or greater

### 3.1  Labels for *WHO* model

In [None]:
H = 7  # prediction horizon in days
df['next_open'] = df.groupby('requester_id')['open_datetime'].shift(-1)
df['days_to_next'] = (df['next_open'] - df['open_datetime']).dt.total_seconds() / 86400
df['y_who'] = (df['days_to_next'] <= H).astype(int).fillna(0)
print(df['y_who'].value_counts(normalize=True))


## 4  Train / test split (time‑based)

In [None]:
cutoff_date = df['open_datetime'].quantile(0.8)
train = df[df['open_datetime'] <= cutoff_date]
test  = df[df['open_datetime']  > cutoff_date]
X_train, y_train = train[feature_cols], train['y_who']
X_test,  y_test  = test[feature_cols],  test['y_who']
print(train.shape, test.shape)


## 5  Train *WHO* model (XGBoost)

In [None]:
model_who = XGBClassifier(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='aucpr',
    random_state=42
)
model_who.fit(X_train, y_train)
pred_probs = model_who.predict_proba(X_test)[:,1]
ap = average_precision_score(y_test, pred_probs)
roc = roc_auc_score(y_test, pred_probs)
print(f'Average Precision (PR AUC): {ap:.3f}\nROC AUC: {roc:.3f}')


In [None]:
prec, recall, thr = precision_recall_curve(y_test, pred_probs)
plt.figure(figsize=(4,3))
plt.step(recall, prec, where='post')
plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('WHO model PR curve')
plt.show()


### 5.1  Top‑10 user predictions for next horizon

In [None]:
# Take most recent record per user in test set
latest_test = test.sort_values('open_datetime').groupby('requester_id').tail(1)
scores = model_who.predict_proba(latest_test[feature_cols])[:,1]
top10 = (
    latest_test.assign(score=scores)
    .sort_values('score', ascending=False)
    .head(10)[['requester_id', 'open_datetime', 'score']]
)
top10


## 6  Train *WHEN* model (Cox Proportional Hazards)

In [None]:
# Prepare survival dataset per user
surv = (
    df.groupby('requester_id')
      .apply(lambda g: pd.Series({
          'duration': (g['open_datetime'].max() - g['open_datetime'].min()).days + 0.1,
          'event': 1  # at least one ticket observed
      }))
      .reset_index()
)

# Aggregate user‑level features (mean)
user_feats = df.groupby('requester_id')[feature_cols].mean().reset_index()
surv_df = pd.merge(surv, user_feats, on='requester_id')

# Train/test split
train_u, test_u = train_test_split(surv_df, test_size=0.2, random_state=42)

cph = CoxPHFitter()
cph.fit(train_u.drop(columns=['requester_id']), duration_col='duration', event_col='event')
cph.print_summary()

# Concordance on test set
from lifelines.utils import concordance_index
pred_surv = -cph.predict_partial_hazard(test_u)
cindex = concordance_index(test_u['duration'], pred_surv, test_u['event'])
print(f'C‑index on held‑out users: {cindex:.3f}')
