# Catch Me If You Can ("Alice") 

https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel

%matplotlib inline

In [2]:
time_cols = ['time' + str(i) for i in range(1, 11)]
df_train = pd.read_csv('datasets/alice/train_sessions.csv', index_col='session_id', parse_dates=time_cols)
df_test = pd.read_csv('datasets/alice/test_sessions.csv', index_col='session_id', parse_dates=time_cols)

df_train.sort_values(by='time1', inplace=True)

In [3]:
df_train.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21669 to 204762
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   site1   253561 non-null  int64         
 1   time1   253561 non-null  datetime64[ns]
 2   site2   250098 non-null  float64       
 3   time2   250098 non-null  datetime64[ns]
 4   site3   246919 non-null  float64       
 5   time3   246919 non-null  datetime64[ns]
 6   site4   244321 non-null  float64       
 7   time4   244321 non-null  datetime64[ns]
 8   site5   241829 non-null  float64       
 9   time5   241829 non-null  datetime64[ns]
 10  site6   239495 non-null  float64       
 11  time6   239495 non-null  datetime64[ns]
 12  site7   237297 non-null  float64       
 13  time7   237297 non-null  datetime64[ns]
 14  site8   235224 non-null  float64       
 15  time8   235224 non-null  datetime64[ns]
 16  site9   233084 non-null  float64       
 17  time9   233084 non-null  

In [5]:
df_train.isna().sum()

site1         0
time1         0
site2      3463
time2      3463
site3      6642
time3      6642
site4      9240
time4      9240
site5     11732
time5     11732
site6     14066
time6     14066
site7     16264
time7     16264
site8     18337
time8     18337
site9     20477
time9     20477
site10    22509
time10    22509
target        0
dtype: int64

In [6]:
df_train.shape, df_test.shape

((253561, 21), (82797, 20))

In this particular problem, the missing values on columns site and time, e.g, site7 and time7, it means the user didn't visit a 7th website. In other words, only visited 6 websites.

We'll fill missing values with a value of 0. For the moment, we'll work only with site features, then we apply feature engineering-selection to get better results.

In [7]:
columns = ['site' + str(i) for i in range(1, 11)] # ['site1', 'site2', 'site3', 'site4'...]

imputer = SimpleImputer(strategy='constant', fill_value=0)
train_imputed = imputer.fit_transform(df_train[columns])
test_imputed = imputer.transform(df_test[columns])

In [8]:
train_imputed.shape, test_imputed.shape

((253561, 10), (82797, 10))

In [9]:
train_imputed[:5]

array([[ 56.,  55.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
       [ 56.,  55.,  56.,  55.,   0.,   0.,   0.,   0.,   0.,   0.],
       [946., 946., 951., 946., 946., 945., 948., 784., 949., 946.],
       [945., 948., 949., 948., 945., 946., 947., 945., 946., 946.],
       [947., 950., 948., 947., 950., 952., 946., 951., 946., 947.]])

In [10]:
train_imputed = train_imputed.astype('int')
test_imputed = test_imputed.astype('int')

We're going to use `CountVectorizer` to make a vocabulary of sites and count their apperance in each sample.

1) We might save both datasets train and test imputed in documents for CountVectorizer processing.

2) We might convert both datasets into str and fit CountVectorizer with them.

In [11]:
# 1)
# np.savetxt('datasets/alice/train_sessions_test.txt', train_imputed, delimiter=' ')
# np.savetxt('datasets/alice/test_sessions_test.txt', test_imputed, delimiter=' ')

In [12]:
# 2)
train_imputed = [" ".join(row.astype('str')) for row in train_imputed]
test_imputed = [" ".join(row.astype('str')) for row in test_imputed]

In [13]:
%%time
counter = CountVectorizer(ngram_range=(1, 3), max_features=50000)
X_train = counter.fit_transform(train_imputed)
X_test = counter.transform(test_imputed)

CPU times: user 13.6 s, sys: 232 ms, total: 13.8 s
Wall time: 13.8 s


In [14]:
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [15]:
y_train = df_train['target'].astype('int').ravel()
y_train.shape

(253561,)

Now, we can start building a machine learning model, however we can just do cross-validation as usual (`KFold` or another class) but as we're working with time series (even if we haven't added time features yet, remember that we ordered `df_train` by `time1`) we need to perform splitting in other way. Sklearn give us a class called `TimeSeriesSplit`.

In [16]:
%%time
time_split = TimeSeriesSplit(n_splits=10)
log_reg = LogisticRegression(solver='liblinear', random_state=42)
cv_auc_log = cross_val_score(log_reg, X_train, y_train, cv=time_split, n_jobs=-1, scoring='roc_auc')

CPU times: user 60.9 ms, sys: 112 ms, total: 173 ms
Wall time: 30.4 s


In [17]:
cv_auc_log.mean(), cv_auc_log.std()

(0.8677191954657827, 0.08472558562304884)

In [18]:
%%time
log_reg.fit(X_train, y_train)

CPU times: user 48.8 s, sys: 3.54 s, total: 52.4 s
Wall time: 23 s


LogisticRegression(random_state=42, solver='liblinear')

In [19]:
y_pred_prob_log = log_reg.predict_proba(X_test)[:, 1] # [n_samples, n_classes]

In [20]:
def submission_csv(pred, path_csv, id_col='session_id', target_col='target'):
    submit_df = pd.DataFrame(pred, columns=[target_col], index=np.arange(1, pred.shape[0] + 1))
    submit_df.rename_axis(id_col, inplace=True)
    submit_df.to_csv(path_csv)

In [21]:
submission_csv(y_pred_prob_log, 'datasets/alice/submit1.csv')

base log reg model

cv (time series split) roc auc -> mean = 0.867 ; std = 0.084

Public Leaderboard = 0.91288

Let's add time features but only the hour, if is in morning, day, afternoon and night. Why only this data? Because maybe there's no need to know the month or year when a user visited a website.

In [22]:
def extract_time_features(X_, df_):
    # time_cols is defined at the beginning of the notebook, just before import csv train/test
    dict_time = {}
        
    for col in time_cols:
        dict_time[col+'_hour'] = [time.hour for time in df_[col]]
        dict_time[col+'_is_morning'] = [1 if (hour >= 5 and hour < 12) else 0 for hour in dict_time[col+'_hour']]
        dict_time[col+'_is_afternoon']= [1 if (hour >= 12 and hour < 17) else 0 for hour in dict_time[col+'_hour']]
        dict_time[col+'_is_evening'] = [1 if (hour >= 17 and hour < 21) else 0 for hour in dict_time[col+'_hour']]
        dict_time[col+'_is_night'] = [1 if (hour >= 21 and hour < 5) else 0 for hour in dict_time[col+'_hour']]
    
    df_time = pd.DataFrame(dict_time).fillna(0)
    return hstack([X_, df_time])

In [23]:
%%time
X_train_2 = extract_time_features(X_train, df_train)
X_test_2 = extract_time_features(X_test, df_test)

CPU times: user 13.5 s, sys: 492 ms, total: 14 s
Wall time: 14.1 s


In [24]:
%%time
log_reg_2 = LogisticRegression(solver='sag', random_state=42)
cv_auc_log_2 = cross_val_score(log_reg_2, X_train_2, y_train, cv=time_split, n_jobs=-1, scoring='roc_auc')
cv_auc_log_2.mean(), cv_auc_log_2.std()

CPU times: user 128 ms, sys: 64 ms, total: 192 ms
Wall time: 53.3 s


(0.9152897189426366, 0.054716865679914614)

In [25]:
print("Before:",cv_auc_log.mean(), cv_auc_log.std())
print("Now:",cv_auc_log_2.mean(), cv_auc_log_2.std())

Before: 0.8677191954657827 0.08472558562304884
Now: 0.9152897189426366 0.054716865679914614


Pog!

In [26]:
%%time
log_reg_2.fit(X_train_2, y_train)

CPU times: user 23.2 s, sys: 21.5 ms, total: 23.2 s
Wall time: 23.2 s




LogisticRegression(random_state=42, solver='sag')

In [27]:
y_pred_prob_log_2 = log_reg_2.predict_proba(X_test_2)[:, 1]

submission_csv(y_pred_prob_log_2, 'datasets/alice/submit2.1.csv')

Public Leaderboard = 0.93124

Well, it's time to feature selection and then hyperparameter tuning.

In [28]:
def show_feature_importances(coefs, columns, abs_=False, normalize=False, ascending_=False):
    #flatten() returns a copy, so there's no need for set the parameter copy=True 
    df = pd.DataFrame(data=coefs.flatten(), columns=['importance'], index=columns) 
    if(abs_):
        df['importance'] = np.abs(df['importance'])
    if(normalize):
        df['importance'] = (df['importance']-min(df['importance']))/(max(df['importance'])-min(df['importance']))
    
    df.sort_values(by='importance', ascending=ascending_, inplace=True)
    return df

In [29]:
show_feature_importances(log_reg.coef_, counter.vocabulary_)[:3000]

Unnamed: 0,importance
29 1600,5.511434
678 775 678,3.446028
3846 3847 617,3.313141
34632 11106 11106,3.153049
30111,2.925741
...,...
76 1057 76,0.178418
39 22 570,0.177864
145 52,0.177834
186 1939,0.177215


In [30]:
selectk = SelectKBest(chi2, k=40000)
X_train_3 = selectk.fit_transform(X_train_2, y_train)
X_test_3 = selectk.transform(X_test_2)

In [31]:
%%time
log_reg_3 = LogisticRegression(solver='sag', random_state=42)
cv_auc_log_3 = cross_val_score(log_reg_3, X_train_3, y_train, cv=time_split, n_jobs=-1, scoring='roc_auc')
cv_auc_log_3.mean(), cv_auc_log_3.std()

CPU times: user 55.7 ms, sys: 51.9 ms, total: 108 ms
Wall time: 44.6 s


(0.9158598630917517, 0.05455492654126366)

In [32]:
%%time
log_reg_3.fit(X_train_3, y_train)

CPU times: user 21.1 s, sys: 19.8 ms, total: 21.1 s
Wall time: 21.3 s




LogisticRegression(random_state=42, solver='sag')

In [33]:
y_pred_prob_log_3 = log_reg_3.predict_proba(X_test_3)[:, 1]
submission_csv(y_pred_prob_log_3, 'datasets/alice/submit3.1.csv')

Hyperparameter tuning

In [146]:
c_values = np.logspace(-2, 2, 10)

log_reg_grid = GridSearchCV(log_reg_3, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=-1,
                            cv=time_split, verbose=1)

In [147]:
%%time
log_reg_grid.fit(X_train_3, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
CPU times: user 1min 2s, sys: 3.79 s, total: 1min 5s
Wall time: 9min 3s


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=LogisticRegression(random_state=42, solver='liblinear'),
             n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             scoring='roc_auc', verbose=1)

In [148]:
log_reg_grid.best_score_, log_reg_grid.best_params_

(0.9208102253865565, {'C': 0.21544346900318834})

solver='liblinear' very pog :0

In [31]:
tuned_log_reg_3 = LogisticRegression(C=0.21544346900318834 , solver='liblinear', random_state=42)
tuned_log_reg_3.fit(X_train_3, y_train)
tuned_y_pred_prob_log_3 = tuned_log_reg_3.predict_proba(X_test_3)[:, 1]
submission_csv(tuned_y_pred_prob_log_3, 'datasets/alice/submit3_tuned.csv')

**Maybe this is cheating but...I watched some public kernels about "catch me if you can" :'v. I know more or less what I need to do, but, I'm going to do it in my way.**

PD: I don't now why it doesn't work :'d

Changes:
* I'll try to write code cleaner
* New features from time (session time start-end, lengh session, day of week, number of visited sites)
* Remove some created features (time2 to time10, morning, afternoon, etc.)

Let's try to organize better all code above.

**Functions**

In [7]:
def extract_features_from_time(df):
    new_features = {}
    new_features['start_session'] = df['time1'] # np.min(df.loc[:, time_cols], axis=1)
    new_features['num_sites'] = np.sum(df.loc[:, site_cols].notna(), axis=1)
    new_features['day_of_week'] = [date.day_of_week for date in new_features['start_session']]
    new_features['day_in_month'] = [date.days_in_month for date in new_features['start_session']]
    new_features['day_of_year'] = [date.day_of_year for date in new_features['start_session']]
    new_features['hour_session'] = [date.hour for date in new_features['start_session']]
    
#     new_features['is_morning'] = [1 if (hour >= 5 and hour < 12) else 0 for hour in new_features['hour_session']]
#     new_features['is_afternoon']=[1 if (hour >= 12 and hour < 17) else 0 for hour in new_features['hour_session']]
#     new_features['is_evening'] = [1 if (hour >= 17 and hour < 21) else 0 for hour in new_features['hour_session']]
#     new_features['is_night'] = [1 if (hour >= 21 and hour < 5) else 0 for hour in new_features['hour_session']]
    
    df_features = pd.DataFrame(new_features)
    df_features.sort_values(by='start_session', inplace=True)
    df_features.drop(['start_session'], axis=1, inplace=True)   
    return df_features

def roc_auc_cv_log_reg(X_train, y_train, cv):
    """
    # default values
    
    LogisticRegression model -> solver='liblinear', random_state=42 
    cross_val_score -> n_jobs=-1, scoring='roc_auc' 
    
    Returns the mean and std of cross-validation scores.
    """
    log_reg = LogisticRegression(solver='liblinear', random_state=42)
    y_pred_cv = cross_val_score(log_reg, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
    return y_pred_cv.mean(), y_pred_cv.std()

def train_predict_proba_log_reg(X_train, y_train, X_test):
    """
    # default values
    
    LogisticRegression model -> solver='liblinear', random_state=42 
    .predict_proba(X_test)[:, 1] # all rows in the second column
    
    Returns the probabilities of each sample belongs to a specific class
    """
    log_reg = LogisticRegression(solver='liblinear', random_state=42)
    log_reg.fit(X_train, y_train)
    y_predict_proba = log_reg.predict_proba(X_test)[:, 1]
    return y_predict_proba

def submission_csv(pred, path_csv, id_col='session_id', target_col='target'):
    submit_df = pd.DataFrame(pred, columns=[target_col], index=np.arange(1, pred.shape[0] + 1))
    submit_df.rename_axis(id_col, inplace=True)
    submit_df.to_csv(path_csv)

**Preparing the data**

In [2]:
DATASET_PATH = 'datasets/alice/'

site_cols = ['site' + str(i) for i in range(1, 11)] # ['site1', 'site2', 'site3', 'site4'...]
time_cols = ['time' + str(i) for i in range(1, 11)]

df_train = pd.read_csv(DATASET_PATH+'train_sessions.csv', index_col='session_id', parse_dates=time_cols)
df_test = pd.read_csv(DATASET_PATH+'test_sessions.csv', index_col='session_id', parse_dates=time_cols)

df_train.sort_values(by='time1', inplace=True)

# Fill missing values
imputer = SimpleImputer(strategy='constant', fill_value=0)
train_imputed = imputer.fit_transform(df_train[site_cols]).astype('int')
test_imputed = imputer.transform(df_test[site_cols]).astype('int')

train_imputed = [" ".join(row.astype('str')) for row in train_imputed]
test_imputed = [" ".join(row.astype('str')) for row in test_imputed]

# Get vocabulary of sites
counter = CountVectorizer(ngram_range=(1, 3), max_features=50000)
X_train = counter.fit_transform(train_imputed)
X_test = counter.transform(test_imputed)

y_train = df_train['target'].astype('int').ravel()

In [8]:
# # Scaling new features - doesn't work very much that not using it
# scaler = StandardScaler()
# X_train_features = scaler.fit_transform(extract_features_from_time(df_train))
# X_test_features = scaler.transform(extract_features_from_time(df_test))

X_train_features = extract_features_from_time(df_train)
X_test_features = extract_features_from_time(df_test)

# Adding new features
X_train_4 = hstack([X_train, X_train_features])
X_test_4 = hstack([X_test, X_test_features])

# Split for correct cross-validation
time_split = TimeSeriesSplit(n_splits=10)

**Building models**

In [9]:
# sin is_
cv_auc_log_4_mean, cv_auc_log_4_std = roc_auc_cv_log_reg(X_train_4, y_train, time_split)
cv_auc_log_4_mean, cv_auc_log_4_std

(0.912838504257361, 0.07848928412473063)

#### Feature selection

In [54]:
# selectk = SelectKBest(chi2, k=40000)
# X_train_5k = selectk.fit_transform(X_train_4, y_train)
# X_test_5k = selectk.transform(X_test_4)

# cv_auc_log_5k_mean, cv_auc_log_5k_std = roc_auc_cv_log_reg(X_train_5k, y_train, time_split)
# cv_auc_log_5k_mean, cv_auc_log_5k_std

(0.9173077343918777, 0.07008873248479301)

In [10]:
log_reg_4 = LogisticRegression(solver='liblinear', random_state=42)
log_reg_4.fit(X_train_4, y_train)

y_pred_prob_log_4 = log_reg_4.predict_proba(X_test_4)[:, 1]

submission_csv(y_pred_prob_log_4, 'datasets/alice/submit4.csv')