In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
#Function to write submission to file

def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the order the predictions are given; that
    numbering is written as the index column.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Any array-like is accepted (NumPy array,
        list, pandas Series); it is converted with ``np.asarray`` so that
        values are always taken positionally.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str, default 'target'
        Name of the prediction column.
    index_label : str, default 'session_id'
        Header written for the 1-based index column.
    """
    # Normalise first: a plain list has no `.shape`, and a pandas Series would
    # be re-aligned against the new 1..n index (shifting values / adding NaN).
    values = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(values,
                                index=np.arange(1, values.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
# Load the train and test session tables, indexed by session_id.
train_df = pd.read_csv(
    r'data/train_sessions.csv',
    index_col='session_id',
)
test_df = pd.read_csv(
    r'data/test_sessions.csv',
    index_col='session_id',
)

In [4]:
# Names of the ten timestamp columns: time1 .. time10.
times = ['time{}'.format(i) for i in range(1, 11)]

# Turn the object-dtype timestamp columns into real datetimes in both frames.
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Order the training sessions chronologically by their first timestamp.
train_df = train_df.sort_values(by='time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
# Write the ten site-id columns of every session as one comma-separated line
# (no header, no index) so the files can be fed line by line to a text
# vectorizer. Missing sites are filled with 0 and ids are cast to int so they
# are written as "55" rather than "55.0".
sites = ['site%s' % i for i in range(1, 11)]
# `index` / `header` are documented as booleans; pass False explicitly instead
# of relying on None happening to be falsy.
train_df[sites].fillna(0).astype('int').to_csv('data/train_sessions_text.csv', sep=',', index=False, header=False)
test_df[sites].fillna(0).astype('int').to_csv('data/test_sessions_text.csv', sep=',', index=False, header=False)

In [6]:
# Derive weekday1..weekday10 from time1..time10 in both frames.
# Missing timestamps (NaT) get the sentinel value -1.
for i in range(1, 11):
    train_df['weekday{}'.format(i)] = (
        train_df['time{}'.format(i)].dt.weekday.fillna(-1).astype('int')
    )
    test_df['weekday{}'.format(i)] = (
        test_df['time{}'.format(i)].dt.weekday.fillna(-1).astype('int')
    )

In [13]:
# Names of the ten derived weekday columns: weekday1 .. weekday10.
weekdays = ['weekday{}'.format(i) for i in range(1, 11)]

In [7]:
%%time
# Bag-of-words over the site ids of each session: every line of the text file
# is one "document", and unigrams plus bigrams of site ids are counted, keeping
# at most the 50000 most frequent terms.
# `ngram_range` must be a (min_n, max_n) tuple; `range(1, 3)` only worked by
# accident of unpacking and is rejected by parameter validation in current
# scikit-learn releases. (1, 2) selects the same unigrams + bigrams.
# NOTE(review): the default token pattern keeps only tokens of 2+ characters,
# so single-digit site ids (and the 0 padding) are dropped - confirm intended.
cv = CountVectorizer(ngram_range=(1, 2), max_features=50000)

# The vocabulary is learned on the training sessions only and reused for test.
with open('data/train_sessions_text.csv') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('data/test_sessions_text.csv') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

Wall time: 6.29 s


In [8]:
# Target labels as ints, taken from the time-sorted frame so they line up
# row-for-row with the features built from it.
y_train = train_df.loc[:, 'target'].astype('int')

In [16]:
# One-hot encode the weekday columns; the encoder is fitted on the training
# sessions and then applied unchanged to the test sessions.
enc = OneHotEncoder(categories='auto')
train_weekdays = enc.fit_transform(train_df[weekdays])
test_weekdays = enc.transform(test_df[weekdays])

In [17]:
# Append the one-hot weekday columns to the right of the bag-of-words matrix
# and keep the result in CSR format.
# Note: this rebinds X_train / X_test, so re-running the cell without
# re-running the vectorizer cell appends the weekday columns a second time.
X_train = hstack([X_train, train_weekdays]).tocsr()
X_test = hstack([X_test, test_weekdays]).tocsr()

In [18]:
# Sanity check: both matrices must have the same number of columns.
(X_train.shape, X_test.shape)

((253561, 50079), (82797, 50079))

In [20]:
# Time-ordered cross-validation with 10 splits; the rows are already sorted by
# time, so each validation chunk comes after the rows it is trained on.
time_split = TimeSeriesSplit(
    n_splits=10,
)

In [21]:
# Shapes of the (train indices, validation indices) arrays for every split.
[(train_idx.shape, valid_idx.shape) for train_idx, valid_idx in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [22]:
# Baseline classifier. The solver is pinned explicitly: when this notebook was
# run the default was left implicit (the estimator repr shows solver='warn'),
# which resolved to liblinear. Newer scikit-learn releases use a different
# default, so naming it keeps the reported scores reproducible.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [23]:
%%time

# ROC AUC of the baseline model on every time-series split, computed in
# parallel on all available cores.
cv_scores = cross_val_score(
    estimator=logit,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=time_split,
    n_jobs=-1,
)

Wall time: 40.3 s


In [24]:
# Per-split scores followed by their mean.
(cv_scores, cv_scores.mean())

(array([0.76923681, 0.69851522, 0.78019041, 0.97273132, 0.89756911,
        0.93343362, 0.94472595, 0.91673982, 0.75965015, 0.93655315]),
 0.8609345573868346)

In [25]:
# Refit the baseline model on the full training set before predicting on test.
logit.fit(X=X_train, y=y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
# Keep the second column of predict_proba (probability of the second class,
# presumably target == 1) and write it out as the submission file.
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(
    predicted_labels=logit_test_pred,
    out_file='data/subm2_29_1.csv',
)

In [27]:
# Upload the baseline submission through the Kaggle CLI; the message records
# the feature set and the mean cross-validation ROC AUC obtained above.
!kaggle competitions submit catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2 -f data/subm2_29_1.csv -m "Time series + BoW + Additional feature - weekdays. CV = 0.8609 roc_auc"

Successfully submitted to Catch Me If You Can



  0%|          | 0.00/2.28M [00:00<?, ?B/s]
  0%|          | 8.00k/2.28M [00:00<02:42, 14.6kB/s]
  4%|3         | 88.0k/2.28M [00:00<01:51, 20.7kB/s]
 14%|#4        | 328k/2.28M [00:00<01:10, 29.3kB/s] 
 30%|###       | 712k/2.28M [00:01<00:39, 41.7kB/s]
 63%|######3   | 1.45M/2.28M [00:01<00:14, 59.5kB/s]
100%|##########| 2.28M/2.28M [00:06<00:00, 367kB/s] 


Tuning hyperparameter C

In [28]:
# Candidate regularisation strengths: 10 values log-spaced from 1e-2 to 1e2.
c_values = np.logspace(start=-2, stop=2, num=10)

# Grid search over C using the same time-series splits and ROC AUC metric as
# the baseline cross-validation.
logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid={'C': c_values},
    scoring='roc_auc',
    cv=time_split,
    n_jobs=-1,
    verbose=1,
)

In [30]:
%%time

# Run the search: one fit per (C candidate, split) pair.
logit_grid_searcher.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 10.8min finished


Wall time: 11min 1s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [31]:
# Best mean cross-validation ROC AUC and the C value that achieved it.
(logit_grid_searcher.best_score_, logit_grid_searcher.best_params_)

(0.8623665907324451, {'C': 0.5994842503189409})

In [32]:
# Predict with the grid searcher (which delegates to its best estimator) and
# write the second predict_proba column as the tuned-model submission.
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test)[:, 1]
write_to_submission_file(
    predicted_labels=logit_test_pred3,
    out_file='data/subm3_29_1.csv',
)

In [None]:
!kaggle competitions submit catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2 -f data/subm3_29_1.csv -m ""