In [131]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder

In [2]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..N in the order the predictions are given. The index
    column is named ``index_label`` and the single value column ``target``.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. NumPy arrays, lists and pandas Series are
        all accepted; the input is converted with ``np.asarray`` first.
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the index column.
    """
    # Normalise first: a plain list has no .shape, and a Series would be
    # re-aligned against the new 1..N index / column name (yielding NaNs)
    # instead of being written positionally.
    predicted_labels = np.asarray(predicted_labels)
    row_ids = np.arange(1, predicted_labels.shape[0] + 1)
    predicted_df = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [68]:
# Load the train and test session tables, both indexed by session_id.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='session_id')
    for csv_path in (r'data/train_sessions.csv', r'data/test_sessions.csv')
)

1. Only count vectorizer with ngram = 1
2. Tf-idf
3. Model only using the timestamp values.

In [69]:
# Order the training sessions by their first timestamp.
# NOTE(review): time1 is still raw text at this point; this assumes its
# string order matches chronological order (ISO-style timestamps) - confirm.
train_df = train_df.sort_values(by='time1')

In [21]:
# Column names of the ten site slots: site1 ... site10.
sites = ['site%s' % slot for slot in range(1, 11)]

In [70]:
# Write each session as one line of space-separated integer site ids
# (missing slots become 0) so it can be fed to a text vectorizer.
for sessions, text_path in ((train_df, 'data/train_sessions_text.csv'),
                            (test_df, 'data/test_sessions_text.csv')):
    site_ids = sessions[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', header=False, index=False)

In [35]:
# Bag-of-sites counts. The open files are passed directly: each line
# (one session) is treated as one document.
# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or
# more characters, so single-digit ids (including the 0 filler) are not counted.
cv = CountVectorizer()

with open('data/train_sessions_text.csv') as train_lines:
    X_train = cv.fit_transform(train_lines)

with open('data/test_sessions_text.csv') as test_lines:
    # Reuse the vocabulary learned on the training sessions.
    X_test = cv.transform(test_lines)

X_train.shape, X_test.shape

((253561, 41592), (82797, 41592))

In [24]:
# Labels, taken after the sort so they are in the same row order as train_df.
y_train = train_df.loc[:, 'target']

In [25]:
# Forward-chaining CV: each fold trains on earlier rows and validates on later ones.
N_CV_SPLITS = 10
time_split = TimeSeriesSplit(n_splits=N_CV_SPLITS)

In [26]:
# solver is pinned to 'liblinear', which is what the unset default resolved to
# when this notebook was run (the estimator repr printed by the grid search
# shows solver='warn'). Newer scikit-learn releases default to 'lbfgs', which
# would change the fitted coefficients and the CV scores recorded here.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [27]:
%%time

cv_scores = cross_val_score(logit, X_train, y_train, scoring='roc_auc', cv=time_split, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   10.7s remaining:    7.1s


Wall time: 15.2 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.0s finished


In [29]:
# Per-fold scores and their mean.
cv_scores, np.mean(cv_scores)

(array([0.84222136, 0.66762837, 0.87016601, 0.9456678 , 0.83148685,
        0.87731504, 0.92626132, 0.85854648, 0.92573906, 0.91166498]),
 0.8656697278614922)

The CV scores are not high. For comparison, a CountVectorizer with ngram_range = range(1,3) yields a mean cross-validation ROC AUC of 91.72%.

Next, train the model on TF-IDF features.

In [33]:
# TF-IDF re-weighting with default settings; it is applied to the count
# matrices produced by CountVectorizer.
tfidf = TfidfTransformer()

In [46]:
# Learn the IDF weights from the training counts only, then apply the same
# weights to train and test.
X_train_tfidf = tfidf.fit(X_train, y_train).transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [47]:
%%time

cv_scores = cross_val_score(logit, X_train_tfidf, y_train, scoring='roc_auc', cv=time_split, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.8s remaining:    1.8s


Wall time: 3.77 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.7s finished


In [48]:
# Per-fold scores and their mean.
cv_scores, np.mean(cv_scores)

(array([0.79738066, 0.6652792 , 0.87207189, 0.93336481, 0.84811885,
        0.88257794, 0.92296525, 0.86409586, 0.92640816, 0.91885051]),
 0.8631113139982632)

Next, try TF-IDF on top of a CountVectorizer with ngrams = range(1,3), i.e. unigrams and bigrams.

In [49]:
# ngram_range must be a (min_n, max_n) tuple. (1, 2) means unigrams plus
# bigrams, which is what range(1, 3) unpacked to; a range object is rejected
# by scikit-learn's parameter validation in recent releases.
# max_features caps the vocabulary at the 50000 most frequent n-grams.
cv = CountVectorizer(ngram_range=(1, 2), max_features=50000)

with open('data/train_sessions_text.csv') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('data/test_sessions_text.csv') as inp_test_file:
    # Reuse the vocabulary learned on the training sessions.
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

((253561, 50000), (82797, 50000))

In [50]:
# Refit the IDF weights on the current training count matrix, then apply the
# same weights to train and test.
X_train_tfidf = tfidf.fit(X_train, y_train).transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [51]:
%%time

cv_scores = cross_val_score(logit, X_train_tfidf, y_train, scoring='roc_auc', cv=time_split, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.9s remaining:    2.6s


Wall time: 5.33 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.2s finished


In [52]:
# Per-fold scores and their mean.
cv_scores, np.mean(cv_scores)

(array([0.80292904, 0.66086778, 0.8746486 , 0.93779145, 0.84638388,
        0.88868909, 0.92444036, 0.87133462, 0.93035608, 0.91981705]),
 0.8657257954425998)

Add time features on top of TF-IDF: first the cyclic (sin/cos) time of day of the session start; the day-of-week feature is added afterwards.

In [59]:
# Column names of the ten timestamp slots: time1 ... time10.
times = ['time%s' % slot for slot in range(1, 11)]

In [71]:
# Parse every timestamp column to datetime, column by column, in both tables.
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

In [97]:
# Minutes since midnight of each session's first timestamp.
train_start = train_df['time1'].dt
test_start = test_df['time1'].dt

total_minutes_train = train_start.hour * 60 + train_start.minute
total_minutes_test = test_start.hour * 60 + test_start.minute

In [98]:
# Map minutes-since-midnight onto an angle: fraction of the day times 2*pi.
# The minutes are recomputed from time1 here instead of rescaling
# total_minutes_* in place, so re-running this cell always yields the same
# angles rather than shrinking them by another factor of 2*pi/1440.
MINUTES_PER_DAY = 1440
train_start, test_start = train_df['time1'].dt, test_df['time1'].dt

total_minutes_train = ((train_start.hour * 60 + train_start.minute) / MINUTES_PER_DAY) * 2 * np.pi
total_minutes_test = ((test_start.hour * 60 + test_start.minute) / MINUTES_PER_DAY) * 2 * np.pi

In [99]:
# Sine and cosine of the time-of-day angle: together they place the session
# start on a circle, so times just before and after midnight are close.
sin_minutes_train = np.sin(total_minutes_train)
cos_minutes_train = np.cos(total_minutes_train)

sin_minutes_test = np.sin(total_minutes_test)
cos_minutes_test = np.cos(total_minutes_test)

In [115]:
# Append sin and cos of the start time as two extra columns after the TF-IDF block.
X_train_time1 = hstack([
    X_train_tfidf,
    sin_minutes_train.values[:, np.newaxis],
    cos_minutes_train.values[:, np.newaxis],
])

In [117]:
# Same column layout for the test set: TF-IDF block, then sin, then cos.
X_test_time1 = hstack([
    X_test_tfidf,
    sin_minutes_test.values[:, np.newaxis],
    cos_minutes_test.values[:, np.newaxis],
])

In [119]:
%%time

cv_scores = cross_val_score(logit, X_train_time1, y_train, scoring='roc_auc', cv=time_split, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   19.5s remaining:   13.0s


Wall time: 21.7 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.4s finished


In [120]:
# Per-fold scores and their mean.
cv_scores, np.mean(cv_scores)

(array([0.7753299 , 0.72000958, 0.95912449, 0.96937823, 0.85327774,
        0.97131059, 0.94231462, 0.92059261, 0.97351572, 0.96881637]),
 0.9053669855586902)

In [125]:
# Refit on the whole training set, then keep the second predict_proba column
# (the probability of the second class in logit.classes_).
logit_test_pred = logit.fit(X_train_time1, y_train).predict_proba(X_test_time1)[:, 1]



In [127]:
# Save the predictions in submission format.
write_to_submission_file(logit_test_pred, out_file='data/subm1_31_1.csv')

In [128]:
# Upload the submission through the Kaggle CLI: -f is the file to send,
# -m the message attached to the submission.
!kaggle competitions submit catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2 -f data/subm1_31_1.csv -m "Websites with tfidf and cyclic time feature. mean CV = 0.905 roc auc"

Successfully submitted to Catch Me If You Can



  0%|          | 0.00/2.25M [00:00<?, ?B/s]
  0%|          | 8.00k/2.25M [00:00<00:33, 69.3kB/s]
 13%|#2        | 296k/2.25M [00:00<00:21, 97.8kB/s] 
 19%|#9        | 440k/2.25M [00:00<00:14, 135kB/s] 
 24%|##4       | 560k/2.25M [00:00<00:09, 184kB/s]
100%|##########| 2.25M/2.25M [00:05<00:00, 427kB/s]


In [145]:
# Day of week of each session's first timestamp (pandas: Monday=0 ... Sunday=6).
dow_train, dow_test = (
    sessions['time1'].dt.dayofweek for sessions in (train_df, test_df)
)

In [146]:
# One-hot encoder with default settings; used to turn the day-of-week
# integers into indicator columns.
enc = OneHotEncoder()

In [147]:
# Encode straight from time1 rather than overwriting the dow_* Series with
# their own encoding: once dow_train holds the encoded matrix it no longer has
# .to_frame(), so the in-place version fails when the cell is run a second time.
# The categories are learned from the training days only and reused for test.
dow_train = enc.fit_transform(train_df['time1'].dt.dayofweek.to_frame())
dow_test = enc.transform(test_df['time1'].dt.dayofweek.to_frame())

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [153]:
# Append the one-hot day-of-week columns after the TF-IDF + time-of-day block.
X_train_time2, X_test_time2 = (
    hstack([time_features, dow_features])
    for time_features, dow_features in ((X_train_time1, dow_train),
                                        (X_test_time1, dow_test))
)

In [154]:
%%time

cv_scores = cross_val_score(logit, X_train_time2, y_train, scoring='roc_auc', cv=time_split, verbose=True, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   22.6s remaining:   15.0s


Wall time: 24.8 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   24.6s finished


In [155]:
# Per-fold scores and their mean.
cv_scores, np.mean(cv_scores)

(array([0.73549047, 0.81116268, 0.88024848, 0.97711997, 0.87891314,
        0.98041425, 0.91955254, 0.9265961 , 0.76896902, 0.97553197]),
 0.885399860660762)

In [156]:
# Refit on the whole training set, then keep the second predict_proba column
# (the probability of the second class in logit.classes_).
logit_test_pred = logit.fit(X_train_time2, y_train).predict_proba(X_test_time2)[:, 1]



In [157]:
# Save the predictions in submission format.
write_to_submission_file(logit_test_pred, out_file='data/subm2_31_1.csv')

In [158]:
# Upload the submission through the Kaggle CLI: -f is the file to send,
# -m the message attached to the submission.
!kaggle competitions submit catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2 -f data/subm2_31_1.csv -m "Websites with tfidf, cyclic time and dayofweek feature. mean CV = 0.8854 roc"

Successfully submitted to Catch Me If You Can



  0%|          | 0.00/2.26M [00:00<?, ?B/s]
  0%|          | 8.00k/2.26M [00:00<00:33, 70.0kB/s]
 13%|#2        | 296k/2.26M [00:00<00:20, 98.6kB/s] 
 19%|#8        | 440k/2.26M [00:00<00:14, 136kB/s] 
 24%|##4       | 560k/2.26M [00:00<00:09, 183kB/s]
 93%|#########3| 2.11M/2.26M [00:00<00:00, 260kB/s]
100%|##########| 2.26M/2.26M [00:05<00:00, 404kB/s]


In [159]:
# Ten candidate values for C, log-spaced from 1e-2 to 1e2.
c_values = np.logspace(-2, 2, num=10)

# Search over C with the same scoring and time-ordered folds used for the
# cross_val_score runs.
logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid=dict(C=c_values),
    scoring='roc_auc',
    cv=time_split,
    n_jobs=-1,
    verbose=1,
)

In [160]:
%%time

# Cross-validate every candidate in the parameter grid on X_train_time2.
logit_grid_searcher.fit(X_train_time2, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished


Wall time: 2min 5s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [161]:
# Best mean cross-validated score and the parameter setting that achieved it.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.8887564673543981, {'C': 4.6415888336127775})

In [163]:
# Predict with the estimator the search refit using the best C
# (refit=True, as shown in the GridSearchCV repr), then save the predictions.
best_logit = logit_grid_searcher.best_estimator_
logit_test_pred = best_logit.predict_proba(X_test_time2)[:, 1]
write_to_submission_file(logit_test_pred, out_file='data/subm3_31_1.csv')

In [165]:
# Upload the submission through the Kaggle CLI: -f is the file to send,
# -m the message attached to the submission.
!kaggle competitions submit catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2 -f data/subm3_31_1.csv -m "Website tfidf. time cycle and dow sparse. C optimized. time series mean cv = 0.8887"

Successfully submitted to Catch Me If You Can



  0%|          | 0.00/2.28M [00:00<?, ?B/s]
  0%|          | 8.00k/2.28M [00:00<00:41, 57.7kB/s]
 13%|#2        | 296k/2.28M [00:00<00:25, 81.6kB/s] 
 21%|##1       | 496k/2.28M [00:00<00:16, 114kB/s] 
100%|##########| 2.28M/2.28M [00:05<00:00, 452kB/s]
