In [21]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder
import eli5
import pickle

In [2]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns = [target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
train_df = pd.read_csv(r'data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv(r'data/test_sessions.csv', index_col='session_id')

In [4]:
# Convert the time columns to datetime type from object
times = ['time%s' %i for i in range(1,11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sorting the data by time
train_df.sort_values(by='time1', inplace=True)

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
sites = ['site%s' %i for i in range(1,11)]
train_df[sites].fillna(0).astype('int').to_csv('data/train_sessions_text.csv', sep=',', index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('data/test_sessions_text.csv', sep=',', index=None, header=None)

In [34]:
%%time
cv = CountVectorizer(max_features=50000)

with open('data/train_sessions_text.csv') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('data/test_sessions_text.csv') as inp_test_file:
    X_test = cv.transform(inp_test_file)

Wall time: 6.86 s


In [35]:
X_train.shape, X_test.shape

((253561, 41592), (82797, 41592))

In [36]:
y_train = train_df['target'].astype('int')

In [37]:
time_split = TimeSeriesSplit(n_splits=10)

In [38]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [39]:
logit = LogisticRegression(C=1, random_state=17)

In [40]:
%%time

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

Wall time: 50.8 s


In [41]:
cv_scores, cv_scores.mean()

(array([0.84222136, 0.66762837, 0.87016601, 0.9456678 , 0.83148685,
        0.87731504, 0.92626132, 0.85854648, 0.92573906, 0.91166498]),
 0.8656697278614922)

In [42]:
logit.fit(X_train, y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [63]:
eli5.show_weights(logit, feature_names=feature_names)

Weight?,Feature
+5.929,lachroniquefaciledumercredi.wordpress.com
+3.755,juicing-fr.mo7e.com
+3.158,www.jle.com
+2.992,www.circuit-electronique.fr
+2.896,ws.m6.fr
+2.880,paiement.societe.com
+2.781,www.wikidata.org
+2.709,a291862.wc.s1.b.yahoodns.net
+2.631,www.footdelles.com
+2.582,elgateado.free.fr


In [22]:
with open(r"data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

In [27]:
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])

In [33]:
sites_dict.iloc[570]

site    bricolage.linternaute.com
Name: 25111, dtype: object

In [61]:
feature_names = list(pd.Series(cv.get_feature_names()).astype('int').apply(lambda site: sites_dict.iloc[site]).values.flatten())

In [62]:
feature_names

['images.mystockphoto.com',
 'cdnetworks.static69.com',
 'i1-js-14-3-01-11074-431860008-i.init.cedexis-radar.net',
 'i1-js-14-3-01-11074-355046712-i.init.cedexis-radar.net',
 'www.alfa147-france.net',
 'pegasus.portal.nom.br',
 'store.akamai.steamstatic.com',
 'i1-js-14-3-01-11074-928034736-i.init.cedexis-radar.net',
 'i1-js-14-3-01-11074-953586925-i.init.cedexis-radar.net',
 'forum.hackbbs.org',
 's01.static-shell.com',
 'le54.blogspot.fr',
 'mono-project.com',
 'images-cache.frandroid.com',
 'i1-js-14-3-01-12160-55914827-i.init.cedexis-radar.net',
 'i1-js-14-3-01-10322-82158527-i.init.cedexis-radar.net',
 'oeconomia.net',
 'secure.logmein.com',
 'img257.imageshack.us',
 'www.vigicrues.ecologie.gouv.fr',
 'www.estudines.com',
 'aepdaks3.aetndigital.com',
 'colvibsvt.space-blogs.net',
 'i1-js-14-3-01-12160-712066075-i.init.cedexis-radar.net',
 'www.sqlservercentral.com',
 'video.foxnews.com',
 'trustlogo.comodo.com',
 'i1-js-14-3-01-11074-842307118-i.init.cedexis-radar.net',
 'tankafai