In [1]:
import os
import pickle
import eli5
import numpy as np
import pandas as pd
import seaborn as sns
import calmap

from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    TimeSeriesSplit, cross_val_score, GridSearchCV, ParameterGrid
)
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from IPython.display import display_html
from category_encoders import WOEEncoder

In [2]:
# Directory that holds the pickled site dictionary and the session CSV files.
PATH_TO_DATA = './data/'
# Seed passed as random_state to the estimators/encoders in this notebook.
SEED = 17
# Column names of the ten timestamp / site slots: time1..time10 and site1..site10.
TIMES = ['time{}'.format(slot) for slot in range(1, 11)]
SITES = ['site{}'.format(slot) for slot in range(1, 11)]

In [3]:
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on a site dictionary that comes from a trusted source.
site_dic_path = os.path.join(PATH_TO_DATA, 'site_dic.pkl')
with open(site_dic_path, 'rb') as pkl_file:
    site2id = pickle.load(pkl_file)

# Reverse lookup: numeric id -> site name.
id2site = dict((site_id, site) for site, site_id in site2id.items())
# Id 0 is the filler that prepare_sparse_features substitutes for empty site slots.
id2site[0] = 'unknown'

In [4]:
def grid_search_vectorizer_params(params, train_df, test_df, idx_split, additional_data_df,
                                  n_splits=10, n_jobs=4):
    """Cross-validate a logistic regression for every TF-IDF parameter combination.

    For each combination produced by ``ParameterGrid(params)`` the sessions are
    re-vectorized with ``prepare_sparse_features``, the first ``idx_split`` rows
    of ``additional_data_df`` are appended as extra columns, and the model is
    scored with time-ordered cross-validation (ROC AUC).

    Parameters
    ----------
    params : dict or list of dict
        Grid specification accepted by ``ParameterGrid``; each point is passed
        to ``TfidfVectorizer`` as keyword arguments.
    train_df, test_df : pandas.DataFrame
        Frames forwarded unchanged to ``prepare_sparse_features``.
    idx_split : int
        Number of leading rows of ``additional_data_df`` that belong to train.
    additional_data_df : pandas.DataFrame
        Extra features; train rows first.
    n_splits : int, default 10
        Number of ``TimeSeriesSplit`` folds.
    n_jobs : int, default 4
        Parallel jobs used by ``cross_val_score``.

    Returns
    -------
    list of (numpy.ndarray, dict)
        One ``(cv_scores, vectorizer_params)`` pair per grid point, in grid order.
    """
    time_split = TimeSeriesSplit(n_splits=n_splits)
    logit = LogisticRegression(random_state=SEED, solver='liblinear')
    # The extra training columns do not depend on the grid point: slice once.
    extra_train = additional_data_df.values[:idx_split, :]
    result = []

    for vectorizer_params in ParameterGrid(params):
        # Only the training matrix and labels are needed for cross-validation.
        X_train, _, y_train, _ = prepare_sparse_features(
            train_df, test_df, vectorizer_params=vectorizer_params
        )
        X_train = hstack([X_train, extra_train])

        cv_scores = cross_val_score(
            logit,
            X_train,
            y_train,
            cv=time_split,
            scoring='roc_auc',
            n_jobs=n_jobs
        )
        result.append((cv_scores, vectorizer_params))
        print(vectorizer_params)
        print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
        print()

    return result

In [5]:
def _get_sites(sites, site_names=None):
    """Turn a sequence of site ids into one space-separated string of site names.

    Ids equal to 0 are skipped. A leading ``'www.'`` is stripped from each name;
    only the prefix is removed, so ``'www.'`` occurring later inside a host name
    (e.g. ``'www.awww.com'``) is left intact.

    Parameters
    ----------
    sites : iterable of int
        Site ids of one session.
    site_names : mapping, optional
        Lookup from site id to site name. Defaults to the module-level
        ``id2site`` dictionary.

    Returns
    -------
    str
        Site names joined by single spaces (empty string if nothing remains).
    """
    if site_names is None:
        site_names = id2site

    prefix = 'www.'
    names = []
    for site_id in sites:
        if site_id == 0:
            continue
        name = site_names[site_id]
        if name.startswith(prefix):
            # Slice instead of str.replace: replace would also delete inner matches.
            name = name[len(prefix):]
        names.append(name)
    return ' '.join(names)

In [6]:
def write_to_submission_file(predicted_labels, out_file='submission.csv',
                             target='target', index_label='session_id'):
    """Save predictions as a CSV with a 1-based id column.

    Parameters
    ----------
    predicted_labels : array-like with a ``shape`` attribute
        One prediction per row.
    out_file : str
        Destination path of the CSV file.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the id column; ids run from 1 to the number of predictions.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [7]:
def get_dataframes(path_to_train, path_to_test):
    """Read the train and test session CSVs.

    Both files are indexed by ``session_id`` and have the ``TIMES`` columns
    parsed as dates. Only the training frame is re-ordered (by ``time1``);
    the test frame keeps its file order.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.
    """
    read_options = dict(index_col='session_id', parse_dates=TIMES)

    train_df = pd.read_csv(path_to_train, **read_options)
    test_df = pd.read_csv(path_to_test, **read_options)

    return train_df.sort_values(by='time1'), test_df

In [8]:
def prepare_sparse_features(train_df, test_df, vectorizer_params):
    """Build TF-IDF matrices of visited sites for train and test.

    The train frame (without ``target``) and the test frame are stacked, every
    session's ``SITES`` columns are converted to a space-separated string of
    site names via ``_get_sites`` (missing slots become id 0 first), and a
    single ``TfidfVectorizer`` is fitted on all of those strings.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the ``SITES`` columns and ``target``.
    test_df : pandas.DataFrame
        Must contain the ``SITES`` columns.
    vectorizer_params : dict
        Keyword arguments for ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, vectorizer)`` where the two matrices are
        the train / test row slices of the fitted TF-IDF matrix and ``y_train``
        is the integer ``target`` array.
    """
    n_train = train_df.shape[0]
    features_only = train_df.drop('target', axis=1)
    combined = pd.concat([features_only, test_df])

    site_ids = combined[SITES].fillna(0).astype('int')
    session_texts = site_ids.apply(_get_sites, axis=1).tolist()

    vectorizer = TfidfVectorizer(**vectorizer_params)
    tfidf = vectorizer.fit_transform(session_texts)

    y_train = train_df['target'].astype('int').values

    # Rows keep the concat order: train first, then test.
    return tfidf[:n_train, :], tfidf[n_train:, :], y_train, vectorizer

In [9]:
def train_and_predict(model, X_train, y_train, X_test, site_feature_names,
                      cv, new_feature_names=None, submission_file_name=None):
    """Cross-validate ``model``, fit it on all training data and report weights.

    Steps: print per-fold and aggregate ROC AUC, fit the model, print the
    coefficients of the extra (non-site) features if any were given, optionally
    write test-set probabilities to a submission file, and display the top 30
    weights with eli5.

    Parameters
    ----------
    model : estimator
        Must expose ``fit``, ``coef_`` and ``predict_proba``.
    X_train, y_train : training matrix and labels.
    X_test : test matrix (used only when ``submission_file_name`` is given).
    site_feature_names : sequence of str
        Names of the leading (site) columns; list or array.
    cv : cross-validation splitter passed to ``cross_val_score``.
    new_feature_names : sequence of str, optional
        Names of the trailing extra columns; list or array.
    submission_file_name : str, optional
        When given, positive-class probabilities for ``X_test`` are written there.

    Returns
    -------
    (numpy.ndarray, estimator)
        The cross-validation scores and the fitted model.
    """
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='roc_auc',
        n_jobs=4
    )
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))

    model.fit(X_train, y_train)

    # Coerce to lists: array-valued names would neither concatenate with `+`
    # nor be usable in a truth test.
    all_feature_names = list(site_feature_names)
    extra_names = [] if new_feature_names is None else list(new_feature_names)

    if extra_names:
        all_feature_names = all_feature_names + extra_names
        # The extra features occupy the last columns of the design matrix.
        print('New feature weights:')
        print(pd.DataFrame({
            'feature': extra_names,
            'coef': model.coef_.flatten()[-len(extra_names):]
        }))

    if submission_file_name:
        test_pred = model.predict_proba(X_test)[:, 1]
        write_to_submission_file(test_pred, submission_file_name)

    display_html(eli5.show_weights(
        estimator=model,
        feature_names=all_feature_names,
        top=30
    ))

    return cv_scores, model

Сформируем с помощью `TfidfVectorizer` разреженные матрицы на основе датасета.

In [10]:
# Load both session tables; train comes back sorted by time1 (see get_dataframes).
train_csv_path = os.path.join(PATH_TO_DATA, 'train_sessions.csv')
test_csv_path = os.path.join(PATH_TO_DATA, 'test_sessions.csv')

train_df, test_df = get_dataframes(path_to_train=train_csv_path,
                                   path_to_test=test_csv_path)

In [11]:
# Rows [0, idx_split) of every stacked frame below are train, the rest are test.
idx_split = len(train_df)

train_without_target = train_df.drop('target', axis=1)
full_df = pd.concat([train_without_target, test_df])

y_train = train_df['target']

# Empty frame, aligned with full_df, that collects the hand-made features.
additional_data_df = pd.DataFrame(index=full_df.index)

Добавим **время суток** как WOE

In [12]:
# Hour of the first page view, one single-column frame per data set.
start_hour_train_df = train_df['time1'].dt.hour.to_frame('start_hour')
start_hour_test_df = test_df['time1'].dt.hour.to_frame('start_hour')

# Weight-of-evidence encoding, fitted on the training targets only.
# NOTE(review): the encoder sees every training target, while the model is later
# scored with TimeSeriesSplit on the same rows, so CV scores may be optimistic —
# confirm this is acceptable.
woe_enc = WOEEncoder(cols=['start_hour'], random_state=SEED)
woe_enc.fit(start_hour_train_df, y_train)

start_hour_train_df['start_hour'] = woe_enc.transform(start_hour_train_df)
start_hour_test_df['start_hour'] = woe_enc.transform(start_hour_test_df)

# Stack train over test (same row order as full_df) and attach as a new column.
start_hour_full_df = pd.concat([start_hour_train_df, start_hour_test_df])
additional_data_df = pd.concat(
    [additional_data_df, start_hour_full_df],
    axis=1
)

Добавим **продолжительность сессии**

In [13]:
# Session length: latest minus earliest timestamp among the TIMES columns,
# expressed in milliseconds.
session_first = full_df[TIMES].min(axis=1)
session_last = full_df[TIMES].max(axis=1)
additional_data_df['duration'] = (
    (session_last - session_first).astype('timedelta64[ms]').astype(int)
)

# Standardize the column; the scaler is fitted on train and test rows together.
scaler = StandardScaler()
duration_column = additional_data_df['duration'].values.reshape(-1, 1)
additional_data_df['duration'] = scaler.fit_transform(duration_column)

Добавим **сезон**

In [14]:
# One 0/1 indicator per season, derived from the month of the session start.
# The list order fixes the order in which the columns are appended.
season_months = [
    ('summer', [6, 7, 8]),
    ('autumn', [9, 10, 11]),
    ('winter', [12, 1, 2]),
    ('spring', [3, 4, 5]),
]
start_month = full_df['time1'].dt.month
for season_name, months in season_months:
    additional_data_df[season_name] = start_month.isin(months).astype('int')

Добавим **день недели**

In [15]:
# Day of week of the session start (Monday=0 ... Sunday=6).
# Uses the vectorized .dt accessor instead of a per-row Python lambda;
# .values assigns by position, so the repeated session ids in the index
# are never used for alignment.
additional_data_df['day_of_week'] = full_df['time1'].dt.dayofweek.values

Добавим логарифм от числа **год + месяц** (вычисляется как `100 * год + месяц`, например 201402)

In [16]:
# Natural log of the YYYYMM number (100 * year + month) of the session start.
# Computed with the vectorized .dt accessor rather than a per-row lambda;
# .values assigns by position.
additional_data_df['year_month'] = np.log(
    (100 * full_df['time1'].dt.year + full_df['time1'].dt.month).values
)

Закончилась ли сессия **просмотром 10 сайтов**? (Признак равен 1, если десятого сайта нет, т.е. сессия оказалась короче 10 сайтов.)

In [17]:
# Flag is 1 when the tenth site slot is empty (session had fewer than ten
# sites) and 0 when it is filled. The column reuses the name 'site10'.
site10_is_missing = full_df['site10'].isna()
additional_data_df['site10'] = (
    site10_is_missing.astype('int').values.reshape(-1, 1)
)

Добавим **номер недели**

In [18]:
def _week_of_year(timestamps):
    """Return the ISO week number of a datetime Series as integers.

    ``Series.dt.week`` no longer exists in recent pandas releases, where
    ``Series.dt.isocalendar().week`` replaces it; older releases only have
    the former. Use whichever is available.
    """
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        return accessor.isocalendar().week.astype('int64')
    return accessor.week


# Week number of the session start, one single-column frame per data set.
n_week_train_df = pd.DataFrame(index=train_df.index)
n_week_train_df['n_week'] = _week_of_year(train_df['time1'])

n_week_test_df = pd.DataFrame(index=test_df.index)
n_week_test_df['n_week'] = _week_of_year(test_df['time1'])

# Weight-of-evidence encoding, fitted on the training targets only.
woe_enc = WOEEncoder(cols=['n_week'], random_state=SEED)
woe_enc.fit(n_week_train_df, y_train)

n_week_train_df['n_week'] = woe_enc.transform(n_week_train_df)
n_week_test_df['n_week'] = woe_enc.transform(n_week_test_df)

# Stack train over test (same row order as full_df) and attach as a new column.
n_week_full_df = pd.concat([n_week_train_df, n_week_test_df])
additional_data_df = pd.concat([additional_data_df, n_week_full_df], axis=1)

In [19]:
# Absolute correlation of every column with the target, on training rows only.
# The engineered 'site10' flag shares its name with the raw 'site10' column,
# so that label shows up twice in the result.
train_features_df = additional_data_df[:idx_split]
tmp = pd.concat([train_df, train_features_df], axis=1)

corrs = tmp.corr()['target'].abs()
is_noticeable = corrs > .0001
corrs[is_noticeable]

site1          0.001635
site2          0.003593
site3          0.004641
site4          0.003328
site5          0.005701
site6          0.004205
site7          0.002898
site8          0.006590
site9          0.004619
site10         0.006350
target         1.000000
start_hour     0.136587
duration       0.027864
summer         0.009585
autumn         0.051446
winter         0.023724
spring         0.012645
day_of_week    0.041859
year_month     0.034096
site10         0.024135
n_week         0.087268
Name: target, dtype: float64

In [20]:
# TF-IDF settings: sessions are already space-separated site names, so the
# tokenizer is a plain whitespace split.
vectorizer_params = dict(
    max_df=0.25,
    ngram_range=(1, 6),
    max_features=100000,
    norm='l2',
    binary=True,
    tokenizer=lambda s: s.split(),
)

X_train, X_test, y_train, vectorizer = prepare_sparse_features(
    train_df,
    test_df,
    vectorizer_params=vectorizer_params,
)

# Append the hand-made features to the right of the TF-IDF columns;
# the first idx_split rows are train, the remainder test.
extra_feature_values = additional_data_df.values
X_train = hstack([X_train, extra_feature_values[:idx_split, :]])
X_test = hstack([X_test, extra_feature_values[idx_split:, :]])
feature_names = list(additional_data_df.columns)

In [None]:
time_split = TimeSeriesSplit(n_splits=10)
logit_params = {
    # Appears to equal np.logspace(-4, 4, 20)[10], i.e. a point of the grid in
    # the commented-out search below.
    'C': 1.623776739188721,
    'penalty': 'l2',
    'solver': 'liblinear',
    'random_state': SEED
}
logit = LogisticRegression(**logit_params)

# Newer scikit-learn releases only provide get_feature_names_out() (which
# returns an array), older ones only get_feature_names(); support both and
# always hand a plain list to train_and_predict.
if hasattr(vectorizer, 'get_feature_names_out'):
    site_feature_names = list(vectorizer.get_feature_names_out())
else:
    site_feature_names = list(vectorizer.get_feature_names())

# train_and_predict returns (cv_scores, model), so cv_score holds that pair.
cv_score = train_and_predict(
    model=logit,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    site_feature_names=site_feature_names,
    new_feature_names=feature_names,
    cv=time_split,
    submission_file_name='submission.csv'
);


CV scores [0.86643123 0.91419303 0.95825852 0.95338674 0.95789663 0.98008842
 0.92195744 0.98545972 0.95585084 0.99093732]
CV mean: 0.9484459881980023, CV std: 0.03611501747527895


In [None]:
# parameters = {
#     'C': np.logspace(-4, 4, 20)
# }

# logit_text_clf = GridSearchCV(logit, parameters, cv=time_split, scoring='roc_auc')
# logit_text_clf.fit(X_train, y_train);

In [None]:
# rcParams only affects figures created afterwards, so set the size first.
plt.rcParams["figure.figsize"] = (20, 5)

# Sessions labelled target == 1.
alice = train_df[train_df['target'] == 1]

# Sessions per calendar day. floor (not round) keeps afternoon sessions on
# the day they started instead of pushing them to the next day.
alice_sessions = (
    alice['time1'].dt.floor('D')
    .value_counts()
    .sort_index()
)

plt.scatter(alice_sessions.index, alice_sessions.values)
plt.xlabel('day')
plt.ylabel('number of sessions');

In [None]:
# rcParams only affects figures created afterwards, so set the size first.
plt.rcParams["figure.figsize"] = (20, 5)

# Sessions labelled target == 0.
not_alice = train_df[train_df['target'] == 0]

# Sessions per calendar day. floor (not round) keeps afternoon sessions on
# the day they started instead of pushing them to the next day.
not_alice_sessions = (
    not_alice['time1'].dt.floor('D')
    .value_counts()
    .sort_index()
)

plt.scatter(not_alice_sessions.index, not_alice_sessions.values)
# The x axis holds one point per day, so label it like the sibling plots.
plt.xlabel('day')
plt.ylabel('number of sessions');

In [None]:
# Test-set sessions per calendar day. floor (not round) keeps afternoon
# sessions on the day they started instead of pushing them to the next day.
test_sessions = (
    test_df['time1'].dt.floor('D')
    .value_counts()
    .sort_index()
)

plt.scatter(test_sessions.index, test_sessions.values)
plt.xlabel('day')
plt.ylabel('number of sessions');

In [None]:
# Calendar view of the per-day test-set session counts for 2014.
calmap.yearplot(test_sessions, year=2014);

In [None]:
# Calendar view of the per-day target == 1 session counts for 2013.
calmap.yearplot(alice_sessions, year=2013);

In [None]:
# Calendar view of the per-day target == 0 session counts for 2013.
calmap.yearplot(not_alice_sessions, year=2013);

In [None]:
# Calendar view of the per-day target == 1 session counts for 2014.
calmap.yearplot(alice_sessions, year=2014);

In [None]:
# Calendar view of the per-day target == 0 session counts for 2014.
calmap.yearplot(not_alice_sessions, year=2014);