In [1]:
# загрузим библиотеки и установим опции

from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
!ls

catchme.ipynb  site_dic.pkl  test_sessions.csv	train  train_sessions.csv


In [3]:
# загрузим обучающую и тестовую выборки
train_df = pd.read_csv('train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('test_sessions.csv',
                      index_col='session_id')

# приведем колонки time1, ..., time10 к временному формату
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# отсортируем данные по времени
train_df = train_df.sort_values(by='time1')

# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

In [4]:
# загрузим словарик сайтов
with open(r"site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# датафрейм словарика сайтов
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
print(u'всего сайтов:', sites_dict.shape[0])
sites_dict.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [6]:
# наша целевая переменная
y_train = train_df['target']

# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [7]:
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [8]:
sites_count = full_sites.max().max() + 1

In [9]:
sites_counts = [0] * sites_count

In [10]:
x = full_sites.iloc[0].value_counts()

In [11]:
def calc_tf(row):
    vc = row.value_counts()
    vc = vc[vc.index != 0]
    tf = vc / vc.sum()
    return tf

In [12]:
tf = calc_tf(full_sites.iloc[0])

In [93]:
row = full_sites.iloc[0]
vc = row.value_counts()
vc = vc[vc.index != 0]

In [94]:
vc

56    1
55    1
Name: 21669, dtype: int64

In [13]:
def calc_idf(df):
    max_index = df.max().max()
    term_counts = np.zeros(max_index + 1)
    def update_counts(row):
        for i in set(row):
            term_counts[i] += 1
    df.apply(update_counts, axis = 1)
    return np.log(df.shape[0] / term_counts)

In [14]:
def calc_tfidf(df):
    idf = calc_idf(df)
    def calc_tf(row):
        vc = row.value_counts()
        vc = vc[vc.index != 0]
        tf = vc / vc.sum()
        return tf
    indptr = [0]
    indicies = []
    data = []
    def process_row(row):
        tf = calc_tf(row)
        for i in tf.index:
            indicies.append(i)
            data.append(tf[i] * idf[i])
        indptr.append(len(indicies))
    df.apply(process_row, axis = 1)
    return csr_matrix((data, indicies, indptr))

In [15]:
%%time 
tfidf = calc_tfidf(full_sites)

CPU times: user 8min 1s, sys: 432 ms, total: 8min 1s
Wall time: 8min 1s


In [25]:
def get_auc_lr_valid(X, y, C=0.1, seed=17, ratio = 0.9):
    # разделим выборку на обучающую и валидационную
    idx = round(X.shape[0] * ratio)
    # обучение классификатора
    lr = LogisticRegression(C=C, random_state=seed, n_jobs=-1).fit(X[:idx, :], y[:idx])
    # прогноз для валидационной выборки
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    # считаем качество
    score = roc_auc_score(y[idx:], y_pred)
    
    return score

In [26]:
%%time
# выделим из объединенной выборки только обучающую (для которой есть ответы)
X_train = tfidf[:idx_split, 1:]

# считаем метрику на валидационной выборке
print(get_auc_lr_valid(X_train, y_train))

0.916082679376
CPU times: user 4.05 s, sys: 40 ms, total: 4.09 s
Wall time: 1.12 s


In [83]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

# искомая матрица
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0]  + 10, 10)))[:, 1:]

In [89]:
%%time
# выделим из объединенной выборки только обучающую (для которой есть ответы)
X_train1 = full_sites_sparse[:idx_split, :]

# считаем метрику на валидационной выборке
print(get_auc_lr_valid(X_train1, y_train))

0.920893762469
CPU times: user 5.89 s, sys: 96 ms, total: 5.98 s
Wall time: 1.62 s


In [90]:
print(full_sites_sparse[0, :])

  (0, 55)	1
  (0, 54)	1
