In [1]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import re
import os
import pickle
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from scipy import sparse 
from scipy import stats
from statsmodels.stats.proportion import proportion_confint
%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Поменяйте на свой путь к данным
PATH_TO_DATA = 'capstone_user_identification'

In [164]:
def prepare_train_set_with_fe(path_to_csv_files, site_freq_path, feature_names,
                                    session_length=10, window_size=10):
    ''' ВАШ КОД ЗДЕСЬ'''
    files = glob(os.path.join(path_to_csv_files, '*.csv'))

    num_filter = r'[0-9]{4}' #Фильтр для user_id

    data_set = pd.DataFrame(columns = feature_names) #Здесь собираем ответ по всем пользователям

    with open(site_freq_path, 'rb') as handle:
        site_freq = pickle.load(handle)
    site_freq_len = len(site_freq)

    for filename in files:
        #Собираем данные по сессиям 
        data = pd.DataFrame(columns = feature_names) #Здесь собираем ответ по каждому пользователю отдельно в нужном формате
        df = pd.read_csv(filename, parse_dates=True) #Сырые данные
        df.timestamp = pd.to_datetime(df.timestamp) #Почему-то дата не подхватывается в read_csv
        user_id = int(re.findall(num_filter, filename)[-1])

        #Заменим названия сайтов на их ID
        df['site'] = df['site'].map(site_freq).apply(lambda x: x[0])
        #Формируем сессию из сайтов с заданым session_length, window_size
        for i in range(1, session_length+1):
            data.loc[:, f'site{i}'] = pd.Series(df.iloc[i-1::window_size, 1].values.tolist())
        #Формируем разницу между посещениями различных сайтов
        for i in range(1, session_length):        
            data.loc[:, f'time_diff{i}'] = ((df.iloc[i::window_size, 0].reset_index() - 
                                             df.iloc[i-1::window_size, 0].reset_index())['timestamp'] / 
                                             np.timedelta64(1,'s'))
        # Запишем 
        data['target'] = user_id
        data['start_hour'] = df.iloc[::window_size, 0].dt.hour.reset_index()['timestamp']
        data['day_of_week'] = df.iloc[::window_size, 0].dt.dayofweek.reset_index()['timestamp']

        data_set = data_set.append(data, ignore_index=True)
    
    data_set['#unique_sites'] = data_set.loc[:, 'site1':'site10'].nunique(axis=1)
    data_set['session_timespan'] = data_set.loc[:,'time_diff1':'time_diff9'].sum(axis=1)
    data_set.fillna(0, inplace=True)
    data_set = data_set.astype('int')
    return data_set
    


In [167]:
feature_names = ['site' + str(i) for i in range(1,11)] + \
                ['time_diff' + str(j) for j in range(1,10)] + \
                ['session_timespan', '#unique_sites', 'start_hour', 
                 'day_of_week', 'target']
train_data_toy  = prepare_train_set_with_fe(os.path.join(PATH_TO_DATA, '10users'), 
                  site_freq_path=os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'),
                  feature_names=feature_names, session_length=10)

In [171]:
train_data_toy.loc[train_data_toy.session_timespan == 0, 'time_diff1':'time_diff9']

Unnamed: 0,time_diff1,time_diff2,time_diff3,time_diff4,time_diff5,time_diff6,time_diff7,time_diff8,time_diff9
549,0,0,0,0,0,0,0,0,0
551,0,0,0,0,0,0,0,0,0
635,0,0,0,0,0,0,0,0,0
891,0,0,0,0,0,0,0,0,0
892,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
13235,0,0,0,0,0,0,0,0,0
13260,0,0,0,0,0,0,0,0,0
13854,0,0,0,0,0,0,0,0,0
13917,0,0,0,0,0,0,0,0,0


In [162]:
session_length=10
window_size=10
path_to_csv_files=os.path.join(PATH_TO_DATA,'3users')
site_freq_path=os.path.join(PATH_TO_DATA,'site_freq_3users.pkl')
feature_names=['site' + str(i) for i in range(1,11)] + \
              ['time_diff' + str(j) for j in range(1,10)] + \
              ['session_timespan', '#unique_sites', 'start_hour', 
               'day_of_week', 'target']


files = glob(os.path.join(path_to_csv_files, '*.csv'))
    
num_filter = r'[0-9]{4}' #Фильтр для user_id

data_set = pd.DataFrame(columns = feature_names) #Здесь собираем ответ по всем пользователям

with open(site_freq_path, 'rb') as handle:
    site_freq = pickle.load(handle)
site_freq_len = len(site_freq)

for filename in files:
    #Собираем данные по сессиям 
    data = pd.DataFrame(columns = feature_names) #Здесь собираем ответ по каждому пользователю
    df = pd.read_csv(filename, parse_dates=True)
    df.timestamp = pd.to_datetime(df.timestamp)
    user_id = int(re.findall(num_filter, filename)[-1])

    #Заменим названия сайтов на их ID
    df['site'] = df['site'].map(site_freq).apply(lambda x: x[0])

    for i in range(1, session_length+1):
        #Выбираем session_length данных и конвертируем в list, даже если столько данных нет - работает
        data.loc[:, f'site{i}'] = pd.Series(df.iloc[i-1::window_size, 1].values.tolist())
    for i in range(1, session_length):        
        data.loc[:, f'time_diff{i}'] = ((df.iloc[i::window_size, 0].reset_index() - 
                                         df.iloc[i-1::window_size, 0].reset_index())['timestamp'] / 
                                         np.timedelta64(1,'s'))
        
    data.target = user_id
    data['start_hour'] = df.iloc[::window_size, 0].dt.hour.reset_index()['timestamp']
    data['day_of_week'] = df.iloc[::window_size, 0].dt.dayofweek.reset_index()['timestamp']
    
    data_set = data_set.append(data, ignore_index=True)
    data_set['#unique_sites'] = data_set.loc[:, 'site1':'site10'].nunique(axis=1)
    data_set['session_timespan'] = data_set.loc[:,'time_diff1':'time_diff9'].sum(axis=1)
    data_set.fillna(0, inplace=True)
    data_set = data_set.astype('int')
    
        

In [61]:
df.timestamp = pd.to_datetime(df.timestamp)

In [116]:
((df.iloc[6:11, 0].reset_index() - df.iloc[0:5, 0].reset_index())['timestamp'] / np.timedelta64(1, 's')).astype('int')

0    7938
1    7653
2    6472
3     249
4    3603
Name: timestamp, dtype: int32

In [159]:
df.iloc[::window_size, 0].dt.hour

0      9
10    12
Name: timestamp, dtype: int64

In [94]:
td = (df.iloc[1:5, 0] - df.iloc[6:10, 0]).astype('timedelta64[ns]')

In [137]:
data_set['#unique_sites'] = data_set.iloc[:, :10].nunique(axis=1)

In [163]:
data_set

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,time_diff5,time_diff6,time_diff7,time_diff8,time_diff9,session_timespan,#unique_sites,start_hour,day_of_week,target
0,10,8,8,4,8,5,1,6,2,9,...,2,1,2,3,55,7998,8,9,4,1
1,10,5,5,5,0,0,0,0,0,0,...,0,0,0,0,0,60,3,12,4,1
2,10,8,3,3,8,0,0,0,0,0,...,0,0,0,0,0,7935,4,9,4,2
3,7,5,8,5,8,5,5,6,11,7,...,2,1,2,3,55,7998,5,9,4,3
4,7,5,8,0,0,0,0,0,0,0,...,0,0,0,0,0,1471,3,12,4,3
