In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns

import matplotlib.pyplot as plt
import sklearn as sk
import scipy as sc
import catboost as cb
import pickle as pic


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostClassifier

In [None]:
# Load the train and test session tables, indexed by session id.
train_df = pd.read_csv(
    '../input/catch-me-if-you-can/train_sessions.csv',
    index_col='session_id',
)
test_df = pd.read_csv(
    '../input/catch-me-if-you-can/test_sessions.csv',
    index_col='session_id',
)

In [None]:
# Preview the first five training sessions (head() defaults to n=5).
train_df.head()

# **Отбор факторов для анализа**

In [None]:
# Build per-session features for the training set:
#   target          - label copied from train_df
#   time_diff1..9   - seconds between consecutive page visits (0 when a visit is missing)
#   time_of_session - total session duration in seconds
#   hour / day_of_week / month - calendar parts of the first visit
#   unique_sites    - number of distinct (non-padding) site ids in the session
timepoints = train_df[['time%s' % i for i in range(1, 11)]]
sites = train_df[['site%s' % i for i in range(1, 11)]].fillna(0).astype(int).values

# Parse every timestamp column once instead of re-parsing inside the loop.
parsed_times = timepoints.apply(pd.to_datetime)

features = pd.DataFrame(index=train_df.index)
# Assign the label once, before the time_diff columns, to keep the column order.
features['target'] = train_df['target']

time_diff_columns = []
for td_index in range(1, 10):
    column = 'time_diff{}'.format(td_index)
    gap = parsed_times['time{}'.format(td_index + 1)] - parsed_times['time{}'.format(td_index)]
    # Missing visits produce NaT gaps; treat them as zero seconds.
    features[column] = gap.dt.total_seconds().fillna(0)
    time_diff_columns.append(column)

# Sum ONLY the time_diff columns. Summing the whole frame would add the
# 'target' label into the duration and leak the answer into the feature.
features['time_of_session'] = features[time_diff_columns].sum(axis=1)

session_start = parsed_times['time1']
features['hour'] = session_start.dt.hour
features['day_of_week'] = session_start.dt.weekday
features['month'] = session_start.dt.month
# 0 is the fill value for missing sites, so it is excluded from the count.
features['unique_sites'] = [len(np.unique(session[session != 0])) for session in sites]

In [None]:
features.head(5)

# **Визуальный анализ данных**

In [None]:
# One facet per target class showing the distribution of session duration.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot when the environment's seaborn version allows it.
g = sns.FacetGrid(data=features, col='target')
g.map(sns.distplot, 'time_of_session', color='purple')
# Restrict the visible x-range to short sessions.
plt.xlim(-50, 400)

In [None]:
# Session counts per start hour, split by target (full y-range).
plt.figure(figsize=(7, 5))
sns.countplot(x='hour', hue='target', data=features, palette='inferno')
plt.xlabel('Время начала')
plt.ylabel('Кол-во сессий')
plt.title('Распределение сессий по часу начала')

# Same chart with the y-axis capped at 1100 so the smaller bars are readable.
plt.figure(figsize=(7, 5))
plt.ylim(0, 1100)
sns.countplot(x='hour', hue='target', data=features, palette='inferno')
plt.xlabel('Время начала')
plt.ylabel('Кол-во сессий')
plt.title('Срез при target=1')

Распределение сессий у Alice и других пользователей отличается по времени начала. Основная активность у Alice происходит с 12 до 13 часов дня и с 16 до 18 часов.

In [None]:
# Session counts per weekday of the first visit, split by target (full y-range).
plt.figure(figsize=(7, 5))
sns.countplot(x='day_of_week', hue='target', data=features, palette='inferno')
plt.xlabel('День недели')
plt.ylabel('Кол-во сессий')
plt.grid()
plt.title('Распределение посещений по дням недели')

# Same chart with the y-axis capped at 1000 so the smaller bars are readable.
plt.figure(figsize=(7, 5))
plt.ylim(0, 1000)
sns.countplot(x='day_of_week', hue='target', data=features, palette='inferno')
plt.xlabel('День недели')
plt.ylabel('Кол-во сессий')
plt.grid()
plt.title('Срез при target=1')

Распределение посещений сайтов по дням недели у Alice и у других пользователей различается, но наименьшая активность у всех наблюдается в выходные.

In [None]:
# Session counts per month of the first visit, split by target (full y-range).
plt.figure(figsize=(7, 5))
sns.countplot(x='month', hue='target', data=features, palette='inferno')
plt.xlabel('Месяц')
plt.ylabel('Кол-во сессий')
plt.title('Распределение посещений по месяцам')

# Same chart with the y-axis capped at 600 so the smaller bars are readable.
plt.figure(figsize=(7, 5))
plt.ylim(0, 600)
sns.countplot(x='month', hue='target', data=features, palette='inferno')
plt.xlabel('Месяц')
plt.ylabel('Кол-во сессий')
plt.title('Срез при target=1')

В основном распределение посещений сайтов Alice по месяцам совпадает с другими пользователями. Наименьшая (почти нулевая) активность у Alice наблюдается с мая по август.

In [None]:
# Session counts per number of distinct sites in the session, split by target.
plt.figure(figsize=(7, 5))
sns.countplot(x='unique_sites', hue='target', data=features, palette='inferno')
plt.xlabel('Сайт')
plt.ylabel('Кол-во посещений')
plt.title('График посещений уникального сайта')

# Same chart with the y-axis capped at 600 so the smaller bars are readable.
plt.figure(figsize=(7, 5))
plt.ylim(0, 600)
sns.countplot(x='unique_sites', hue='target', data=features, palette='inferno')
plt.xlabel('Сайт')
plt.ylabel('Кол-во посещений')
plt.title('Срез при target=1')

Распределение числа уникальных сайтов в сессии у Alice примерно такое же, как у остальных пользователей, поэтому этот признак не отбирается для анализа.

Таким образом, в результате визуального анализа данных были выбраны следующие факторы для анализа:
    
    1. Час начала сессии
    2. День недели
    3. Месяц

In [None]:
# Build the same per-session features for the test set (no label available).
test = pd.DataFrame()

timepoints = test_df[['time{}'.format(i) for i in range(1, 11)]]
sites = test_df[['site{}'.format(i) for i in range(1, 11)]].fillna(0).astype(int).values

# Seconds between consecutive visits; gaps with a missing visit become 0.
for td_index in range(1, 10):
    later_visit = pd.to_datetime(timepoints['time{}'.format(td_index + 1)])
    earlier_visit = pd.to_datetime(timepoints['time{}'.format(td_index)])
    test['time_diff{}'.format(td_index)] = (later_visit - earlier_visit).dt.total_seconds().fillna(0)

# At this point the frame holds only time_diff columns, so the row sum is
# the total session duration in seconds.
test['time_of_session'] = test.sum(axis=1)

first_visit = pd.to_datetime(timepoints['time1'])
test['hour'] = first_visit.dt.hour
test['day_of_week'] = first_visit.dt.weekday
test['month'] = first_visit.dt.month
# 0 is the fill value for missing sites, so it is excluded from the count.
test['unique_sites'] = [len(np.unique(session[session != 0])) for session in sites]

In [None]:
def _alice_peak_hours(hour):
    """Return a boolean mask that is True for start hours 12-13 or 16-18 (inclusive)."""
    return ((hour >= 12) & (hour <= 13)) | ((hour >= 16) & (hour <= 18))


# Binary flags derived from the calendar features. Train and test MUST use
# the same encoding; previously 'target_hour' was 0 inside the peak hours on
# train but 1 inside them on test, so the feature meant opposite things.
# target_hour: 1 when the session starts inside Alice's peak hours.
features['target_hour'] = np.where(_alice_peak_hours(features['hour']), 1, 0)
test['target_hour'] = np.where(_alice_peak_hours(test['hour']), 1, 0)

# target_week_day: 1 for Saturday/Sunday (weekday 5 or 6).
features['target_week_day'] = np.where((features['day_of_week'] == 5) | (features['day_of_week'] == 6), 1, 0)
test['target_week_day'] = np.where((test['day_of_week'] == 5) | (test['day_of_week'] == 6), 1, 0)

# target_month: 1 for May through August.
features['target_month'] = np.where((features['month'] >= 5) & (features['month'] <= 8), 1, 0)
test['target_month'] = np.where((test['month'] >= 5) & (test['month'] <= 8), 1, 0)

flag_columns = ['target_hour', 'target_week_day', 'target_month']
target_train = features[flag_columns]
target_test = test[flag_columns]

In [None]:
# Training labels as a plain NumPy array.
Y_train = features['target'].to_numpy()

In [None]:
# Names of the ten site-id columns. Note: this rebinds `sites`, which earlier
# cells used for a NumPy array of site ids.
sites = ['site{}'.format(position) for position in range(1, 11)]
# Replace missing site ids with 0 and store them as integers, in both tables.
train_df[sites] = train_df[sites].fillna(value=0).astype(int)
test_df[sites] = test_df[sites].fillna(value=0).astype(int)

In [None]:
# Write each session as one line of space-separated site ids (no index, no
# header) so the files can be fed to a text vectorizer line by line.
train_df[sites].to_csv(
    'train_sessions_text.txt',
    sep=' ',
    index=None,
    header=None,
)
test_df[sites].to_csv(
    'test_sessions_text.txt',
    sep=' ',
    index=None,
    header=None,
)

# **Процесс построения логистической регрессии**

In [None]:
# Bag-of-sites representation: each session line becomes a sparse count
# vector over at most 50000 site-id tokens (unigrams only).
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more characters, so single-digit site ids (and the 0 padding) are
# not counted - confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 1), max_features=50000)

# Learn the vocabulary on the training sessions only...
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)

# ...and reuse it unchanged for the test sessions.
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)

print(X_train.shape, X_test.shape)

In [None]:
# Hold out 10% of the training sessions (random split) for validation.
X_train_log, X_valid_log, Y_train_log, Y_valid_log = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    random_state=42,
)
# Logistic regression on the bag-of-sites counts.
log_reg = LogisticRegression(
    C=1.0,
    random_state=42,
    solver='lbfgs',
    max_iter=500,
)
log_reg.fit(X_train_log, Y_train_log)

In [None]:
# Validation ROC AUC of the logistic regression, reported as a whole percent.
y_pred = log_reg.predict_proba(X_valid_log)
# Column 1 holds the probability of the positive class.
score_log = roc_auc_score(Y_valid_log, y_pred[:, 1])
# Scale to percent BEFORE rounding: round(x, 2) * 100 can print binary
# floating-point artifacts such as 56.99999999999999.
print("log", round(score_log * 100))

In [None]:
# Refit the same model on X_train / Y_train before predicting on the test set.
log_reg.fit(X=X_train, y=Y_train)

In [None]:
# Class probabilities for the test sessions (one column per class).
Y_test = log_reg.predict_proba(X=X_test)
# Show the first five rows only.
Y_test[0:5]

# **Процесс построения алгоритма Random Forest**

In [None]:
# Random forest on the same hold-out training part used for the logistic
# regression (500 trees, depth capped at 20, fixed seed).
random_forest_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    random_state=42,
)
random_forest_clf.fit(X_train_log, Y_train_log)

# **Алгоритм градиентного бустинга (catboost)**

In [None]:
SEED = 1

# Hold out 25% of the training sessions for CatBoost's eval set.
# The training part gets its own names so the full X_train / Y_train are NOT
# overwritten: rebinding them made this cell non-idempotent (every re-run
# shrank the data again) and left later cells with only part of the data.
X_train_cb, X_valid, Y_train_cb, Y_valid = train_test_split(
    X_train, Y_train, test_size=0.25, random_state=SEED
)

params = {'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'verbose': 200,
          'random_seed': SEED
         }
cbc_1 = CatBoostClassifier(**params)
# use_best_model=True keeps the iteration with the best eval-set metric.
cbc_1.fit(X_train_cb, Y_train_cb,
          eval_set=(X_valid, Y_valid),
          use_best_model=True,
          plot=True
         );

In [None]:
# Predicted class labels for the test sessions ('Class' is predict()'s default).
# NOTE(review): the model is evaluated with AUC, which is normally computed on
# probabilities - confirm that labels rather than predict_proba are wanted here.
pred = cbc_1.predict(X_test, prediction_type='Class')
pred

Кросс-валидация

In [None]:
# Two-step pipeline: impute missing values with SimpleImputer's defaults, then
# fit a RandomForestRegressor with default settings.
# NOTE(review): the other models in this notebook are classifiers and this
# regressor has no random_state - confirm a regressor is intended here.
my_pipeline = make_pipeline(SimpleImputer(), RandomForestRegressor())

In [None]:
#scores = cross_val_score(my_pipeline, X_train, Y_train, scoring='neg_mean_absolute_error')
#print(scores)