In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [7]:
from pyts.transformation import ROCKET

In [1]:
from parallel_pandas import ParallelPandas

In [8]:
from tsfresh import extract_features

In [9]:
import gc
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import pyarrow as pa
from pyarrow.compute import count_distinct
import scipy
import implicit
import bisect
import functools as ft
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
# from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

import requests
import json

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from tqdm import tqdm

%matplotlib inline
plt.rcParams["figure.figsize"] = 14, 9
sns.set(font_scale=1.2)
sns.set_style('darkgrid')

GET_A_PART_OF_DATA = False
DATA_PATH = './context_data/competition_data_final_pqt'
TARGET_PATH = './context_data/public_train.pqt'

In [10]:
def read_pqt(file, columns):
    return pq.read_table(file).select(columns).to_pandas()

def get_target(columns):
    return read_pqt(TARGET_PATH, columns)

def get_data(columns):
    return read_pqt(DATA_PATH, columns)


### Доли мужских сайтов

In [6]:
data = get_data(columns=['user_id', 'url_host'])
if GET_A_PART_OF_DATA:
    data = data[0:1000000]
data.drop_duplicates(inplace=True)

In [7]:
gc.collect()

24

In [8]:
# Из датафрейма удалются сайты, у которых было менее 20 уникальных пользователей

sites = data.groupby('url_host', as_index=False).agg(unique_user_id=('user_id', 'nunique'))

drop_sites_20 = list(sites.loc[sites.unique_user_id<=20].url_host.unique())

data = data.loc[~data['url_host'].isin(drop_sites_20)]

In [9]:
del sites
gc.collect()

0

In [10]:
# Подгружаются таргеты по полу
# Для каждого сайта считается число людей, число людей мужчин
# и доля от этих двух агрегатов

target = get_target(columns=['user_id','is_male'])
target['user_id'] = target['user_id'].astype('int32')
target['is_male'] = target['is_male'].astype('object')

data = data.merge(target[['is_male','user_id']], on = 'user_id', how = 'inner')
data = data.loc[~(data['is_male'].isna()) & (data['is_male'] != 'NA')]
data['is_male'] = data['is_male'].astype('int8')

df = data.groupby('url_host', as_index=False).agg({'user_id':'nunique','is_male':'sum'})
df['male_fraction'] = df['is_male'] / df['user_id']

df.loc[df.user_id!=0]

Unnamed: 0,url_host,user_id,is_male,male_fraction
0,-1,1780,1471.0,0.826404
1,0-1.ru,19,16.0,0.842105
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,2891,1688.0,0.583881
3,003ms.ru,266,118.0,0.443609
4,010203.org,49,23.0,0.469388
...,...,...,...,...
31079,zynzyn.ru,138,52.0,0.376812
31080,zz-shop.ru,17,10.0,0.588235
31081,zznaki.ru,59,15.0,0.254237
31082,zzz.fm,32,16.0,0.500000


In [11]:
df.to_csv('sites_male_fraction.csv', index=False)
gc.collect()

0

In [12]:
del data, target

### Медианы возрастов

In [13]:
data = get_data(columns=['user_id', 'url_host'])
if GET_A_PART_OF_DATA:
    data = data[0:1000000]
data.drop_duplicates(inplace=True)

In [14]:
gc.collect()

0

In [15]:
data = data.loc[~data['url_host'].isin(drop_sites_20)]
gc.collect()

0

In [16]:
target = get_target(columns=['user_id','age'])
target['user_id'] = target['user_id'].astype('int32')

# добавляем target, удаляем nan
data = data.merge(target[['age','user_id']], on = 'user_id', how = 'inner')
data = data.loc[~(data['age'].isna()) & (data['age'] > 18) & (data['age'] != 'NA')]
data['age'] = data['age'].astype('int16')

df = data.groupby('url_host', as_index=False).agg(median_age=('age', 'median'))

df.loc[~df.median_age.isna()]

Unnamed: 0,url_host,median_age
0,-1,30.0
1,0-1.ru,34.0
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,30.0
3,003ms.ru,40.0
4,010203.org,36.0
...,...,...
31079,zynzyn.ru,34.5
31080,zz-shop.ru,36.0
31081,zznaki.ru,30.5
31082,zzz.fm,36.5


In [17]:
df.to_csv('sites_age.csv', index=False)
gc.collect()

0

### Генерация признаков Rocketdata

In [18]:
data = get_data(columns=['user_id', 'url_host', 'date', 'part_of_day'])
if GET_A_PART_OF_DATA:
    data = data[0:1000000]
# data.drop_duplicates(inplace=True)

In [19]:
sites_pop = data.groupby('url_host', as_index=False)[['user_id']].nunique()
sites_pop.columns = ['url_host', 'user_id_count']
unique_users = list(data.user_id.unique())
data.part_of_day = data.part_of_day.replace({'night': 0, 'morning': 1, 'day': 2, 'evening': 3})
data.date = pd.to_datetime(data.date)
data = data.merge(sites_pop, how='left')
data

Unnamed: 0,user_id,url_host,date,part_of_day,user_id_count
0,45098,ad.adriver.ru,2022-06-15,1,291568
1,45098,apple.com,2022-06-19,1,143639
2,45098,avatars.mds.yandex.net,2022-06-12,2,382692
3,45098,googleads.g.doubleclick.net,2022-05-16,2,394562
4,45098,googleads.g.doubleclick.net,2022-05-30,2,394562
...,...,...,...,...,...
322899430,300964,avatars.mds.yandex.net,2021-07-12,1,382692
322899431,300964,googleads.g.doubleclick.net,2021-06-20,3,394562
322899432,300964,online.sberbank.ru,2021-08-05,2,315435
322899433,300964,s0.2mdn.net,2021-07-19,3,298124


In [20]:
# data = pa.Table.from_pandas(data)

In [21]:
male_fraction = pd.read_csv('sites_male_fraction.csv').fillna(0)
male_fraction = male_fraction.loc[male_fraction.male_fraction != 0]
male_fraction['male_fraction'] = male_fraction['male_fraction'] - 0.5
male_fraction = male_fraction[['url_host', 'male_fraction']]
male_fraction

Unnamed: 0,url_host,male_fraction
0,-1,0.326404
1,0-1.ru,0.342105
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,0.083881
3,003ms.ru,-0.056391
4,010203.org,-0.030612
...,...,...
31079,zynzyn.ru,-0.123188
31080,zz-shop.ru,0.088235
31081,zznaki.ru,-0.245763
31082,zzz.fm,0.000000


In [22]:
median_age = pd.read_csv('sites_age.csv').fillna(0)
median_age = median_age.loc[median_age.median_age != 0]
median_age['median_age'] = median_age['median_age'] - 18
median_age = median_age[['url_host', 'median_age']]
median_age

Unnamed: 0,url_host,median_age
0,-1,12.0
1,0-1.ru,16.0
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,12.0
3,003ms.ru,22.0
4,010203.org,18.0
...,...,...
31079,zynzyn.ru,16.5
31080,zz-shop.ru,18.0
31081,zznaki.ru,12.5
31082,zzz.fm,18.5


In [23]:
target = median_age.merge(male_fraction, how='left')
target = target.loc[~target.male_fraction.isna()]
target['target'] = target['median_age'] * target['male_fraction']
target = target[['url_host', 'target']]
target

Unnamed: 0,url_host,target
0,-1,3.916854
1,0-1.ru,5.473684
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,1.006572
3,003ms.ru,-1.240602
4,010203.org,-0.551020
...,...,...
31079,zynzyn.ru,-2.032609
31080,zz-shop.ru,1.588235
31081,zznaki.ru,-3.072034
31082,zzz.fm,0.000000


In [24]:
data = data.sort_values(by=['date', 'part_of_day', 'user_id_count'])
data = data.merge(target, how='left')
data['target'] = data['target'].fillna(0)
# data = data[['user_id', 'target']]
data

Unnamed: 0,user_id,url_host,date,part_of_day,user_id_count,target
0,271027,client.tamarix-group.ru,2021-06-16,0,1,0.000000
1,271027,tamarix-group.ru,2021-06-16,0,1,0.000000
2,100795,zahar-don.livejournal.com,2021-06-16,0,1,0.000000
3,93981,g_4.edu54.ru,2021-06-16,0,1,0.000000
4,317954,gastrogid.ru,2021-06-16,0,1,0.000000
...,...,...,...,...,...,...
322899430,406073,yandex.ru,2022-11-01,3,386405,0.265339
322899431,220333,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959
322899432,28719,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959
322899433,330810,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959


In [25]:
df_rows = pd.DataFrame(columns = list(range(500)) + ['user_id'])

groups = data.groupby("user_id")
 
for user_id, group in tqdm(groups):
    group = group[-500:].reset_index()
    row = group[['target']].T
    row['user_id'] = user_id
    df_rows = df_rows.append(row)
    
df_rows = df_rows.fillna(0)

100%|██████████| 415317/415317 [23:31:56<00:00,  4.90it/s]   


In [26]:
df_rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,user_id
target,0.112850,-0.078833,0.213959,0.213959,0.210116,0.266249,0.213959,0.112850,-0.078833,0.213959,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
target,-0.420000,0.454622,-1.226920,0.588987,0.150387,0.373888,0.306409,0.184795,-0.078833,0.328902,...,0.150387,0.237741,0.210116,0.187543,0.266249,0.291984,0.295861,0.265339,0.213959,1
target,1.229104,1.665136,1.164579,0.112850,1.107699,0.072487,0.718581,0.237741,0.214231,0.169885,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2
target,0.075151,0.123396,-0.078833,0.210116,-3.048387,-1.489691,1.146491,0.078175,0.588987,0.150387,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3
target,0.213959,-0.083196,-0.185419,0.291984,0.105767,0.248519,0.113348,0.123396,0.110235,0.105663,...,0.187543,0.266249,0.291984,0.213959,0.248519,0.112356,0.123396,0.187543,0.295861,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
target,0.213959,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,415312
target,0.295861,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,415313
target,0.210116,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,415314
target,0.265339,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,415315


In [27]:
rocket = ROCKET(n_kernels=10, random_state=42)
rocket.fit(df_rows[list(range(500))])
rocket_features = rocket.transform(df_rows[list(range(500))])
rocket_features = pd.concat([df_rows[['user_id']].reset_index(), pd.DataFrame(rocket_features)], axis=1)
rocket_features.drop(columns=['index'], axis=1, inplace=True)
rocket_features

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,4.439574,0.061983,2.320723,0.029730,9.292198,0.928,14.750319,0.076,10.496589,...,9.275704,0.102,7.858843,0.924,3.119997,0.073913,7.504944,0.952,3.744052,0.057229
1,1,9.149819,0.260331,8.279064,0.254054,14.790481,0.644,25.890186,0.336,18.198501,...,14.024299,0.372,15.050091,0.636,4.396009,0.226087,13.790840,0.718,10.333383,0.394578
2,2,2.022579,0.053719,1.646070,0.075676,6.372864,0.894,6.021307,0.118,4.985241,...,5.136109,0.166,3.557221,0.834,0.828088,0.030435,4.105526,0.876,2.458353,0.186747
3,3,3.104136,0.033058,1.209385,0.029730,6.694238,0.946,9.855819,0.096,7.140653,...,5.951345,0.112,5.386926,0.886,1.915612,0.013043,5.260706,0.946,3.355838,0.069277
4,4,5.536781,0.117769,4.579751,0.172973,9.896496,0.804,12.208422,0.202,11.420965,...,7.749922,0.282,8.092218,0.766,2.787221,0.108696,8.180556,0.830,6.700410,0.271084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415312,415312,-0.874212,0.000000,-0.602684,0.000000,1.135457,1.000,-0.147816,0.000,0.946461,...,-0.090727,0.000,0.636952,1.000,-0.932870,0.000000,1.142973,1.000,-0.358518,0.000000
415313,415313,-0.869274,0.000000,-0.602684,0.000000,1.281077,1.000,0.028443,0.002,1.139436,...,0.080100,0.002,0.774867,0.998,-0.932870,0.000000,1.228533,1.000,-0.349113,0.000000
415314,415314,-0.874444,0.000000,-0.602684,0.000000,1.128625,1.000,-0.156085,0.000,0.937408,...,-0.098741,0.000,0.630482,1.000,-0.932870,0.000000,1.138959,1.000,-0.358960,0.000000
415315,415315,-0.871114,0.000000,-0.602684,0.000000,1.226811,1.000,-0.037242,0.000,1.067522,...,0.016440,0.002,0.723472,0.998,-0.932870,0.000000,1.196648,1.000,-0.352618,0.000000


In [28]:
rocket_features.to_csv('rocket_features.csv', index=False)
gc.collect()

87672

### Генерация признаков по придуманному таргету

In [29]:
data = get_data(columns=['user_id', 'url_host'])
if GET_A_PART_OF_DATA:
    data = data[0:1000000]
data.drop_duplicates(inplace=True)

In [30]:
male_fraction = pd.read_csv('sites_male_fraction.csv').fillna(0)
male_fraction = male_fraction.loc[male_fraction.male_fraction != 0]
male_fraction['male_fraction'] = male_fraction['male_fraction'] - 0.5
male_fraction = male_fraction[['url_host', 'male_fraction']]
male_fraction

Unnamed: 0,url_host,male_fraction
0,-1,0.326404
1,0-1.ru,0.342105
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,0.083881
3,003ms.ru,-0.056391
4,010203.org,-0.030612
...,...,...
31079,zynzyn.ru,-0.123188
31080,zz-shop.ru,0.088235
31081,zznaki.ru,-0.245763
31082,zzz.fm,0.000000


In [31]:
median_age = pd.read_csv('sites_age.csv').fillna(0)
median_age = median_age.loc[median_age.median_age != 0]
median_age['median_age'] = median_age['median_age'] - 18
median_age = median_age[['url_host', 'median_age']]
median_age

Unnamed: 0,url_host,median_age
0,-1,12.0
1,0-1.ru,16.0
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,12.0
3,003ms.ru,22.0
4,010203.org,18.0
...,...,...
31079,zynzyn.ru,16.5
31080,zz-shop.ru,18.0
31081,zznaki.ru,12.5
31082,zzz.fm,18.5


In [32]:
target = median_age.merge(male_fraction, how='left')
target = target.loc[~target.male_fraction.isna()]
target['target'] = target['median_age'] * target['male_fraction']
target = target[['url_host', 'target']]
target

Unnamed: 0,url_host,target
0,-1,3.916854
1,0-1.ru,5.473684
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,1.006572
3,003ms.ru,-1.240602
4,010203.org,-0.551020
...,...,...
31079,zynzyn.ru,-2.032609
31080,zz-shop.ru,1.588235
31081,zznaki.ru,-3.072034
31082,zzz.fm,0.000000


In [33]:
data = data.merge(target, how='left')
data['target'] = data['target'].fillna(0)
# data = data[['user_id', 'target']]
data

Unnamed: 0,user_id,url_host,target
0,45098,ad.adriver.ru,0.184795
1,45098,apple.com,-0.081510
2,45098,avatars.mds.yandex.net,0.295861
3,45098,googleads.g.doubleclick.net,0.213959
4,45098,i.ytimg.com,0.291984
...,...,...,...
32277664,300964,youtube.com,0.816789
32277665,300964,biosfera.kz,-6.373206
32277666,300964,chihuahuadog-ru.turbopages.org,0.000000
32277667,300964,sun9-88.userapi.com,0.132380


In [34]:
def q10(x): return x.quantile(0.1)
def q25(x): return x.quantile(0.25)
def q75(x): return x.quantile(0.75)
def q90(x): return x.quantile(0.9)
def g1(x): return x[(x>=-10) & (x<-9)].count()
def g2(x): return x[(x>=-9) & (x<-8)].count()
def g3(x): return x[(x>=-8) & (x<-7)].count()
def g4(x): return x[(x>=-7) & (x<-6)].count()
def g5(x): return x[(x>=-6) & (x<-5)].count()
def g6(x): return x[(x>=-5) & (x<-4)].count()
def g7(x): return x[(x>=-4) & (x<-3)].count()
def g8(x): return x[(x>=-3) & (x<-2.5)].count()
def g9(x): return x[(x>=-2.5) & (x<-2)].count()
def g10(x): return x[(x>=-2) & (x<-1.5)].count()
def g11(x): return x[(x>=-1.5) & (x<-1)].count()
def g12(x): return x[(x>=-1) & (x<-0.5)].count()
def g13(x): return x[(x>=-0.5) & (x<-0.25)].count()
def g14(x): return x[(x>=-0.25) & (x<0.0)].count()
def g15(x): return x[(x>=0.0) & (x<0.25)].count()
def g16(x): return x[(x>=0.25) & (x<0.5)].count()
def g17(x): return x[(x>=0.5) & (x<1)].count()
def g18(x): return x[(x>=1) & (x<1.5)].count()
def g19(x): return x[(x>=1.5) & (x<2)].count()
def g20(x): return x[(x>=2) & (x<2.5)].count()
def g21(x): return x[(x>=2.5) & (x<3)].count()
def g22(x): return x[(x>=3) & (x<4)].count()
def g23(x): return x[(x>=4) & (x<5)].count()
def g24(x): return x[(x>=5) & (x<6)].count()
def g25(x): return x[(x>=6) & (x<7)].count()
def g26(x): return x[(x>=7) & (x<8)].count()
def g27(x): return x[(x>=8) & (x<9)].count()
def g28(x): return x[(x>=9) & (x<10)].count()

df = data.groupby('user_id', as_index=False).agg(t_max=('target', 'max'),
                                                 t_min=('target', 'min'),
                                                 t_avg=('target', 'mean'),
                                                 t_med=('target', 'median'),
                                                 t_q10=('target', q10),
                                                 t_q25=('target', q25),
                                                 t_q75=('target', q75),
                                                 t_q90=('target', q90),
                                                 t_std=('target', 'std'),
                                                 g1=('target', g1),
                                                 g2=('target', g2),
                                                 g3=('target', g3),
                                                 g4=('target', g4),
                                                 g5=('target', g5),
                                                 g6=('target', g6),
                                                 g7=('target', g7),
                                                 g8=('target', g8),
                                                 g9=('target', g9),
                                                 g10=('target', g10),
                                                 g11=('target', g11),
                                                 g12=('target', g12),
                                                 g13=('target', g13),
                                                 g14=('target', g14),
                                                 g15=('target', g15),
                                                 g16=('target', g16),
                                                 g17=('target', g17),
                                                 g18=('target', g18),
                                                 g19=('target', g19),
                                                 g20=('target', g20),
                                                 g21=('target', g21),
                                                 g22=('target', g22),
                                                 g23=('target', g23),
                                                 g24=('target', g24),
                                                 g25=('target', g25),
                                                 g26=('target', g26),
                                                 g27=('target', g27),
                                                 g28=('target', g28))

df['t_scope'] = (df['t_max'] + 100) - (df['t_min'] + 100)
df

Unnamed: 0,user_id,t_max,t_min,t_avg,t_med,t_q10,t_q25,t_q75,t_q90,t_std,...,g20,g21,g22,g23,g24,g25,g26,g27,g28,t_scope
0,0,0.971487,-5.448161,-0.279598,0.119349,-1.433696,-0.370504,0.265567,0.371307,1.099720,...,0,0,0,0,0,0,0,0,0,6.419647
1,1,5.164960,-8.875740,0.055350,0.200524,-1.763382,-0.597185,0.950591,1.927980,1.777170,...,9,3,0,0,1,0,0,0,0,14.040699
2,2,1.666667,-2.308666,0.246447,0.210116,-0.234870,0.108001,0.317656,1.164579,0.672931,...,0,0,0,0,0,0,0,0,0,3.975333
3,3,2.833333,-3.853880,0.098709,0.169885,-0.415677,0.108001,0.279117,0.891024,1.003197,...,0,1,0,0,0,0,0,0,0,6.687213
4,4,2.971429,-4.940000,-0.366893,0.105715,-2.373827,-0.510355,0.292954,0.624149,1.355972,...,0,2,0,0,0,0,0,0,0,7.911429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415312,415312,0.213959,0.213959,0.213959,0.213959,0.213959,0.213959,0.213959,0.213959,,...,0,0,0,0,0,0,0,0,0,0.000000
415313,415313,0.295861,0.295861,0.295861,0.295861,0.295861,0.295861,0.295861,0.295861,,...,0,0,0,0,0,0,0,0,0,0.000000
415314,415314,0.210116,0.210116,0.210116,0.210116,0.210116,0.210116,0.210116,0.210116,,...,0,0,0,0,0,0,0,0,0,0.000000
415315,415315,0.265339,0.265339,0.265339,0.265339,0.265339,0.265339,0.265339,0.265339,,...,0,0,0,0,0,0,0,0,0,0.000000


In [35]:
df.to_csv('pseudo_target_features.csv', index=False)
gc.collect()

0

### Генерация признаков TSFRESH

In [11]:
data = get_data(columns=['user_id', 'url_host', 'date', 'part_of_day'])
if GET_A_PART_OF_DATA:
    data = data[0:1000000]
# data.drop_duplicates(inplace=True)

In [12]:
sites_pop = data.groupby('url_host', as_index=False)[['user_id']].nunique()
sites_pop.columns = ['url_host', 'user_id_count']
unique_users = list(data.user_id.unique())
data.part_of_day = data.part_of_day.replace({'night': 0, 'morning': 1, 'day': 2, 'evening': 3})
data.date = pd.to_datetime(data.date)
data = data.merge(sites_pop, how='left')
data

Unnamed: 0,user_id,url_host,date,part_of_day,user_id_count
0,45098,ad.adriver.ru,2022-06-15,1,291568
1,45098,apple.com,2022-06-19,1,143639
2,45098,avatars.mds.yandex.net,2022-06-12,2,382692
3,45098,googleads.g.doubleclick.net,2022-05-16,2,394562
4,45098,googleads.g.doubleclick.net,2022-05-30,2,394562
...,...,...,...,...,...
322899430,300964,avatars.mds.yandex.net,2021-07-12,1,382692
322899431,300964,googleads.g.doubleclick.net,2021-06-20,3,394562
322899432,300964,online.sberbank.ru,2021-08-05,2,315435
322899433,300964,s0.2mdn.net,2021-07-19,3,298124


In [13]:
male_fraction = pd.read_csv('sites_male_fraction.csv').fillna(0)
male_fraction = male_fraction.loc[male_fraction.male_fraction != 0]
male_fraction['male_fraction'] = male_fraction['male_fraction'] - 0.5
male_fraction = male_fraction[['url_host', 'male_fraction']]
male_fraction

Unnamed: 0,url_host,male_fraction
0,-1,0.326404
1,0-1.ru,0.342105
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,0.083881
3,003ms.ru,-0.056391
4,010203.org,-0.030612
...,...,...
31079,zynzyn.ru,-0.123188
31080,zz-shop.ru,0.088235
31081,zznaki.ru,-0.245763
31082,zzz.fm,0.000000


In [14]:
median_age = pd.read_csv('sites_age.csv').fillna(0)
median_age = median_age.loc[median_age.median_age != 0]
median_age['median_age'] = median_age['median_age'] - 18
median_age = median_age[['url_host', 'median_age']]
median_age

Unnamed: 0,url_host,median_age
0,-1,12.0
1,0-1.ru,16.0
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,12.0
3,003ms.ru,22.0
4,010203.org,18.0
...,...,...
31079,zynzyn.ru,16.5
31080,zz-shop.ru,18.0
31081,zznaki.ru,12.5
31082,zzz.fm,18.5


In [15]:
target = median_age.merge(male_fraction, how='left')
target = target.loc[~target.male_fraction.isna()]
target['target'] = target['median_age'] * target['male_fraction']
target = target[['url_host', 'target']]
target

Unnamed: 0,url_host,target
0,-1,3.916854
1,0-1.ru,5.473684
2,0-hi--tech-mail-ru-0.cdn.ampproject.org,1.006572
3,003ms.ru,-1.240602
4,010203.org,-0.551020
...,...,...
31079,zynzyn.ru,-2.032609
31080,zz-shop.ru,1.588235
31081,zznaki.ru,-3.072034
31082,zzz.fm,0.000000


In [16]:
data = data.sort_values(by=['date', 'part_of_day', 'user_id_count'])
data = data.merge(target, how='left')
data['target'] = data['target'].fillna(0)
data.reset_index(inplace=True)
data['time'] = data.index
data

Unnamed: 0,index,user_id,url_host,date,part_of_day,user_id_count,target,time
0,0,271027,client.tamarix-group.ru,2021-06-16,0,1,0.000000,0
1,1,271027,tamarix-group.ru,2021-06-16,0,1,0.000000,1
2,2,100795,zahar-don.livejournal.com,2021-06-16,0,1,0.000000,2
3,3,93981,g_4.edu54.ru,2021-06-16,0,1,0.000000,3
4,4,317954,gastrogid.ru,2021-06-16,0,1,0.000000,4
...,...,...,...,...,...,...,...,...
322899430,322899430,406073,yandex.ru,2022-11-01,3,386405,0.265339,322899430
322899431,322899431,220333,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959,322899431
322899432,322899432,28719,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959,322899432
322899433,322899433,330810,googleads.g.doubleclick.net,2022-11-01,3,394562,0.213959,322899433


In [17]:
# %%time
# data_short = pd.DataFrame(columns = ['user_id', 'target', 'time'])

# groups = data[['user_id', 'target', 'time']].groupby("user_id")
 
# for user_id, group in tqdm(groups):
#     group = group[-500:].reset_index(drop=True)
#     data_short = data_short.append(group)
    
# data_short

In [18]:
from parallel_pandas import ParallelPandas
from tqdm import tqdm
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [19]:
groups = data[['user_id', 'target', 'time']].groupby("user_id")

In [20]:
data_short = groups.parallel_apply(lambda x: x[-500:].reset_index(drop=True)).reset_index(drop=True)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=51915), Label(value='0 / 51915')))…

In [21]:
data_short

Unnamed: 0,user_id,target,time
0,0,0.112850,2942626
1,0,-0.078833,3978175
2,0,0.213959,7055864
3,0,0.213959,11326483
4,0,0.210116,12263440
...,...,...,...
132374090,415312,0.213959,118959401
132374091,415313,0.295861,56272278
132374092,415314,0.210116,25273099
132374093,415315,0.265339,19897133


In [22]:
del groups, data, median_age

In [23]:
features = extract_features(data_short, column_id="user_id", column_sort="time")

Feature Extraction: 100%|██████████| 40/40 [5:23:59<00:00, 485.99s/it]   


In [24]:
features

Unnamed: 0,target__variance_larger_than_standard_deviation,target__has_duplicate_max,target__has_duplicate_min,target__has_duplicate,target__sum_values,target__abs_energy,target__mean_abs_change,target__mean_change,target__mean_second_derivative_central,target__median,...,target__fourier_entropy__bins_5,target__fourier_entropy__bins_10,target__fourier_entropy__bins_100,target__permutation_entropy__dimension_3__tau_1,target__permutation_entropy__dimension_4__tau_1,target__permutation_entropy__dimension_5__tau_1,target__permutation_entropy__dimension_6__tau_1,target__permutation_entropy__dimension_7__tau_1,target__query_similarity_count__query_None__threshold_0.0,target__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,1.0,-8.674354,91.912711,0.527158,0.001378,0.002180,0.213959,...,0.829469,1.465092,3.304688,1.754357,2.975926,4.000702,4.481295,4.668863,,2.984903
1,0.0,0.0,0.0,1.0,122.806091,523.555406,0.600189,0.001270,-0.000930,0.266249,...,1.313138,1.966985,3.917361,1.778925,3.093842,4.404232,5.420570,5.906057,,5.285522
2,0.0,0.0,0.0,1.0,81.268628,58.749522,0.225181,-0.002860,-0.000726,0.213959,...,1.253945,1.905160,3.928769,1.770611,3.106697,4.553892,5.496305,5.790599,,1.585425
3,0.0,0.0,0.0,1.0,35.483557,54.142591,0.304535,0.000742,-0.000268,0.210116,...,1.172143,1.816700,3.676463,1.773843,3.073828,4.291811,4.894031,5.109728,,2.122177
4,0.0,0.0,0.0,1.0,28.849067,245.480834,0.335540,0.000164,0.000407,0.210116,...,0.782108,1.459794,3.430840,1.773833,3.068489,4.385781,5.377152,5.792445,,4.102786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415312,0.0,0.0,0.0,0.0,0.213959,0.045778,,,,0.213959,...,,,,,,,,,,
415313,0.0,0.0,0.0,0.0,0.295861,0.087534,,,,0.295861,...,,,,,,,,,,
415314,0.0,0.0,0.0,0.0,0.210116,0.044149,,,,0.210116,...,,,,,,,,,,
415315,0.0,0.0,0.0,0.0,0.265339,0.070405,,,,0.265339,...,,,,,,,,,,


In [25]:
features['user_id'] = features.index
tmp = features.copy()

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

target = get_target(columns=['user_id','age'])
target['user_id'] = target['user_id'].astype('int32')

# добавляем target, удаляем nan
tmp = tmp.merge(target[['age','user_id']], on = 'user_id', how = 'inner')
tmp = tmp.loc[~(tmp['age'].isna()) & (tmp['age'] > 18) & (tmp['age'] != 'NA')]
tmp['age'] = tmp['age'].astype('int16')

tmp['age'] = tmp['age'].map(age_bucket)

In [26]:
# from tsfresh.utilities.dataframe_functions import impute
# impute(features)

from tsfresh import select_features
tmp = tmp.fillna(0)
filtered_features = select_features(tmp[[el for el in tmp.columns if el not in ['age', 'user_id']]], tmp['age'])

In [27]:
filtered_features

Unnamed: 0,target__quantile__q_0.3,target__count_below_mean,target__approximate_entropy__m_2__r_0.1,target__quantile__q_0.2,target__benford_correlation,target__quantile__q_0.4,target__quantile__q_0.6,"target__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4",target__number_cwt_peaks__n_5,target__median,...,target__ratio_beyond_r_sigma__r_2.5,"target__linear_trend__attr_""pvalue""",target__autocorrelation__lag_7,target__autocorrelation__lag_9,target__partial_autocorrelation__lag_8,target__partial_autocorrelation__lag_7,target__autocorrelation__lag_8,"target__fft_coefficient__attr_""imag""__coeff_50","target__fft_coefficient__attr_""imag""__coeff_12","target__fft_coefficient__attr_""imag""__coeff_42"
0,0.112850,32.0,0.577442,-0.078833,0.654650,0.169885,0.213959,-0.015529,10.0,0.213959,...,0.045802,0.329265,0.031420,-0.040978,-0.060959,0.018066,-0.037179,9.083020,-9.407583,2.622689
1,0.213959,200.0,0.784973,0.169885,0.606113,0.254300,0.295861,-0.074680,31.0,0.266249,...,0.042000,0.817310,-0.057020,-0.010856,0.018803,-0.047417,-0.005880,0.375528,-20.643940,-12.145269
2,0.187543,192.0,1.081941,0.112850,0.657550,0.210116,0.248519,0.017080,24.0,0.213959,...,0.039326,0.801496,-0.092795,0.002957,0.105403,-0.094473,0.081623,5.361471,-0.539229,7.642261
3,0.118156,83.0,0.967729,0.110235,0.791084,0.187543,0.229029,0.030856,15.0,0.210116,...,0.026596,0.015810,0.059412,-0.021544,-0.117009,0.057149,-0.107966,7.514452,1.004325,-4.074736
4,0.122845,87.0,0.814251,0.105663,0.835713,0.184795,0.229029,-0.004406,28.0,0.210116,...,0.040000,0.956372,-0.019732,-0.016955,-0.035140,0.017820,-0.054507,13.085436,7.130045,4.226449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269995,0.265339,0.0,0.000000,0.265339,0.295657,0.265339,0.265339,0.000000,0.0,0.265339,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
269996,0.265339,0.0,0.000000,0.265339,0.295657,0.265339,0.265339,0.000000,0.0,0.265339,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
269997,-0.562510,0.0,0.000000,-0.562510,-0.145280,-0.562510,-0.562510,0.000000,0.0,-0.562510,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
269998,0.213959,0.0,0.000000,0.213959,0.295657,0.213959,0.213959,0.000000,0.0,0.213959,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [28]:
filtered_features.columns

Index(['target__quantile__q_0.3', 'target__count_below_mean',
       'target__approximate_entropy__m_2__r_0.1', 'target__quantile__q_0.2',
       'target__benford_correlation', 'target__quantile__q_0.4',
       'target__quantile__q_0.6',
       'target__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.4',
       'target__number_cwt_peaks__n_5', 'target__median',
       ...
       'target__ratio_beyond_r_sigma__r_2.5',
       'target__linear_trend__attr_"pvalue"', 'target__autocorrelation__lag_7',
       'target__autocorrelation__lag_9',
       'target__partial_autocorrelation__lag_8',
       'target__partial_autocorrelation__lag_7',
       'target__autocorrelation__lag_8',
       'target__fft_coefficient__attr_"imag"__coeff_50',
       'target__fft_coefficient__attr_"imag"__coeff_12',
       'target__fft_coefficient__attr_"imag"__coeff_42'],
      dtype='object', length=473)

In [29]:
features[['user_id'] + list(filtered_features.columns)]

Unnamed: 0,user_id,target__quantile__q_0.3,target__count_below_mean,target__approximate_entropy__m_2__r_0.1,target__quantile__q_0.2,target__benford_correlation,target__quantile__q_0.4,target__quantile__q_0.6,"target__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4",target__number_cwt_peaks__n_5,...,target__ratio_beyond_r_sigma__r_2.5,"target__linear_trend__attr_""pvalue""",target__autocorrelation__lag_7,target__autocorrelation__lag_9,target__partial_autocorrelation__lag_8,target__partial_autocorrelation__lag_7,target__autocorrelation__lag_8,"target__fft_coefficient__attr_""imag""__coeff_50","target__fft_coefficient__attr_""imag""__coeff_12","target__fft_coefficient__attr_""imag""__coeff_42"
0,0,0.112850,32.0,0.577442,-0.078833,0.654650,0.169885,0.213959,-0.015529,10.0,...,0.045802,0.329265,0.031420,-0.040978,-0.060959,0.018066,-0.037179,9.083020,-9.407583,2.622689
1,1,0.213959,200.0,0.784973,0.169885,0.606113,0.254300,0.295861,-0.074680,31.0,...,0.042000,0.817310,-0.057020,-0.010856,0.018803,-0.047417,-0.005880,0.375528,-20.643940,-12.145269
2,2,0.187543,192.0,1.081941,0.112850,0.657550,0.210116,0.248519,0.017080,24.0,...,0.039326,0.801496,-0.092795,0.002957,0.105403,-0.094473,0.081623,5.361471,-0.539229,7.642261
3,3,0.118156,83.0,0.967729,0.110235,0.791084,0.187543,0.229029,0.030856,15.0,...,0.026596,0.015810,0.059412,-0.021544,-0.117009,0.057149,-0.107966,7.514452,1.004325,-4.074736
4,4,0.122845,87.0,0.814251,0.105663,0.835713,0.184795,0.229029,-0.004406,28.0,...,0.040000,0.956372,-0.019732,-0.016955,-0.035140,0.017820,-0.054507,13.085436,7.130045,4.226449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415312,415312,0.213959,0.0,0.000000,0.213959,0.295657,0.213959,0.213959,0.000000,0.0,...,0.000000,,,,,,,,,
415313,415313,0.295861,0.0,0.000000,0.295861,0.295657,0.295861,0.295861,0.000000,0.0,...,0.000000,,,,,,,,,
415314,415314,0.210116,0.0,0.000000,0.210116,0.295657,0.210116,0.210116,0.000000,0.0,...,0.000000,,,,,,,,,
415315,415315,0.265339,0.0,0.000000,0.265339,0.295657,0.265339,0.265339,0.000000,0.0,...,0.000000,,,,,,,,,


In [30]:
features[['user_id'] + list(filtered_features.columns)].to_csv('tsfresh_features.csv', index=False)
gc.collect()

0

In [34]:
features[['user_id'] + list(filtered_features.columns)].isna().sum().sort_values(ascending=False)[:20]

target__friedrich_coefficients__coeff_2__m_3__r_30    393468
target__friedrich_coefficients__coeff_3__m_3__r_30    393468
target__max_langevin_fixed_point__m_3__r_30           393468
target__friedrich_coefficients__coeff_0__m_3__r_30    393468
target__fft_coefficient__attr_"abs"__coeff_99         142617
target__fft_coefficient__attr_"abs"__coeff_98         141863
target__fft_coefficient__attr_"abs"__coeff_97         141138
target__fft_coefficient__attr_"imag"__coeff_96        140395
target__fft_coefficient__attr_"abs"__coeff_96         140395
target__fft_coefficient__attr_"abs"__coeff_95         139649
target__fft_coefficient__attr_"abs"__coeff_94         138907
target__fft_coefficient__attr_"abs"__coeff_93         138161
target__fft_coefficient__attr_"abs"__coeff_92         137384
target__fft_coefficient__attr_"abs"__coeff_91         136628
target__fft_coefficient__attr_"abs"__coeff_90         135817
target__fft_coefficient__attr_"abs"__coeff_89         135031
target__fft_coefficient_