In [1]:
import gc
import sys
import os
import warnings
from tqdm import tqdm
import seaborn as sns    

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np


import sys

sys.path.append(os.path.abspath("modules/"))
sys.path.append(os.path.abspath("fe_modules/"))
sys.path.append(os.path.abspath("seq2seq_modules/"))

import modules
import importlib
importlib.reload(modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat

In [2]:
# Root folder of the input data; the parquet path in the loading cell is built from it.
LOCAL_DATA_PATH = './data/'
# Presumably the seed for reproducible data splits (inferred from the name) —
# it is not referenced in the visible cells; TODO confirm it is still needed.
SPLIT_SEED = 42

In [3]:
def get_agg_amount_of_travel(df: pd.DataFrame,
                agg_col: str = "user_id",
                target_col: str = 'city_name',
                timestamp_col: str = 'timestamp',
                alias: str = None,
                sort: bool = False) -> pd.DataFrame:
    """Attach a per-group travel aggregate to every row of ``df``.

    The rows are ordered by ``timestamp_col`` and grouped by ``agg_col``; the
    ``target_col`` values of each group are then reduced with ``get_travel``.
    ``get_travel`` is not defined in this function: it is looked up in the
    notebook namespace when the function is called and must exist by then.
    The one-value-per-group result is left-joined back onto ``df``.

    Args:
        df: Input frame containing ``agg_col``, ``target_col`` and
            ``timestamp_col``.
        agg_col: Grouping key; also the join key for the merge back.
        target_col: Column whose time-ordered values are handed to
            ``get_travel``.
        timestamp_col: Column used to order the rows before grouping.
        alias: Name of the new column. When empty or ``None`` the name
            ``f'{agg_col}_amount_of_travel'`` is used.
        sort: When true, the result is additionally sorted by ``agg_col``.

    Returns:
        A new frame with every original row plus the aggregate column.
    """
    col_name = alias if alias else f'{agg_col}_amount_of_travel'

    # Sort only the columns that take part in the aggregation. Sorting the
    # complete frame copies every column, which is wasteful on a multi-GB
    # frame and does not change the per-group ordering of `target_col`.
    # Names that are not columns (e.g. index levels) are left to pandas to
    # resolve exactly as before.
    needed = [
        name
        for name in dict.fromkeys((agg_col, target_col, timestamp_col))
        if name in df.columns
    ]
    per_group = (
        df[needed]
        .sort_values(timestamp_col)
        .groupby(agg_col)[target_col]
        # Fixed keyword + rename (rather than **{col_name: ...}) so an alias can
        # never collide with one of agg()'s own keyword parameters.
        .agg(amount_of_travel=get_travel)
        .rename(columns={'amount_of_travel': col_name})
    )

    df = df.merge(per_group, how='left', on=agg_col)
    if sort:
        return df.sort_values(by=agg_col)

    return df

In [4]:
from datetime_features import get_timestamp
from geopy.distance import geodesic as GD
from geo_features import map_cities

In [5]:
# Read one parquet part file of the competition data and shrink its dtypes right away.
df = pd.read_parquet(
    f'{LOCAL_DATA_PATH}competition_data_final_pqt/part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet'
)
df = pandas_reduce_mem_usage(df)

# Enrichment helpers imported from geo_features / datetime_features.
# NOTE(review): presumably map_cities adds the geo_lat / geo_lon columns used further
# down and get_timestamp derives `timestamp` from `date` — confirm in those modules.
df = map_cities(df, folder_path="external_data/")
df = get_timestamp(df, date_col='date')

# Second memory pass, now covering the columns added by the two helpers.
df = pandas_reduce_mem_usage(df)

Memory usage of dataframe is 2988.16 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 2521.26 MB
Decreased by 15.6%
Memory usage of dataframe is 0.05 MB


  0%|          | 0/6 [00:00<?, ?it/s]

Memory usage after optimization is: 0.03 MB
Decreased by 47.8%
Memory usage of dataframe is 3614.32 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 3487.51 MB
Decreased by 3.5%


In [6]:
import geopandas

In [8]:
def get_dist(points_df):
    """Return the mean step length along one group's sequence of points.

    Consecutive repeats of the same point are removed first, then the distance
    from each remaining point to the one before it is computed. The first point
    has no predecessor; its distance is filled with 0 and is included in the mean.

    NOTE(review): the series is tagged as EPSG:4326 and ``distance`` is planar,
    so the value is expressed in degrees rather than metres/kilometres — confirm
    this is what the downstream feature expects.

    Args:
        points_df: Geometry series of one group, already in visiting order.

    Returns:
        Mean of the per-step distances (0-filled for the first point).
    """
    previous = points_df.shift(1)
    changed = (points_df != previous)
    path = points_df[changed].set_crs(epsg=4326)

    step_lengths = path.distance(path.shift(1))
    step_lengths = step_lengths.fillna(0)
    return step_lengths.mean()

def agg_get_dist(df, lat_col, lon_con):
    """Add each user's mean travel distance to ``df`` as ``mean_travel_distance``.

    Points are built from the latitude/longitude columns, ordered by
    ``timestamp``, grouped by ``user_id`` and reduced with ``get_dist``. The
    per-user value (float32) is left-joined back onto ``df``.

    Args:
        df: Frame with ``user_id``, ``timestamp`` and the two coordinate columns.
        lat_col: Name of the latitude column.
        lon_con: Name of the longitude column.

    Returns:
        A new frame with every original row plus ``mean_travel_distance``.
    """
    gdf = geopandas.GeoDataFrame(
        # Only the grouping key and the ordering column are needed next to the geometry.
        df[['user_id', 'timestamp']],
        # points_from_xy expects (x, y) = (longitude, latitude). The planar distance
        # used by get_dist is symmetric in x/y, so the numbers are unchanged, but the
        # geometries now match the EPSG:4326 CRS that get_dist assigns.
        geometry=geopandas.points_from_xy(df[lon_con], df[lat_col]))
    print('sorting started')
    gdf = gdf.sort_values('timestamp')
    print('agg started')
    distances = gdf.groupby('user_id', as_index=True)['geometry'].agg(get_dist).astype(np.float32)
    print('merging started')
    # `distances` is a Series: rename() with a mapping relabels index *values* and
    # leaves the name 'geometry' untouched, so the merged column was called
    # 'geometry'. A scalar argument sets the Series name, i.e. the column name.
    df = df.merge(distances.rename('mean_travel_distance'), how='left', on='user_id')
    return df

In [9]:
# Join the per-user mean travel distance onto `df`; agg_get_dist prints its three
# stages (sorting / agg / merging) as progress markers.
# NOTE(review): geo_lat / geo_lon are presumably added by map_cities — confirm.
df = agg_get_dist(df,'geo_lat','geo_lon')

sorting started
agg started
merging started


In [None]:
# Preview the first rows of the frame returned by agg_get_dist.
df.head()

In [45]:
# Leftover scratch cell: the original statement was `df.merge(` with no right-hand
# frame, which is a SyntaxError and stops a top-to-bottom run of the notebook.
# No merge is needed here — agg_get_dist already joins the per-user distances
# onto `df` on 'user_id' — so the statement is intentionally not restored.

In [40]:
# NOTE(review): in the visible cells `distances` is only bound as a local inside
# agg_get_dist, so on a fresh kernel there is no global of that name to display —
# confirm where this value is supposed to come from, or drop the cell.
distances

user_id
4         2.944318
16        0.000000
18        0.348726
26        0.000000
27        0.466315
            ...   
415251    0.000000
415266    0.000000
415273    0.000000
415277    0.000000
415305    0.000000
Name: geometry, Length: 41594, dtype: float64

In [49]:
def get_agg_aot(df: pd.DataFrame,
                agg_col: str = "user_id",
                target_col: str = 'city_name',
                timestamp_col: str = 'timestamp',
                alias: str = None,
                sort: bool = False) -> pd.DataFrame:
    """Short-named alias of ``get_agg_amount_of_travel``.

    The body used to be a line-for-line copy of ``get_agg_amount_of_travel``;
    it now forwards to that function so the aggregation lives in one place.

    Args:
        df: Input frame containing ``agg_col``, ``target_col`` and
            ``timestamp_col``.
        agg_col: Grouping key; also the join key for the merge back.
        target_col: Column whose time-ordered values are aggregated per group.
        timestamp_col: Column used to order the rows before grouping.
        alias: Name of the new column. When empty or ``None`` the name
            ``f'{agg_col}_amount_of_travel'`` is used.
        sort: When true, the result is additionally sorted by ``agg_col``.

    Returns:
        A new frame with every original row plus the aggregate column.
    """
    return get_agg_amount_of_travel(
        df,
        agg_col=agg_col,
        target_col=target_col,
        timestamp_col=timestamp_col,
        alias=alias,
        sort=sort,
    )

In [50]:
# Display-only call: the returned frame is not assigned, so `df` itself is unchanged.
# NOTE(review): get_agg_aot relies on `get_travel`, which is not defined in the
# visible cells — confirm it is defined before this cell runs on a fresh kernel.
get_agg_aot(df)

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,timestamp,amount_of_travel,user_id_amount_of_travel
0,Оренбургская область,Оренбург,Apple,iPhone XR,eda.ru,smartphone,iOS,47854.0,2021-06-16,day,1,55415,162380160.0,0,0
1,Ростовская область,Ростов-на-Дону,Apple,iPhone XR,o2.mail.ru,smartphone,iOS,51208.0,2021-06-16,day,1,389762,162380160.0,34,34
2,Московская область,Серпухов,Samsung,Galaxy S8 Dual,ria.ru,smartphone,Android,54990.0,2021-06-16,day,1,30551,162380160.0,64,64
3,Санкт-Петербург,Санкт-Петербург,Huawei,Y9 2018,sun9-29.userapi.com,smartphone,Android,13901.0,2021-06-16,day,1,153830,162380160.0,20,20
4,Санкт-Петербург,Санкт-Петербург,Huawei,Y9 2018,yastatic.net,smartphone,Android,13901.0,2021-06-16,day,2,153830,162380160.0,20,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32638704,Омская область,Омск,Xiaomi,Redmi 6,yastatic.net,smartphone,Android,11663.0,2022-11-01,morning,1,220333,166726080.0,6,6
32638705,Омская область,Омск,Xiaomi,Redmi 6,yandex.ru,smartphone,Android,11663.0,2022-11-01,evening,5,220333,166726080.0,6,6
32638706,Омская область,Омск,Xiaomi,Redmi 6,yandex.ru,smartphone,Android,11663.0,2022-11-01,day,5,220333,166726080.0,6,6
32638707,Омская область,Омск,Xiaomi,Redmi 6,i.ytimg.com,smartphone,Android,11663.0,2022-11-01,night,1,220333,166726080.0,6,6
