In [None]:
import pandas as pd
from tsfresh import extract_features, extract_relevant_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

from alfa_challenge.utils import cache_or_generate, reduce_df_size

# Folder that receives the cached intermediate and feature pickles built in this notebook.
DATA_FEATURES_FOLDER = "data/features"
# Resampling frequency alias -> readable period name; the alias is handed to
# resample(), the name is used in cache file names and as the feature-set key.
TIME_PERIOD_MAPPING = {"M": "month", "W": "week", "D": "day"}

In [None]:
# Inputs: the transaction log and the table of labelled training clients.
df_transactions = pd.read_parquet("data/df_transaction.pa")
df_train = reduce_df_size(pd.read_parquet("data/train.pa"))
df_transactions = reduce_df_size(df_transactions, drop=["merchant_name"])

# Debug switch: uncomment to run the notebook on a small subset of clients.
# df_train = df_train[df_train["client_num"] < 500]
# df_transactions = df_transactions[df_transactions["client_num"] < 500]

# Clients that have transactions but no row in the training table make up the
# test split; everything else is the train split.
clients_with_transactions = set(df_transactions["client_num"].unique().tolist())
test_clients = list(
    clients_with_transactions.difference(df_train["client_num"].to_list())
)

# Row mask computed once and reused for both halves of the split.
is_test_row = df_transactions["client_num"].isin(test_clients)

transactions = {
    "train": df_transactions[~is_test_row],
    "test": df_transactions[is_test_row],
}

Memory usage of dataframe is 1.07 MB
Memory usage after optimization is: 0.33 MB
Decreased by 68.7%
Memory usage of dataframe is 515.30 MB
Memory usage after optimization is: 309.18 MB
Decreased by 40.0%


In [None]:
# Third argument handed to cache_or_generate by the loop in this cell.
# NOTE(review): presumably False forces regeneration instead of reading the
# cached pickle — confirm against alfa_challenge.utils.cache_or_generate.
LOAD = False
# Aggregations computed on "amount" for every resampling bucket; the same names
# are later used as the value columns handed to tsfresh.
AGGS = ["sum", "std", "mean", "min", "max"]


def generate_amount_ts(df, period):
    """Aggregate each client's ``amount`` into regular time buckets.

    Args:
        df: Transactions with ``client_num``, ``date_time`` and ``amount`` columns.
        period: Resampling frequency alias passed to ``resample`` (e.g. ``"D"``).

    Returns:
        A frame with ``client_num``, ``date_time`` and one column per entry of
        ``AGGS``, passed through ``reduce_df_size``.
    """
    by_client = df.set_index("date_time").groupby("client_num")
    bucketed_amount = by_client.resample(period)["amount"].agg(AGGS)
    # Bring the (client_num, date_time) index levels back as ordinary columns.
    flat = bucketed_amount.reset_index()
    return reduce_df_size(flat)


# split -> period name -> per-client aggregated amount series (cached on disk).
trans_amt_aggs = {}

for split in transactions:
    dataset = transactions[split]
    period_frames = trans_amt_aggs.setdefault(split, {})
    for short_period in TIME_PERIOD_MAPPING:
        full_period = TIME_PERIOD_MAPPING[short_period]
        file_path = f"{DATA_FEATURES_FOLDER}/{split}_transactions_{full_period}.pkl"
        # dataset and short_period are forwarded to generate_amount_ts.
        period_frames[full_period] = cache_or_generate(
            file_path, generate_amount_ts, LOAD, dataset, short_period
        )

Not loading
Not loading
Not loading
Not loading
Not loading
Not loading


In [None]:
# tsfresh feature-calculator settings passed as default_fc_parameters to both extraction calls.
PARAMS = EfficientFCParameters()
# Forwarded as n_jobs to the tsfresh extraction calls.
N_JOBS = 16
# Rebinds the notebook-wide LOAD flag that this cell's loops hand to cache_or_generate.
LOAD = False


def extract_tsfresh_features(df, column_id, column_sort, column_value, y=None):
    """Run tsfresh on a single value column of ``df``.

    Missing values in ``df`` are replaced with 0 before extraction; the
    caller's frame is not modified.

    Args:
        df: Frame holding the id, sort and value columns.
        column_id: Name of the column identifying each series.
        column_sort: Name of the column ordering observations within a series.
        column_value: Name of the column whose values are featurised.
        y: Optional target. When given, ``extract_relevant_features`` is called
            with it; otherwise ``extract_features`` is called with ``impute``
            as its imputation function.

    Returns:
        The feature frame returned by the tsfresh call.
    """
    filled = df.fillna(0)
    print("Extracting", column_value, "y=", type(y))

    # Arguments common to both tsfresh entry points.
    shared_kwargs = dict(
        column_id=column_id,
        column_sort=column_sort,
        column_value=column_value,
        n_jobs=N_JOBS,
        default_fc_parameters=PARAMS,
    )

    if y is None:
        features = extract_features(filled, impute_function=impute, **shared_kwargs)
    else:
        features = extract_relevant_features(filled, y=y, **shared_kwargs)

    print("Extracting", column_value, "shape=", features.shape)
    return features


def extract_tsfresh_features_ts_columns(df, column_id, column_sort, ts_columns, y=None):
    """Extract tsfresh features for several value columns and join them.

    ``extract_tsfresh_features`` is called once per entry of ``ts_columns``
    (in order) and the resulting frames are concatenated column-wise.

    Args:
        df: Frame holding the id, sort and all value columns.
        column_id: Name of the column identifying each series.
        column_sort: Name of the column ordering observations within a series.
        ts_columns: Names of the value columns to featurise.
        y: Optional target forwarded unchanged to every extraction.

    Returns:
        One frame containing the features of all value columns side by side.
    """
    per_column = [
        extract_tsfresh_features(df, column_id, column_sort, value_column, y)
        for value_column in ts_columns
    ]
    combined = pd.concat(per_column, axis=1)
    print("Concat shape", combined.shape)
    return combined


# Column roles shared by every tsfresh call in this cell.
column_id = "client_num"
column_sort = "date_time"
column_value = "amount"
# The target must be a Series: the first remaining column of df_train once
# client_num has become the index.
y = df_train.set_index("client_num").iloc[:, 0]

# split -> feature-set name (period name or "full") -> tsfresh feature frame.
trans_amt_aggs_features = {}

# Features of the per-period aggregate series: one tsfresh run per AGGS column.
for split in trans_amt_aggs:
    aggs_datasets = trans_amt_aggs[split]
    trans_amt_aggs_features[split] = {}
    for full_period in aggs_datasets:
        dataset = aggs_datasets[full_period]
        file_path = (
            f"{DATA_FEATURES_FOLDER}/{split}_transactions_{full_period}_features.pkl"
        )
        # The target is supplied for the train split only.
        trans_amt_aggs_features[split][full_period] = cache_or_generate(
            file_path,
            extract_tsfresh_features_ts_columns,
            LOAD,
            df=dataset,
            column_id=column_id,
            column_sort=column_sort,
            ts_columns=AGGS,
            y=None if split != "train" else y,
        )

# Features of the raw, un-resampled amount series.
for split in transactions:
    dataset = transactions[split]
    file_path = f"{DATA_FEATURES_FOLDER}/{split}_transactions_full_features.pkl"

    trans_amt_aggs_features[split]["full"] = cache_or_generate(
        file_path,
        extract_tsfresh_features,
        LOAD,
        df=dataset,
        column_id=column_id,
        column_sort=column_sort,
        column_value=column_value,
        y=None if split != "train" else y,
    )

Not loading
Not loading
Not loading
Not loading
Not loading
Not loading
Not loading
Not loading


In [None]:
# Rebinds LOAD for this cell's cache_or_generate calls.
# NOTE(review): the upstream cells ran with LOAD = False, so a pickle read here
# may predate the freshly regenerated inputs — confirm the cache is current.
LOAD = True


def create_amount_ts_features(split_period_df_dict):
    """Merge the per-period tsfresh feature frames of one split into one frame.

    Each frame's columns are prefixed with ``ts_<period>_``, the frames are
    joined column-wise, and columns that duplicate an earlier column or hold a
    single distinct value are dropped.

    The input mapping is not modified. Prefixed copies are built locally, so
    calling this function again on the same mapping (e.g. when the cell is
    re-run) does not stack a second ``ts_<period>_`` prefix on the columns.

    Args:
        split_period_df_dict: Mapping of period name -> feature frame.

    Returns:
        The combined feature frame after ``reduce_df_size``.
    """
    prefixed_frames = [
        dataset.add_prefix(f"ts_{period}_")
        for period, dataset in split_period_df_dict.items()
    ]
    amount_ts_features = pd.concat(prefixed_frames, axis=1)

    print("Shape before", amount_ts_features.shape)
    # Transposing turns columns into rows, so drop_duplicates removes columns
    # whose values are identical on every row (the first occurrence is kept).
    amount_ts_features = amount_ts_features.T.drop_duplicates().T
    # Drop columns that hold a single distinct value.
    amount_ts_features = amount_ts_features.loc[:, amount_ts_features.nunique() > 1]
    amount_ts_features = reduce_df_size(amount_ts_features)
    print("Shape after", amount_ts_features.shape)

    return amount_ts_features


# split -> combined, de-duplicated feature frame for that split.
ts_features = {}

for split in trans_amt_aggs_features:
    datasets_features = trans_amt_aggs_features[split]
    file_path = f"{DATA_FEATURES_FOLDER}/{split}_amount_ts_features.pkl"
    ts_features[split] = cache_or_generate(
        file_path, create_amount_ts_features, LOAD, split_period_df_dict=datasets_features
    )

Loading data from data/features/train_amount_ts_features.pkl
Loading data from data/features/test_amount_ts_features.pkl


In [None]:
# The two splits can end up with different feature columns (only the train
# extraction receives a target), so both are restricted to the shared columns.
common_columns = ts_features["train"].columns.intersection(
    ts_features["test"].columns
)
train_features = ts_features["train"][common_columns]
test_features = ts_features["test"][common_columns]

print(train_features.shape, test_features.shape, sep="\n")

(70000, 4505)
(39143, 4505)


In [None]:
# Turn the client id held in the index into a regular "client_num" column.
# Naming the index first works whether or not it already carries a name (a bare
# reset_index() only emits an "index" column for an unnamed index), and running
# this cell twice raises instead of silently adding a duplicate "client_num".
train_features = train_features.rename_axis("client_num").reset_index()
test_features = test_features.rename_axis("client_num").reset_index()

In [None]:
# Stack both splits into one table. Each input carries its own 0..n-1 index, so
# a fresh index is built; otherwise row labels repeat and get written to the
# exported file.
tsfresh_features = pd.concat([train_features, test_features], ignore_index=True)
tsfresh_features

Unnamed: 0,client_num,ts_month_sum_sum__sum_values,ts_month_sum_sum__quantile__q_0.3,ts_month_sum_sum__quantile__q_0.4,ts_month_sum_sum__quantile__q_0.6,ts_month_sum_sum__quantile__q_0.7,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)","ts_month_sum_sum__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)",ts_month_sum_sum__quantile__q_0.1,"ts_month_sum_sum__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)",...,ts_full_FULL_amount__ar_coefficient__coeff_1__k_10,"ts_full_FULL_amount__cwt_coefficients__coeff_9__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__cwt_coefficients__coeff_6__w_2__widths_(2, 5, 10, 20)","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_19","ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_8",ts_full_FULL_amount__partial_autocorrelation__lag_1,ts_full_FULL_amount__autocorrelation__lag_1,"ts_full_FULL_amount__fft_coefficient__attr_""real""__coeff_16",ts_full_FULL_amount__symmetry_looking__r_0.2,"ts_full_FULL_amount__cwt_coefficients__coeff_10__w_2__widths_(2, 5, 10, 20)"
0,1,863878,223786.2,230229.6,273888.2,311103.4,3.553388e+05,250359.229482,210899.4,179891.523475,...,0.026840,64675.843368,-24414.939894,-2.650545e+04,4342.582521,0.039248,0.039248,1.394325e+05,1,36436.719427
1,2,344108,101008.2,107827.6,121499.8,128352.6,1.030089e+05,73869.259141,87369.4,53207.112289,...,0.008114,818.978370,823.623005,-6.458124e+04,35812.510880,0.001493,0.001493,1.776491e+04,1,-725.189242
2,3,1621825,69126.6,69467.8,352629.8,635450.6,9.377156e+05,600436.513367,68444.2,425395.933829,...,-0.012920,-280816.907484,369164.707969,-1.099448e+06,231174.792984,0.261533,0.261533,-1.293465e+06,1,-152611.379219
3,4,199796,56091.6,59900.8,69252.4,74794.8,8.192955e+04,58714.306632,48473.2,42287.386989,...,0.031599,-9153.776700,-4069.831964,-1.474321e+04,21025.580694,0.039427,0.039427,-5.235801e+04,1,8848.187406
4,5,67359,18945.8,19078.4,21288.8,23366.6,2.568245e+04,18251.089477,18680.6,13129.670410,...,-0.024653,-47.645553,-129.587023,-1.496116e+04,9915.134221,-0.026696,-0.026696,1.471015e+04,1,-178.094475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39138,109127,1960002,162000.0,216000.0,554000.4,838000.8,1.036464e+06,655517.237708,54000.0,463520.683968,...,0.008678,195299.048898,366507.938735,1.757362e+02,254.499047,0.479598,0.479598,2.363635e+02,1,-244150.505170
39139,109128,28357,28357.0,28357.0,28357.0,28357.0,1.739111e+04,10999.100776,28357.0,7777.538746,...,0.008678,-179.559063,-2291.657797,1.757362e+02,11163.000000,0.105864,0.105864,2.363635e+02,1,-407.724002
39140,109130,3588,767.2,852.6,1178.2,1418.4,7.826929e+02,550.341468,596.4,395.326961,...,0.008678,329.717067,-84.429569,1.757362e+02,132.000000,-0.274564,-0.274564,2.363635e+02,1,80.453643
39141,109137,3723,3723.0,3723.0,3723.0,3723.0,2.283284e+03,1444.075614,3723.0,1021.115659,...,0.008678,98.079456,16.601667,1.757362e+02,249.896781,-0.146298,-0.146298,2.363635e+02,1,52.914136


In [None]:
# Refuse to export a feature table that still contains missing values, and name
# the offending columns so the failure is actionable.
nan_counts = tsfresh_features.isna().sum()
assert nan_counts.sum() == 0, (
    f"NaN values found in columns: {nan_counts[nan_counts > 0].index.tolist()[:20]}"
)
# Written next to the other feature artefacts, using the shared folder constant.
tsfresh_features.to_parquet(f"{DATA_FEATURES_FOLDER}/tsfresh_generations.pa")