In [None]:
import numpy as np
import pandas as pd
import gc

# Read the train and test CSVs, order each by its "time" column, and
# renumber the rows 0..n-1 so row labels match row positions.
train_csv = "../input/remove-trends-giba-explained/train_clean_giba.csv"
test_csv = "../input/remove-trends-giba-explained/test_clean_giba.csv"

df = pd.read_csv(train_csv)
df = df.sort_values("time")
df = df.reset_index(drop=True)

test_df = pd.read_csv(test_csv)
test_df = test_df.sort_values("time")
test_df = test_df.reset_index(drop=True)

In [None]:
import multiprocessing
# Show how many CPUs the runtime reports.
n_cpus = multiprocessing.cpu_count()
n_cpus

In [None]:
# Bucket rows by position: "group" advances every 500,000 rows and "mg"
# every 100,000 rows.
row_pos = np.arange(len(df))
df["group"] = row_pos // 500_000
df["mg"] = row_pos // 100_000

# Number of rows that landed in each "group" bucket.
df["group"].value_counts()

In [None]:
# Every row starts in category 0; selected row ranges are then switched to 1.
for frame in (df, test_df):
    frame["category"] = 0

# train segments with more than 9 open channels classes
# (.loc slicing is inclusive on both ends, hence the "- 1")
for start, stop in ((2_000_000, 2_500_000), (4_500_000, 5_000_000)):
    df.loc[start:stop - 1, "category"] = 1

# test segments with more than 9 open channels classes (potentially)
for start, stop in ((500_000, 600_000), (700_000, 800_000)):
    test_df.loc[start:stop - 1, "category"] = 1

In [None]:
TARGET = "open_channels"

# Synthesize an extra training group (id 10, category 1) by superimposing two
# existing groups: start from a copy of group 5 and add group 8's signal and
# target to it row-by-row (positionally).
aug_df = df[df["group"] == 5].copy()
aug_df["category"] = 1
aug_df["group"] = 10

# Filter the donor group once instead of once per column.
donor_df = df[df["group"] == 8]
if len(donor_df) != len(aug_df):
    raise ValueError(
        "groups 5 and 8 must have the same number of rows to be summed "
        "(got {} and {})".format(len(aug_df), len(donor_df))
    )

for col in ["signal", TARGET]:
    # .values strips the index so the addition is positional, not label-aligned.
    aug_df[col] += donor_df[col].values

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True renumbers the rows so the appended block does not reuse
# group 5's index labels.
df = pd.concat([df, aug_df], sort=False, ignore_index=True)

In [None]:
# Per-group mean and standard deviation of the signal. A list (not a set) is
# passed so the output columns always come out in the order mean, std.
df.groupby("group")["signal"].agg(["mean", "std"])

In [None]:
NUM_SHIFT = 20

# NOTE(review): "signal" is listed twice, so the feature matrix carries the raw
# signal in two columns. Kept as-is because it affects the fitted model;
# confirm whether the duplication is intentional.
features = ["signal", "signal"]

# Feature names: shift_i is the signal i rows earlier, shift_-i is i rows later.
for i in range(1, NUM_SHIFT + 1):
    features.append("shift_{}".format(i))
    features.append("shift_{}".format(-i))

for data in [df, test_df]:
    # Shift inside each "group" when the frame has that column, so lag/lead
    # values never come from a neighbouring group (in particular the synthetic
    # group 10 that is concatenated after the real data). A frame without a
    # "group" column falls back to a whole-frame shift.
    # TODO(review): test_df gets no "group" column in this notebook; add one if
    # its segment boundaries should be respected too.
    if "group" in data.columns:
        signal = data.groupby("group")["signal"]
    else:
        signal = data["signal"]

    for i in range(1, NUM_SHIFT + 1):
        # Positions with no neighbour (start/end of a group or frame) get -3.
        data["shift_{}".format(i)] = signal.shift(i).fillna(-3)
        data["shift_{}".format(-i)] = signal.shift(-i).fillna(-3)

# Display the test frame explicitly rather than via the leftover loop variable.
test_df.head()

In [None]:
#         model = RandomForestClassifier(
#                 n_estimators=40,
#                 max_samples=0.5,
#                 max_depth=17,
#                 max_features=10,
#                 min_samples_leaf=10,
#                 random_state=42,
#                 n_jobs=-1,
#                 verbose=1
#             )

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

NUM_FOLDS = 5
# Folds are stratified on "group" so every fold sees rows from every group.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Per-class probability tables; column k holds the probability of label k.
oof_preds = np.zeros((len(df), 11))
y_test = np.zeros((len(test_df), 11))

# Row positions of each category in the test set do not change between folds.
test_rows_by_cat = {
    cat: np.flatnonzero(test_df["category"].values == cat) for cat in range(2)
}

for fold, (train_ind, val_ind) in enumerate(skf.split(df, df["group"])):
    train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
    print(fold, len(train_df), len(val_df))

    # One model per category, trained and applied only on rows of that category.
    for cat in range(2):
        fit_df = train_df[train_df["category"] == cat]

        # Encode the labels present in this subset as 0..n_classes-1 and
        # remember which original label each code stands for. This replaces
        # the former `y[0] = 0` hack, which overwrote a real label (and wrote
        # into the array returned by `.values`) to force class 0 to exist.
        classes, y = np.unique(fit_df[TARGET].to_numpy(), return_inverse=True)
        classes = classes.astype(int)
        if classes.min() < 0 or classes.max() >= oof_preds.shape[1]:
            raise ValueError(
                "labels {} do not fit the {} prediction columns".format(
                    classes.tolist(), oof_preds.shape[1]
                )
            )

        model = xgb.XGBRFClassifier(
            n_estimators=40,
            learning_rate=1,
            subsample=0.50,
            colsample_bynode=0.25,
            reg_lambda=1e-05,
            # Only predict_proba is used below, so ask for probabilities.
            objective='multi:softprob',
            num_class=len(classes),
            max_depth=17,
            num_parallel_tree=1,
            tree_method='gpu_hist',
            n_jobs=2,
            verbosity=0,
            predictor='gpu_predictor',
        ).fit(fit_df[features].values, y)

        # Out-of-fold predictions: write each probability column to the
        # column of the original label it belongs to.
        val_mask = val_df["category"].values == cat
        pred = model.predict_proba(val_df[val_mask][features].values)
        oof_preds[np.ix_(val_ind[val_mask], classes)] = pred

        # Test predictions are averaged over the folds.
        test_rows = test_rows_by_cat[cat]
        test_pred = model.predict_proba(test_df.iloc[test_rows][features].values)
        y_test[np.ix_(test_rows, classes)] += test_pred / NUM_FOLDS

        # Drop the model before the next fit to release its memory.
        del model; _ = gc.collect()
    

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the out-of-fold predictions: the predicted label of a
# row is the column with the highest stored probability.
oof_labels = np.argmax(oof_preds, axis=1)
f1_score(df["open_channels"], oof_labels, average="macro")

In [None]:
# Hard label for each test row = column with the highest accumulated score.
test_df[TARGET] = np.argmax(y_test, axis=1)

# Count of each predicted label in the first 600,000 test rows, divided by
# 600,000.
head_rows = 600_000
test_df[TARGET].iloc[:head_rows].value_counts() / head_rows

In [None]:
# Persist the raw probability tables (out-of-fold and test).
for npy_name, table in (("y_oof.npy", oof_preds), ("y_test.npy", y_test)):
    np.save(npy_name, table)

# Write the submission: only the "time" and target columns, no index, floats
# formatted to 4 decimals.
submission = test_df[["time", TARGET]]
submission.to_csv('submission.csv', index=False, float_format='%.4f')