In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the competition CSVs: the train and test frames, the sample-submission
# template and the training labels file.
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
# NOTE(review): `submission` is not referenced again in the visible cells; the
# same file is re-read as `sample_submission` in the final cell.
submission = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")
labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")

# Show the raw training frame as the cell output.
train

In [None]:
# Attach the labels to every raw training row (left join on `sequence`).
# Guarded so that re-running this cell does not merge a second time, which
# would leave suffixed duplicates (state_x / state_y) instead of `state`.
# Assumes `labels` contributes a `state` column -- later cells read `.state`.
if 'state' not in train.columns:
    train = train.merge(labels, how='left', on=['sequence'])
train.head()

In [None]:
def aggregated_features(df, aggregation_cols = ['sequence'], prefix = ''):
    """Summarise the 13 sensor columns per group.

    For every group defined by ``aggregation_cols`` and every column
    ``sensor_00`` .. ``sensor_12`` the following statistics are computed:
    mean, max, min, var, mad (mean absolute deviation around the group mean),
    sum and median.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the grouping columns and ``sensor_00`` .. ``sensor_12``.
    aggregation_cols : list of str (or a single str)
        Column(s) to group by. The default list is never mutated.
    prefix : str
        Prepended to every generated column name, including ``size``.

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping columns, ``<prefix>size`` (row count),
        then ``<prefix><sensor>_<stat>`` columns ordered sensor by sensor.

    Raises
    ------
    KeyError
        If a grouping column or one of the sensor columns is missing.
    """
    # Normalise the keys into a private list so the shared default is never touched.
    if isinstance(aggregation_cols, str):
        keys = [aggregation_cols]
    else:
        keys = list(aggregation_cols)
    prefix = str(prefix)

    stats = ['mean', 'max', 'min', 'var', 'mad', 'sum', 'median']
    sensors = ['sensor_%02d' % i for i in range(13)]

    grouped = df.groupby(keys)

    # The 'mad' string aggregation no longer exists in pandas >= 2.0, so only
    # the statistics pandas still provides are requested by name here.
    builtin_stats = [stat for stat in stats if stat != 'mad']
    group = grouped[sensors].aggregate(builtin_stats)
    group.columns = ['_'.join(col) for col in group.columns]

    # Mean absolute deviation, vectorised: |x - group mean| averaged per group.
    # NaNs are skipped by both the transform and the final mean.
    deviations = (df[sensors] - grouped[sensors].transform('mean')).abs()
    mad = deviations.groupby([df[key] for key in keys]).mean()
    mad.columns = [sensor + '_mad' for sensor in mad.columns]

    # Restore the historical column order (all stats of sensor_00, then sensor_01, ...).
    ordered = [sensor + '_' + stat for sensor in sensors for stat in stats]
    group = group.join(mad)[ordered]
    group.columns = [prefix + col for col in ordered]
    group = group.reset_index()

    # Row count per group goes first, followed by the statistics.
    temp = grouped.size().reset_index(name = prefix + 'size')
    return pd.merge(temp, group, how = 'left', on = keys)

In [None]:
# Aggregate the sensor readings per (sequence, subject) pair, train first, then test.
train_merge_data, test_merge_data = (
    aggregated_features(frame, aggregation_cols = ['sequence', 'subject'])
    for frame in (train, test)
)

In [None]:
# Aggregate the sensor readings per subject; the generated columns carry a 'subject_' prefix.
train_subjects_merge_data, test_subjects_merge_data = (
    aggregated_features(frame, aggregation_cols = ['subject'], prefix = 'subject_')
    for frame in (train, test)
)

In [None]:
# Preview the per-subject aggregate features of the training set.
train_subjects_merge_data.head()

In [None]:
# Attach the labels to the aggregated training rows (left join on `sequence`).
# Guarded so that re-running this cell does not merge twice: a second merge
# would replace `state` with state_x / state_y and break the later `.state` reads.
if 'state' not in train_merge_data.columns:
    train_merge_data = train_merge_data.merge(labels, how = 'left', on = 'sequence')

In [None]:
# Attach the per-subject aggregates to every sequence-level row (left join on `subject`).
# `subject_size` is always produced by the 'subject_'-prefixed aggregation, so its
# presence means the merge has already happened; skipping it keeps the cell
# re-runnable without creating _x / _y duplicate columns.
if 'subject_size' not in train_merge_data.columns:
    train_merge_data = train_merge_data.merge(train_subjects_merge_data, how = 'left', on = 'subject')
if 'subject_size' not in test_merge_data.columns:
    test_merge_data = test_merge_data.merge(test_subjects_merge_data, how = 'left', on = 'subject')
train_merge_data.head()

In [None]:
# Preview the assembled test feature frame.
test_merge_data.head()

In [None]:
from sklearn import model_selection
from xgboost import XGBClassifier
from sklearn.metrics import auc, roc_curve, accuracy_score, roc_auc_score
import optuna

In [None]:
# Assign every aggregated training row to one of 5 shuffled cross-validation folds.
# NOTE(review): plain KFold splits rows at random, so rows of the same subject can
# land in both the training and validation part of a fold -- confirm this is intended.
train_merge_data["kfold"] = -1
kfold_position = train_merge_data.columns.get_loc("kfold")
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=train_merge_data)):
    # KFold yields positional row numbers, so write by position (.iloc) instead of
    # by label (.loc); the two only coincide when the index is a default RangeIndex.
    train_merge_data.iloc[valid_indicies, kfold_position] = fold

In [None]:
# Every column except the identifiers, the label and the fold id is a model input.
non_feature_cols = {"sequence", "subject", "state", "kfold"}
useful_features = [
    column
    for column in train_merge_data.columns
    if column not in non_feature_cols
]
# Test matrix restricted to the same columns, in the same order.
df_test = test_merge_data.loc[:, useful_features]

In [None]:
# Take a copy of the aggregated training frame; the modelling cells below read from `df`.
df = train_merge_data.copy()

In [None]:
# Baseline: train one XGBoost model per CV fold and collect each fold's
# test-set predictions so they can be averaged into a submission later.
final_predictions = []
xtest = df_test.copy()  # identical for every fold, so copied once
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.state
    yvalid = xvalid.state

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBClassifier(n_estimators=6000,random_state=42, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor", eval_metric='auc')
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    # Hard 0/1 labels are only meaningful for accuracy.
    preds_valid = model.predict(xvalid)
    # AUC is a ranking metric, so it and the submission need the positive-class
    # probability rather than thresholded labels.
    # Assumes `state` is a binary 0/1 target, so column 1 is the positive class.
    proba_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_predictions.append(test_preds)
    print(fold, accuracy_score(yvalid, preds_valid), roc_auc_score(yvalid, proba_valid))

In [None]:
# Validation AUC of the model left over from the last fold of the loop above.
# Scored on positive-class probabilities: feeding hard 0/1 predictions to
# roc_auc_score collapses the ROC curve to a single threshold and understates the AUC.
roc_auc_score(yvalid, model.predict_proba(xvalid)[:, 1])

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
def run(trial, fold=0):
    """Optuna objective: validation AUC of one XGBoost model on a single fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyperparameter suggestions.
    fold : int, default 0
        Value of ``df.kfold`` held out for validation; all other rows are
        used for training. The default matches the previously hard-coded fold.

    Returns
    -------
    float
        ROC AUC on the held-out fold, computed from predicted probabilities.

    Reads the module-level ``df`` and ``useful_features``.
    """
    # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    # `step` is passed by keyword; newer Optuna releases reject the positional form.
    n_estimators = trial.suggest_int("n_estimators", 10000, 100000, step=10000)

    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_part.state
    yvalid = valid_part.state

    xtrain = train_part[useful_features]
    xvalid = valid_part[useful_features]

    model = XGBClassifier(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=0,  # same device as the other training cells (was 1)
        predictor="gpu_predictor",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        eval_metric='auc',
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    # Score on positive-class probabilities so the objective is a real AUC rather
    # than the AUC of thresholded labels.
    # Assumes `state` is a binary 0/1 target, so column 1 is the positive class.
    proba_valid = model.predict_proba(xvalid)[:, 1]
    return roc_auc_score(yvalid, proba_valid)

In [None]:
# Create a study that maximizes the value returned by run() and evaluate 5 trials.
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=5)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Final models: train one tuned XGBoost model per CV fold and collect each
# fold's test-set predictions for averaging into the submission.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# Optuna run; a fresh `study.best_params` may differ.
params = {'learning_rate': 0.021003543474738977,
          'reg_lambda': 0.00027064040459653943,
          'reg_alpha': 0.0013907328901432419,
          'subsample': 0.6630394352593036,
          'colsample_bytree': 0.6406562669622201,
          'max_depth': 6,
          'n_estimators': 20000}

final_predictions = []
xtest = df_test.copy()  # identical for every fold, so copied once
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.state
    yvalid = xvalid.state

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBClassifier(random_state=42, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor", eval_metric='auc', **params)
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    # Hard 0/1 labels are only meaningful for accuracy.
    preds_valid = model.predict(xvalid)
    # AUC and the submission need the positive-class probability; averaging
    # thresholded labels across folds would throw away the ranking information.
    # Assumes `state` is a binary 0/1 target, so column 1 is the positive class.
    proba_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_predictions.append(test_preds)
    print(fold, accuracy_score(yvalid, preds_valid), roc_auc_score(yvalid, proba_valid))

In [None]:
# Build the submission: one column per fold model, averaged row-wise, written
# into the `state` column of the sample-submission template.
sample_submission = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")
preds = np.column_stack(final_predictions).mean(axis=1)
sample_submission["state"] = preds
sample_submission.to_csv("submission.csv", index=False)
sample_submission