**In adversarial validation, the training and test datasets are first labelled with a binary dummy variable. A classifier is then trained to predict whether an observation belongs to the training or the test dataset. If the split between the two datasets is not systematic - i.e. it is purely random - such an assignment should not be possible. In this case, one would observe an AUC of 0.5, or an accuracy equal to the share of the larger of the two datasets in all data (the majority-class baseline). If the metrics are higher, this is an indication of dissimilarity. The degree of dissimilarity can be judged from how far the metrics exceed these values. With an AUC of 1, the datasets would be systematically different. If the metrics are less pronounced, it can be assumed that at least partial systematic differences exist between the datasets, e.g. patterns that occur in one of the two datasets but not in the other.**

In [None]:
from time import time
import psutil

import sys

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':13})

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression as LOGIT
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score as acu

import gc

import warnings
warnings.filterwarnings("ignore")

In [None]:
class NBConfig:
    """Notebook-wide settings, grouped into two plain dictionaries."""

    # Random seed and number of cross-validation folds.
    general = dict(seed=2021, folds=4)

    # Hyper-parameters that are passed to the random forest classifier.
    method = dict(
        jobs=4,
        criterion="gini",
        samples=0.65,
        feat_frac=0.55,
        depth=20,
        n_trees=130,
        leafSize=1,
        costs=0.0,
    )

In [None]:
# Locations of the competition files.
trainpath = "../input/ventilator-pressure-prediction/train.csv"
testpath = "../input/ventilator-pressure-prediction/test.csv"
samsubpath = "../input/ventilator-pressure-prediction/sample_submission.csv"

# Train and test are indexed by their "id" column; the sample submission
# keeps the default index.
train = pd.read_csv(trainpath, index_col="id")
test = pd.read_csv(testpath, index_col="id")
samSub = pd.read_csv(samsubpath)

In [None]:
# Mark where every row comes from: 0 = training data, 1 = test data.
train["Dummy"], test["Dummy"] = 0, 1

# Stack both parts into one frame; the "pressure" column is removed from the
# training part before stacking.
all_df = pd.concat([train.drop(columns=["pressure"]), test])

# The separate frames are no longer needed.
del train, test

In [None]:
# Report (rows, columns) of the combined frame.
print("Data shape is:", all_df.shape)

In [None]:
# Show the last rows of the combined frame (the test part was appended last).
all_df.tail()

In [None]:
# Summary statistics of the combined frame.
all_df.describe()

# Feature Engineering

In [None]:
%%time
# Replace the index by a fresh 0..n-1 range; the previous index is discarded.
all_df.reset_index(drop=True, inplace=True)

# Difference of time_step to the previous row of the same breath. The first
# row of each breath has no predecessor and is filled with -1.
all_df["timeDiff"] = all_df.groupby("breath_id")["time_step"].diff(periods=1).fillna(-1)

In [None]:
%%time
# Per-breath maximum, minimum and mean of u_in, broadcast back to every row
# of the breath.
all_df["maxu_in"] = all_df.groupby("breath_id")["u_in"].transform("max")
all_df["minu_in"] = all_df.groupby("breath_id")["u_in"].transform("min")
all_df["meanu_in"] = all_df.groupby("breath_id")["u_in"].transform("mean")

**Define a function for flexible feature engineering**

In [None]:
def _ram_usage():
    """Return the system RAM usage as text for the progress message.

    Falls back to "n/a" when psutil cannot be imported, so that a missing
    diagnostics package does not abort the feature computation.
    """
    try:
        import psutil
    except ImportError:
        return "n/a"
    return str(psutil.virtual_memory()[2]) + "%"


def _rolling_by_breath(DF, cols, window, center, stat, fillNas):
    """Return a rolling statistic of `cols` per breath, in the row order of `DF`."""
    # Work on a positional index so the result can be put back into input
    # order even when the index of DF is not unique.
    frame = DF[["breath_id"] + cols].reset_index(drop=True)
    roller = frame.groupby("breath_id")[cols].rolling(window=window, center=center)
    rolled = getattr(roller, stat)()
    # groupby().rolling() returns the rows grouped by breath_id under a
    # (breath_id, row) MultiIndex. Drop the group level and restore the input
    # row order; otherwise a positional copy of the values would attach the
    # features to the wrong rows whenever breaths are not contiguous.
    rolled = rolled.reset_index(level=0, drop=True).reindex(frame.index)
    return rolled.fillna(fillNas)


def ByBreath(method: str, DF, lags=None, center=False, fillNas=0):
    """Compute per-breath features of u_in (and partly u_out).

    All computations are done within groups of equal "breath_id". A progress
    line with the elapsed time and the RAM usage is printed at the end.

    Parameters
    ----------
    method : str
        One of
        "mean"      rolling mean of u_in and u_out  -> "{c}mu_in_l{lag}", "{c}mu_out_l{lag}"
        "max"       rolling maximum of u_in         -> "{c}mxu_in_l{lag}"
        "min"       rolling minimum of u_in         -> "{c}miu_in_l{lag}"
        "std"       rolling std of u_in             -> "{c}su_in_l{lag}"
        "shift"     u_in shifted by lag rows        -> "sftu_in_l{lag}"
        "diff"      u_in differenced over lag rows  -> "du_in_l{lag}"
        "log"       log1p of u_in                   -> "lgu_in"
        "cumsum"    cumulative sum of u_in, u_out   -> "csu_in", "csu_out"
        "area"      cumulative sum of time_step * u_in -> "area"
        "centering" u_in minus its per-breath mean  -> "cenu_in"
        where {c} is "c" if `center` is true and "" otherwise.
    DF : pandas.DataFrame
        Must contain "breath_id" and the columns used by the chosen method
        ("u_in", and "u_out" / "time_step" where listed above).
    lags : iterable of int, optional
        Window sizes (rolling methods) or periods (shift/diff). Required for
        "mean", "max", "min", "std", "shift" and "diff".
    center : bool
        Passed to the rolling window; only affects the rolling methods.
    fillNas : scalar
        Replacement for the missing values produced by rolling, shift and
        diff.

    Returns
    -------
    pandas.DataFrame
        The new feature columns, one row per row of `DF`, in the same row
        order and with the same index as `DF`.

    Raises
    ------
    ValueError
        If `method` is unknown, or `lags` is missing for a method that
        needs it.
    """
    start = time()

    rolling_prefix = {"mean": "m", "max": "mx", "min": "mi", "std": "s"}
    lagged_prefix = {"shift": "sft", "diff": "d"}
    plain_methods = ("log", "cumsum", "area", "centering")

    if (
        method not in rolling_prefix
        and method not in lagged_prefix
        and method not in plain_methods
    ):
        raise ValueError("unknown method: " + str(method))
    if (method in rolling_prefix or method in lagged_prefix) and lags is None:
        raise ValueError("specify lags")

    c = "c" if center else ""
    output = pd.DataFrame(index=DF.index)

    if method == "mean":
        for lag in lags:
            agg = _rolling_by_breath(DF, ["u_in", "u_out"], lag, center, "mean", fillNas)
            output["{0}mu_in_l{1}".format(c, lag)] = agg["u_in"].values
            output["{0}mu_out_l{1}".format(c, lag)] = agg["u_out"].values
            gc.collect()

    elif method in rolling_prefix:
        prefix = rolling_prefix[method]
        for lag in lags:
            agg = _rolling_by_breath(DF, ["u_in"], lag, center, method, fillNas)
            output["{0}{1}u_in_l{2}".format(c, prefix, lag)] = agg["u_in"].values
            gc.collect()

    elif method in lagged_prefix:
        prefix = lagged_prefix[method]
        grouped = DF.groupby("breath_id")["u_in"]
        for lag in lags:
            # shift/diff keep the input row order, no reordering needed.
            agg = getattr(grouped, method)(lag).fillna(fillNas)
            output["{0}u_in_l{1}".format(prefix, lag)] = agg.values
            gc.collect()

    elif method == "log":
        output["lgu_in"] = np.log1p(DF["u_in"].values)
        gc.collect()

    elif method == "cumsum":
        agg = DF.groupby("breath_id")[["u_in", "u_out"]].cumsum()
        output["csu_in"] = agg["u_in"].values
        output["csu_out"] = agg["u_out"].values
        gc.collect()

    elif method == "area":
        # Build the product in a new frame instead of writing a column into
        # a slice of DF.
        agg = DF[["breath_id"]].assign(area=DF["time_step"] * DF["u_in"])
        output["area"] = agg.groupby("breath_id")["area"].cumsum().values
        gc.collect()

    else:  # "centering"
        # transform("mean") broadcasts the breath mean to every row.
        breath_mean = DF.groupby("breath_id")["u_in"].transform("mean")
        output["cenu_in"] = (DF["u_in"] - breath_mean).values
        gc.collect()

    end = time()
    print(c + method + " created in " + str(round(end - start)) + " seconds." + "RAM usage: " + _ram_usage())
    return output

In [None]:
def assignment(DF, mDF):
    """Return a copy of `DF` extended by all columns of `mDF`.

    The columns are copied by position (raw values), so the index of `mDF`
    is ignored; `DF` itself is left untouched.
    """
    extended = DF.copy()
    for column in mDF.columns:
        key = "{0}".format(column)
        extended[key] = mDF[key].values
    gc.collect()
    return extended

In [None]:
# Feature plan: (method, extra keyword arguments for ByBreath). The entries
# are processed in order, which also fixes the column order in all_df.
feature_plan = [
    ("area", {}),
    ("mean", {"lags": [6, 9]}),
    ("mean", {"center": True, "lags": [6]}),
    ("max", {"lags": [9]}),
    ("min", {"lags": [9]}),
    ("diff", {"lags": [1, 2]}),
    ("log", {}),
    ("std", {"lags": [6]}),
    ("shift", {"lags": [-2, -1, 1, 2]}),
    ("cumsum", {}),
    ("centering", {}),
]
for feature_method, feature_kwargs in feature_plan:
    # Each step sees the frame including the features added before it.
    all_df = assignment(all_df, ByBreath(feature_method, all_df, **feature_kwargs))

Number of unique values after feature engineering

In [None]:
all_df.nunique().to_frame()

# Random Forest Approach

In [None]:
# Target of the adversarial classifier: 0 = row from train, 1 = row from test.
y = all_df.Dummy

# Keep everything except the label and all columns derived from u_out.
# The derived columns carry suffixes/prefixes (e.g. "mu_out_l6", "cmu_out_l6",
# "csu_out"), so they are matched by substring; a fixed list of exact names
# would silently miss them.
names = [c for c in all_df.columns if c != "Dummy" and "u_out" not in str(c)]
all_df = all_df[names]
gc.collect()

In [None]:
# Shuffled K-fold splitter; fold count and seed come from NBConfig.
kf = KFold(
    NBConfig.general["folds"],
    shuffle=True,
    random_state=NBConfig.general["seed"],
)

In [None]:
# Share of rows with Dummy == 0, i.e. the accuracy reached by always
# predicting "training data".
baseline = 1 - y.sum() / len(y)
print("Baseline:", round(baseline, 5))

In [None]:
# Feature importances of the fitted forest, one entry per fold.
importances = []

# NOTE(review): the folds are drawn per row, so rows of one breath_id can end
# up in both parts of a fold. If a breath presumably never spans train and
# test, per-breath features may let the forest recognise breaths and inflate
# the accuracy - consider splitting by breath_id (e.g. GroupKFold). TODO confirm.
for k, (train_index, test_index) in enumerate(kf.split(all_df)):

    print("Fold: " + str(k+1))
    # kf.split yields positional row numbers, so rows are selected by
    # position (iloc) instead of by index label.
    x_tr, x_te = all_df.iloc[train_index], all_df.iloc[test_index]
    y_tr, y_te = y.iloc[train_index], y.iloc[test_index]
    # Share of rows with Dummy == 0 in the held-out part of this fold.
    baseline = 1-y_te.sum()/y_te.shape[0]
    print("Train data has " + str(x_tr.shape[0]) + " observations" + \
          "\n" + "Test data has " + str(x_te.shape[0]) + " observations")
    print('RAM memory used after setting train and test:', psutil.virtual_memory()[2], "%")

    reg = RFC(
        random_state=NBConfig.general["seed"],
        n_jobs=NBConfig.method["jobs"],
        criterion=NBConfig.method["criterion"],
        max_samples=NBConfig.method["samples"],
        max_features=NBConfig.method["feat_frac"],
        max_depth=NBConfig.method["depth"],
        n_estimators=NBConfig.method["n_trees"],
        min_samples_leaf=NBConfig.method["leafSize"],
        ccp_alpha=NBConfig.method["costs"],
        verbose=0
    )

    # breath_id is only an identifier and is not used as a feature.
    reg.fit(x_tr.drop(columns=["breath_id"]), y_tr)
    gc.collect()

    # accuracy_score expects (y_true, y_pred).
    score = acu(y_te, reg.predict(x_te.drop(columns=["breath_id"])))
    print("RF accuracy: " + str(score) + " is " + str(round((score - baseline), 3)) + " higher than the baseline!" + "\n")

    gc.collect()

    imps = reg.feature_importances_
    importances.append(imps)

**To some degree, it is possible to predict whether an observation belongs to the test or the training dataset. This means that the training and test datasets differ structurally (but only partially) in certain areas.
This can amplify the shake-up.**



# Importances 

In [None]:
# One row per fold, one column per feature the forest was fitted on:
# everything in `names` except breath_id, which is dropped before fitting.
feature_names = [c for c in names if c != "breath_id"]
impts = pd.DataFrame(importances, columns=feature_names)

# Mean and standard deviation of every feature's importance across the folds.
means = impts.mean(axis=0).to_frame(name="means")
stds = impts.std(axis=0)
means["stds"] = stds
means.sort_values(by="means", ascending=True, inplace=True)

fig, ax = plt.subplots(figsize=(15,15))
# Horizontal bars: the importance runs along the x-axis, so the error bars
# (xerr) and the value label belong to that axis.
means.means.plot.barh(xerr=means.stds.values, ax=ax)
ax.set_title("Mean Feature Importances with Standard Deviation")
ax.set_xlabel("Mean decrease in impurity")
fig.tight_layout()