In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold,StratifiedKFold, RepeatedKFold
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix

# Load the train and test tables published with the LGBM baseline notebook
# (presumably one row of engineered features per segment -- TODO confirm).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ingv-volcanic-eruption-prediction-lgbm-baseline/train.csv',
        '../input/ingv-volcanic-eruption-prediction-lgbm-baseline/test.csv',
    )
)

# Adversarial Validation

It has come to some Kagglers' attention that the CV scores do not match the LB scores; this is the main discussion thread (https://www.kaggle.com/c/predict-volcanic-eruptions-ingv-oe/discussion/192766). There are a couple of reasons this might be: many models are overfit, the train and test sets are not a good match, there are some outlier examples in train or test that might skew our models, or there are some specific generated features that do not match between train and test. Here we just try to determine/confirm a couple of these hypotheses using Adversarial Validation. We are simply trying to determine if train and test are good matches; to do so we are going to "hide" our test segments inside the train segments and have a model try to "find them". So, to clarify, if the two datasets are statistically similar it should be difficult for a model to find the test segments within train; if they are different it should be much easier.

#### Methodology
My previous notebook has been a popular starting point, with some people spinning off additional notebooks and adding features and oversampling to improve results, but for this exercise I will just take the features generated in that notebook (https://www.kaggle.com/ajcostarino/ingv-volcanic-eruption-prediction-lgbm-baseline). I will then discard the `time_to_eruption` column, as that is not important for this task. I set a new target column, called `target`, to `0` for the train set and `1` for the test set. We then build a model to try and see if the sets are distinguishable.

#### Calibration
Here I decided to use `binary_logloss` for my evaluation metric because I would prefer if the model was well calibrated. What does that mean? It means I want to make sure that the output probabilities match real-life, that is, if the model says a particular segment has `0.57` chance of being a test segment I would like that to be true 57% of the time. Logloss penalizes on confidence, so we can look at our segments and see how confident we are that they are test or train. The test and train sets match if the model spits out roughly `0.5` probability for every segment, if that happens then we are validated that the test and train sets are a good match. The classes are balanced, there are an equal amount of each. For our `binary_logloss` metric a model that cannot determine test or train, i.e a model that is random, will return a `binary_logloss` of about `0.7` (`0.69` to be more exact). So if our model is able to get a `binary_logloss` score lower than that we know that the model is not random and that there are differences between the test and train set that might be decisive and might lead to our CV LB differences.

Updated by making dataset public.

In [None]:
# The regression target plays no role in telling train and test apart,
# so it is removed from train before the two tables are stacked.
train = train.drop(columns=['time_to_eruption'])

# Origin label: 0 marks a train segment, 1 marks a test segment.
# The classifier's job is to find the test segments "hidden" among train.
train = train.assign(target=0)
test = test.assign(target=1)

# Stack both tables and key every row by its segment id.
all_segments = pd.concat([train, test]).set_index(['segment_id'])
all_segments.head(5)

In [None]:
def plot_roc_auc(cv_scores, cutoff):
    """Plot one ROC curve per cross-validation fold stored in ``cv_scores``.

    Parameters
    ----------
    cv_scores : pandas.DataFrame
        Frame holding ``fold_<k>_val`` (true labels) and
        ``fold_<k>_predict_proba`` (scores) columns for k = 0, 1, 2, ...
    cutoff : int
        Only the first ``cutoff`` rows of ``cv_scores`` are used.

    Notes
    -----
    Every fold found in the frame is drawn (the previous version computed
    fold 3 but never plotted it, and ignored any further folds). Rows where
    either column is NaN are dropped per fold so that NaN-padded folds do
    not make ``roc_curve`` fail.
    """
    cv_scores = cv_scores.iloc[:cutoff]
    lw = 2
    # Colours are reused cyclically if there are more folds than entries.
    palette = ['red', 'green', 'orange', 'purple', 'brown']

    # A single figure; no stray empty figure is created beforehand.
    fig, ax = plt.subplots(figsize = (11, 10))

    fold = 0
    while True:
        val_col = 'fold_{}_val'.format(fold)
        proba_col = 'fold_{}_predict_proba'.format(fold)
        if val_col not in cv_scores.columns or proba_col not in cv_scores.columns:
            break

        scored = cv_scores[[val_col, proba_col]].dropna()
        if len(scored) > 0:
            fpr, tpr, _ = roc_curve(scored[val_col], scored[proba_col])
            plt.plot(fpr, tpr, color=palette[fold % len(palette)],
                     lw=lw, label='Fold %d ROC curve (area = %0.2f)' % (fold, auc(fpr, tpr)))
        fold += 1

    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([-0.01, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    
def feature_importances(model, index):
    """Draw a horizontal bar chart of the 50 largest feature importances.

    ``model`` must expose ``feature_importances_``; ``index`` supplies the
    feature name for each importance value, in the same order.
    """
    ranked = (
        pd.DataFrame(model.feature_importances_, index=index, columns=['importance'])
        .sort_values('importance', ascending=False)
        .reset_index()
    )
    ranked.columns = ['feature', 'importance']
    top_features = ranked.head(50)

    fig, ax = plt.subplots(figsize = (18, 24))
    sns.set()
    plt.subplot(1, 1, 1)
    sns.barplot(x="importance", y="feature", orient='h', data=top_features)
    plt.title('Feature Importance')
    

def plot_lift_chart(df,
                    actual_col,
                    predicted_col,
                    probability_col,
                    ntiles):
    """Build the cumulative-gains / lift table for one set of predictions.

    Despite its name this function draws nothing; it returns the table that
    a caller can plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``actual_col`` and ``probability_col``.
    actual_col : str
        Column with the true 0/1 labels (summed per bucket).
    predicted_col : str
        Unused; kept so existing callers keep working.
    probability_col : str
        Column used to rank rows from most to least confident.
    ntiles : int
        Number of equally sized buckets the ranked rows are split into.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, indexed by ``cumntile`` (cumulative percentage of
        rows), with the columns ``cases``, ``correct``, ``cumcorrect``,
        ``ntile``, ``avgcorrect``, ``cumavgcorrect``, ``normalisedpercentavg``
        and ``normalisedpercentmodel``.
    """
    df = df.sort_values(by=probability_col, ascending=False)

    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes. The bucket
    # sizes are the same as those produced by splitting the frame.
    rows = []
    for positions in np.array_split(np.arange(len(df)), ntiles):
        group = df.iloc[positions]
        rows.append({'cases': len(group), 'correct': group[actual_col].sum()})

    lift = pd.DataFrame(rows)

    # Cumulative gains calculation
    lift['cumcorrect'] = lift['correct'].cumsum()
    # Every bucket covers the same share of rows, i.e. 100 / ntiles percent.
    lift['ntile'] = 100.0 / ntiles
    lift['cumntile'] = lift['ntile'].cumsum()
    # Positives per bucket if they were spread evenly (the "no model" baseline).
    lift['avgcorrect'] = df[actual_col].sum() / ntiles
    lift['cumavgcorrect'] = lift['avgcorrect'].cumsum()

    # Lift: cumulative positives found relative to the even-spread baseline.
    lift['normalisedpercentavg'] = 1
    lift['normalisedpercentmodel'] = lift['cumcorrect'] / lift['cumavgcorrect']

    lift = lift.set_index('cumntile')

    return lift

#     ax.plot(lift['normalisedpercentavg'], 'r-', label='Normalised \'response rate\' with no model')
#     ax.plot(lift['normalisedpercentmodel'], 'g-', label='Normalised \'response rate\' with using model')
#     ax.set_xlim((-0.5,100))
#     ax.legend()
#     return ax

In [None]:
def build_lightgbm(X, y, n_fold):
    """Cross-validate a LightGBM classifier and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its index values are stored as the segment ids.
    y : pandas.Series
        Binary labels, positionally aligned with ``X``.
    n_fold : int
        Number of shuffled K-fold splits.

    Returns
    -------
    tuple
        ``(cv_scores, lgbmc, columns)`` where ``cv_scores`` holds, for each
        fold ``k``, the columns ``fold_k_segment_id``, ``fold_k_predict``,
        ``fold_k_predict_proba`` and ``fold_k_val``; ``lgbmc`` is the
        classifier fitted on the LAST fold only; ``columns`` are the feature
        names of the training frame.

    Notes
    -----
    The objective is ``'binary'``. The previous ``'regression'`` objective
    made the classifier fit an L2 regression on the 0/1 labels, so
    ``predict_proba`` returned unbounded regression outputs rather than
    probabilities, which contradicts the ``binary_logloss`` metric used for
    early stopping.
    """
    folds = KFold(n_splits=n_fold,
                  shuffle=True,
                  random_state=42)

    cv_scores = pd.DataFrame()

    params = {
        'num_leaves': 85,
        'min_data_in_leaf': 10,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.001,
        'max_bins': 2048,
        'boosting': "gbdt",
        'feature_fraction': 0.91,
        'bagging_freq': 1,
        'bagging_fraction': 0.91,
        'bagging_seed': 42,
        'metric': 'binary_logloss',
        'lambda_l1': 0.1,
        'verbosity': -1,
        'nthread': -1,
        'random_state': 42
    }

    for fold, (trn, val) in enumerate(folds.split(X, y)):

        print("fold {}".format(fold))

        X_trn, X_val = X.iloc[trn], X.iloc[val]
        y_trn, y_val = y.iloc[trn], y.iloc[val]

        lgbmc = lgbm.LGBMClassifier(**params, n_estimators = 2500, n_jobs = -1)

        # Both sets are evaluated; early stopping watches the evaluation sets.
        eval_set = [
            (X_trn, y_trn),
            (X_val, y_val)
        ]

        lgbmc.fit(X_trn, y_trn, eval_set=eval_set, verbose=200, early_stopping_rounds=400)

        y_predict       = lgbmc.predict(X_val)
        y_predict_proba = lgbmc.predict_proba(X_val)[:,1]

        segment_col       = 'fold_{}_segment_id'.format(fold)
        predict_col       = 'fold_{}_predict'.format(fold)
        predict_proba_col = 'fold_{}_predict_proba'.format(fold)
        val_col           = 'fold_{}_val'.format(fold)

        # Each fold is stored positionally (0..len-1). Columns added after the
        # first one are aligned to the existing index, so a shorter fold ends
        # up padded with NaN at the bottom.
        cv_scores[segment_col]       = pd.Series(list(X_val.index))
        cv_scores[predict_col]       = pd.Series(y_predict)
        cv_scores[predict_proba_col] = pd.Series(y_predict_proba)
        cv_scores[val_col]           = pd.Series(list(y_val))

    return cv_scores, lgbmc, X_trn.columns

## Model
So we can see here clearly that our model can determine some differences between test and train. However, `0.57` is not a great `binary_logloss` score so many of the train and test samples must be similar. There might be a subsegment of train and test that are recognizable. We will try and determine which ones those are.

In [None]:
# Every column except the train/test indicator is a feature; the indicator is the label.
adversarial_features = all_segments.drop(columns=['target'])
adversarial_labels = all_segments['target']
cv_scores, lgbmc, columns = build_lightgbm(adversarial_features, adversarial_labels, 5)

## ROC/AUC and Lift Curves
The ROC/AUC curve allows us to visualize what random looks like as well as what the output probabilities look like. We can see an extremely steep initial curve, meaning that there must be a subset of the test set that is clearly distinguishable from the train set. After that the curve looks relatively linear, meaning that the rest of the test set is indistinguishable from train. In the lift charts we see similar behavior. The highest-confidence predictions show that the model is able to distinguish a subset of the test observations; they must be much different from train. However, it is good that after that small subset our train and test are much less distinguishable.

In [None]:
# NOTE(review): 1789 presumably keeps only rows that are populated for every fold -- confirm.
plot_roc_auc(cv_scores=cv_scores, cutoff=1789)

In [None]:
def plot_lift(ax, lift):
    """Draw the baseline (red) and model (green) lift curves of ``lift`` on ``ax``."""
    ax.plot(lift['normalisedpercentavg'], 'r-', label='Normalised \'response rate\' with no model')
    ax.plot(lift['normalisedpercentmodel'], 'g-', label='Normalised \'response rate\' with using model')
    ax.set_xlim((-0.5,100))
    ax.legend()


# One panel per fold for folds 0-3, filled row by row on a 2x2 grid.
fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 10))
for fold_no, fold_ax in enumerate(axes.flat):
    fold_lift = plot_lift_chart(
        cv_scores,
        f'fold_{fold_no}_val',
        f'fold_{fold_no}_predict',
        f'fold_{fold_no}_predict_proba',
        100,
    )
    plot_lift(fold_ax, fold_lift)

## Important Adversarial Features

Which features are the most important in discriminating between the datasets? These features are potentially bad features, as they might lead to some inferences in the train set that might not be true in the test set. We can also plot the distributions of the top features to see what the differences are between the test and train set.

In [None]:
# Importances come from the classifier returned by build_lightgbm (the last fold's model).
feature_importances(model=lgbmc, index=columns)

In [None]:
def plot_distribution_curves(ax, all_segments, feature, xlim):
    """Overlay the distribution of ``feature`` for target 0 and target 1 on ``ax``.

    ``xlim`` is applied to the x-axis unless it is empty.
    """
    for origin in (0, 1):
        values = all_segments.loc[all_segments['target'] == origin, feature]
        sns.distplot(values, bins=30, ax = ax)
    ax.set_title(f'{feature} Distributions Train vs. Test')
    if len(xlim) == 0:
        return
    ax.set_xlim(xlim)
    
# (feature, x-axis limits) for each panel, listed row by row; () means "no limit".
distribution_panels = [
    ('sensor_6_fft_real_median', (-2000, 2000)),
    ('sensor_10_sum', (-5, 5)),
    ('sensor_3_fft_imag_skew', ()),
    ('sensor_2_sum', (-1, 1)),
    ('sensor_1_fft_imag_median', (-1, 1)),
    ('sensor_1_fft_imag_skew', (-1, 1)),
]

fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(20, 12))
for panel_ax, (feature_name, limits) in zip(axes.flat, distribution_panels):
    plot_distribution_curves(panel_ax, all_segments, feature_name, limits)

## Important Adversarial Sensors
Is there a particular sensor that is responsible for most of the discrimination between train and test? We also want to see the sensors which create the least important features, because those sensors might have fewer differences between train and test. Here it looks like importance is pretty evenly spread between sensors, with **sensor 10** being the only outstanding problematic sensor.

In [None]:
# Total importance per sensor: the sensor number is the second '_'-separated
# token of each feature name (e.g. 'sensor_10_sum' -> '10').
feature_importances_by_sensor = pd.DataFrame()
feature_importances_by_sensor['features'] = columns
feature_importances_by_sensor['feature_importances'] = lgbmc.feature_importances_
feature_importances_by_sensor['sensor'] = feature_importances_by_sensor['features'].apply(lambda r: r.split('_')[1])
# Rows whose token is 'A0' are not numbered sensors and are excluded
# (NOTE(review): origin of the 'A0' features is not visible here -- confirm).
# .copy() makes the filtered frame independent, so the assignment below does
# not raise SettingWithCopyWarning.
feature_importances_by_sensor = feature_importances_by_sensor[feature_importances_by_sensor['sensor'] != 'A0'].copy()
feature_importances_by_sensor['sensor'] = feature_importances_by_sensor['sensor'].astype(int)
feature_importances_by_sensor = feature_importances_by_sensor.groupby(['sensor'], as_index=False).agg({'feature_importances' : 'sum'})
fig, ax = plt.subplots(figsize = (20, 10))
sns.set()
plt.subplot(1, 1, 1);
sns.barplot(x="sensor", y="feature_importances", orient='v', data=feature_importances_by_sensor);
plt.title('Feature Importances By Sensor');

## Important Adversarial Feature Functions
Are there particular aggregation functions, among those used to create the features, that are responsible for most of the discrimination? It appears that the sums of the fast Fourier transform features are somewhat problematic in this particular instance.

In [None]:
# Total importance per aggregation function: everything after the second '_'
# in a feature name is treated as the function (e.g. 'sensor_1_fft_imag_skew'
# -> 'fft_imag_skew'). Only the 30 largest totals are plotted.
function_table = pd.DataFrame()
function_table['features'] = columns
function_table['feature_importances'] = lgbmc.feature_importances_
function_table['function'] = function_table['features'].apply(lambda r: '_'.join(r.split('_')[2:]))

feature_importances_by_function = (
    function_table
    .groupby(['function'], as_index=False)
    .agg({'feature_importances' : 'sum'})
    .sort_values(['feature_importances'], ascending=False)
    .head(30)
)

fig, ax = plt.subplots(figsize = (20, 10))
sns.set()
plt.subplot(1, 1, 1)
sns.barplot(x="function", y="feature_importances", orient='v', data=feature_importances_by_function)
plt.xticks(rotation=45, ha='right')
plt.title('Feature Importances By Function');

## High confidence test predictions

Here we look at the distribution of probabilities output from the model. We can see that there is a Gaussian distribution around `0.4` and a spike at around `0.95`. Since our model is calibrated from using the `binary_logloss` metric, we know that if we look at the subset of 95% confident predictions, roughly 95/100 of those segments will be true test segments. We should try and determine what makes those segments similar and why they are different from the others. **Are these the segments that are the key to the competition?** Which segments are these? What can we do about them? **And what can we do with the train segments to account for these?**

In [None]:
fig = plt.figure(figsize=(20, 12))

# Out-of-fold probabilities of folds 0-4, laid end to end in fold order.
all_probabilities = np.concatenate(
    [cv_scores[f'fold_{fold_no}_predict_proba'].to_numpy() for fold_no in range(5)]
)

sns.distplot(all_probabilities, bins=100)

In [None]:
def append_cv_scores(cv_scores, postfix, n_folds=None):
    """Concatenate the ``fold_<k>_<postfix>`` columns of ``cv_scores`` in fold order.

    Parameters
    ----------
    cv_scores : pandas.DataFrame
        Frame with one ``fold_<k>_<postfix>`` column per fold, k = 0, 1, ...
    postfix : str
        Column suffix to collect, e.g. ``'predict_proba'``.
    n_folds : int, optional
        Number of folds to collect. When omitted, every consecutive fold
        starting at 0 that is present in the frame is used (the previous
        version only worked for exactly five folds).

    Returns
    -------
    numpy.ndarray
        1-D array: fold 0's values, then fold 1's, and so on.

    Raises
    ------
    KeyError
        If no ``fold_0_<postfix>`` column exists, or a requested fold is missing.
    """
    if n_folds is None:
        # Count consecutive fold columns starting from fold 0.
        n_folds = 0
        while f'fold_{n_folds}_{postfix}' in cv_scores.columns:
            n_folds += 1
    if n_folds == 0:
        raise KeyError(f'fold_0_{postfix}')

    return np.concatenate(
        [cv_scores[f'fold_{fold_no}_{postfix}'].to_numpy() for fold_no in range(n_folds)]
    )

# One row per out-of-fold prediction (segment id, probability, true label),
# ordered from the most to the least "test-like" segment.
all_segment_probabilities = pd.DataFrame(
    {
        field: append_cv_scores(cv_scores, field)
        for field in ('segment_id', 'predict_proba', 'val')
    }
).sort_values(['predict_proba'], ascending=False)
all_segment_probabilities.head(10)

## High Confidence test predictions similarities
So there are no train segments that fall into the high confidence bucket. It looks like the defining characteristic is that `sensor_10` is missing in those test segments, and `sensor_5`, `sensor_9`, and `sensor_2` are often missing in these test segments as well. Not the most interesting insight, but still a useful observation nonetheless.

- `sensor_10` missing `100%` of the time
- `sensor_9` missing `47.7%` of the time
- `sensor_5` missing `63.3%` of the time
- `sensor_2` missing `45.3%` of the time

In [None]:
# Segments the model assigns to "test" with probability above 0.80,
# then a count of them per true label.
is_high_confidence = all_segment_probabilities['predict_proba'] > .80
all_segment_probabilities_gt_80 = all_segment_probabilities[is_high_confidence]
all_segment_probabilities_gt_80.groupby(by=['val'], as_index=False).agg({'segment_id' : 'count'})

In [None]:
# Test rows belonging to the high-confidence segments.
high_confidence_ids = all_segment_probabilities_gt_80.segment_id
missing_sensors_gt_80 = test[test.segment_id.isin(high_confidence_ids)].copy()

# A sensor is flagged as missing (1) when its '<sensor>_sum' feature is exactly 0.
sensor_numbers = range(1, 11)
for sensor_no in sensor_numbers:
    is_zero_sum = missing_sensors_gt_80[f'sensor_{sensor_no}_sum'] == 0
    missing_sensors_gt_80[f'sensor_{sensor_no}_missing'] = np.where(is_zero_sum, 1, 0)

# Number of flagged segments per sensor, plus the total number of segments.
summary_spec = {f'sensor_{sensor_no}_missing': 'sum' for sensor_no in sensor_numbers}
summary_spec['segment_id'] = 'count'
missing_sensors_gt_80.agg(summary_spec)

# Conclusion

There is a set of segments in the test set that are clearly identifiable. They don't have corresponding train segments that are similar, and they might be driving the CV/LB differences; we don't have good proxies in train, so we might have to create them. I have included a list of the segment_ids in the file `test_segments_adversarial_validation_prob_gt_80.csv` in the output of this notebook, so that others might explore.

In [None]:
# Write the high-confidence subset and the full probability table to the notebook output.
for frame, csv_path in (
    (all_segment_probabilities_gt_80, './test_segments_adversarial_validation_prob_gt_80.csv'),
    (all_segment_probabilities, './all_segments_adversarial_validation_prob.csv'),
):
    frame.to_csv(csv_path, header=True, index=False)

## Check the distributions of our previous predictions
So let's check what the distribution of predictions look like for the segments that look like train, and ones that don't look like train, and overall. We see that the unidentifiable is quite concentrated at around `27540221.376374934` (the median), with a mean of `26917173.260142826`. These are quite close to the predicted test `time_to_eruption`mean `24073643.301489953` and the train `time_to_eruption` mean `22848906.832769126`.

In [None]:
predictions = pd.read_csv('../input/ingv-volcanic-eruption-prediction-lgbm-baseline/submission_recent.csv')
train = pd.read_csv('../input/ingv-volcanic-eruption-prediction-lgbm-baseline/train.csv')

# Predictions for the identifiable (probability > 0.80) segments versus all the others.
is_identified = predictions.segment_id.isin(all_segment_probabilities_gt_80.segment_id)
predictions_gt_80 = predictions[is_identified]
predictions_lt_80 = predictions[~is_identified]


def _predictions_with_null_sensor(sum_column):
    """Predictions of high-confidence test segments whose ``sum_column`` is exactly 0."""
    null_rows = missing_sensors_gt_80[missing_sensors_gt_80[sum_column] == 0]
    return predictions[predictions.segment_id.isin(null_rows.segment_id)]


def _train_with_null_sensor(sum_column):
    """Train rows whose ``sum_column`` is exactly 0."""
    return train[train[sum_column] == 0]


predictions_gt_80_sensor10_null = _predictions_with_null_sensor('sensor_10_sum')
predictions_gt_80_sensor9_null = _predictions_with_null_sensor('sensor_9_sum')
predictions_gt_80_sensor5_null = _predictions_with_null_sensor('sensor_5_sum')
predictions_gt_80_sensor2_null = _predictions_with_null_sensor('sensor_2_sum')

train_lt_80_sensor10_null = _train_with_null_sensor('sensor_10_sum')
train_lt_80_sensor9_null = _train_with_null_sensor('sensor_9_sum')
train_lt_80_sensor5_null = _train_with_null_sensor('sensor_5_sum')
train_lt_80_sensor2_null = _train_with_null_sensor('sensor_2_sum')

In [None]:
plt.figure(figsize=(20, 10))

# Every series carries a label so that plt.legend() has entries to display;
# without labels the legend call only emitted a warning and drew nothing.
sns.distplot(predictions_gt_80['time_to_eruption'], 50, label='Test identifiable')
sns.distplot(predictions['time_to_eruption'], 50, label='Test all')
sns.distplot(predictions_lt_80['time_to_eruption'], 50, label='Test unidentifiable')
sns.distplot(train['time_to_eruption'], 50, label='Train time to eruption')
plt.title('Distribution of predictions for Test segments - Blue: Test identifiable, Orange: Test all, Green: Test unidentifiable, Pink: Train time to eruption')
plt.legend()

# A Guess...
What is happening here... Well, I'm guessing it's one of a couple of things. Some hypotheses:

1. Sensor 10, 9, and 5 are sensitive. They fail intermittently and the researchers are going out to fix them or put them online again
2. Sensors 10, 9, 5 failed at different points in time and were fixed once or twice.
3. Sensors 10, 5 failed once for a while, sensors 9 and 2 failed intermittently once or twice

We can test these hypotheses by looking at the distributions of the predicted `time_to_eruption` of the test versus the `time_to_eruption` of the train. In the graphs below, blue is the identifiable test, and orange is train. Looking at those side by side, it seems like hypotheses 1 and 3 are correct. Sensors 9 and 10 seem to fail intermittently, while sensors 5 and 2 fail once for a while.

Using the below method let's assume our predictions are directionally correct. The distributions do not match however, let's see if we can build a regression model to remap these predictions so that the failure timelines match between test and train.

In [None]:
fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 12))

# (panel, identifiable-test predictions, train rows, title) for each null sensor.
null_sensor_panels = [
    (axes[0,0], predictions_gt_80_sensor10_null, train_lt_80_sensor10_null,
     'Identifiable Test vs. Train time_to_eruption sensor 10 null'),
    (axes[0,1], predictions_gt_80_sensor9_null, train_lt_80_sensor9_null,
     'Identifiable Test vs. Train time_to_eruption sensor 9 null'),
    (axes[1,0], predictions_gt_80_sensor5_null, train_lt_80_sensor5_null,
     'Identifiable Test vs. Train time_to_eruption sensor 5 null'),
    (axes[1,1], predictions_gt_80_sensor2_null, train_lt_80_sensor2_null,
     'Identifiable Test vs. Train time_to_eruption sensor 2 null'),
]

for panel, test_predictions, train_rows, panel_title in null_sensor_panels:
    # Test predictions are drawn first, train values second.
    sns.distplot(test_predictions['time_to_eruption'], bins=50, ax = panel)
    sns.distplot(train_rows['time_to_eruption'], bins=50, ax = panel)
    panel.set_title(panel_title)

## Simple regression to remap distributions

Let's just reset here. I want to use the `time_to_eruption`s from my best submission `6e6` to do this. We are going to use the distributions from the train datasets to remap the distributions for the identifiable test `time_to_eruption` predictions. We are going to go backwards by the number of observations we have in train: `sensor_2`, `sensor_5`, `sensor_9`, `sensor_10`.

We can use a simple neural network to build these non-linear mappings, we then substitute the new predictions back into the predictions dataset. We can see finally that our distributions of `time_to_eruption` better matches the unidentifiable test dataset and the train dataset.

In [None]:
# Best-scoring submission, used as the starting point for the remapping.
predictions = pd.read_csv('../input/ignv-submission-6e6/submission_best.csv')

# Fresh copies of train and test (earlier cells modified them).
train = pd.read_csv('../input/ingv-volcanic-eruption-prediction-lgbm-baseline/train.csv')
test = pd.read_csv('../input/ingv-volcanic-eruption-prediction-lgbm-baseline/test.csv')

# Identifiable segments (probability > 0.80) versus all remaining segments.
in_identified = predictions.segment_id.isin(all_segment_probabilities_gt_80.segment_id)
predictions_gt_80 = predictions[in_identified]
predictions_lt_80 = predictions[~in_identified]

# Select, for one sensor, the predictions and the train rows where that sensor is null.
def segments_by_null_sensors(predictions_gt_80, test, train, sensor):
    """Return ``(predictions, train_rows)`` restricted to a null sensor.

    A sensor counts as null when its ``'<sensor>_sum'`` column is exactly 0.
    The first element holds the rows of ``predictions_gt_80`` whose segment is
    null in ``test``; the second holds a copy of the ``segment_id`` and
    ``time_to_eruption`` columns of the null rows of ``train``.
    """
    sum_column = f'{sensor}_sum'

    null_test_ids = test.loc[test[sum_column] == 0, 'segment_id']
    predictions_null_sensors = predictions_gt_80[predictions_gt_80.segment_id.isin(null_test_ids)]

    train_predictions_null_sensors = train.loc[
        train[sum_column] == 0, ['segment_id', 'time_to_eruption']
    ].copy()

    return predictions_null_sensors, train_predictions_null_sensors

In [None]:
from sklearn.neighbors import KernelDensity

def get_regression_dataset(predictions_null_sensors, train_predictions_null_sensors):
    """Build a quantile-to-quantile table between predicted and train ``time_to_eruption``.

    A Gaussian kernel density estimate (bandwidth 0.75) is fitted to the
    ``time_to_eruption`` column of each input, 100000 points are sampled from
    each estimate (seed 42), and the 1000 evenly spaced quantiles in [0, 1]
    of both samples are tabulated side by side.

    Returns
    -------
    pandas.DataFrame
        Columns ``quantiles``, ``predictions_tte`` and ``train_tte``.
    """

    def _sample_kde(frame):
        # Fit the KDE on the single time_to_eruption column and draw a 1-D sample.
        values = frame['time_to_eruption'].to_numpy()[:, np.newaxis]
        kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(values)
        return kde.sample(100000, 42).flatten()

    kde_predictions_sample = _sample_kde(predictions_null_sensors)
    kde_train_sample = _sample_kde(train_predictions_null_sensors)

    # All quantiles are computed in one vectorised call per sample instead of
    # one np.quantile call per quantile.
    quantiles = np.linspace(0, 1, 1000)

    pdf_mapping_df = pd.DataFrame()
    pdf_mapping_df['quantiles'] = quantiles
    pdf_mapping_df['predictions_tte'] = np.quantile(kde_predictions_sample, quantiles)
    pdf_mapping_df['train_tte'] = np.quantile(kde_train_sample, quantiles)

    return pdf_mapping_df

# Sensor 2: null-sensor subsets first, then their quantile mapping table.
predictions_null_sensor_2, train_predictions_null_sensor_2 = segments_by_null_sensors(
    predictions_gt_80=predictions_gt_80,
    test=test,
    train=train,
    sensor='sensor_2',
)
pdf_mapping_df_sensor_2 = get_regression_dataset(
    predictions_null_sensors=predictions_null_sensor_2,
    train_predictions_null_sensors=train_predictions_null_sensor_2,
)

In [None]:
# Predicted quantiles (x) against train quantiles (y) for sensor 2.
sns.scatterplot(data=pdf_mapping_df_sensor_2, x='predictions_tte', y='train_tte')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

def training_loop(n_epochs, optimizer, model, loss_fn, t_u_train, t_u_val,
                  t_c_train, t_c_val):
    """Run full-batch gradient descent on ``model`` for ``n_epochs`` epochs.

    Parameters
    ----------
    n_epochs : int
        Number of optimisation steps (one per epoch).
    optimizer : torch.optim.Optimizer
        Optimiser already bound to ``model``'s parameters.
    model : callable
        Maps an input tensor to predictions.
    loss_fn : callable
        ``loss_fn(predictions, targets)`` returning a scalar tensor.
    t_u_train, t_u_val : torch.Tensor
        Training and validation inputs.
    t_c_train, t_c_val : torch.Tensor
        Training and validation targets.

    Returns
    -------
    None
        ``model`` is updated in place; losses are printed at epoch 1 and
        every 10000th epoch.
    """
    for epoch in range(1, n_epochs + 1):
        loss_train = loss_fn(model(t_u_train), t_c_train)

        # The validation loss is only reported, never back-propagated, so the
        # forward pass runs without building an autograd graph.
        with torch.no_grad():
            loss_val = loss_fn(model(t_u_val), t_c_val)

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

        if epoch == 1 or epoch % 10000 == 0:
            print(f"Epoch {epoch}, Training loss {loss_train.item():.4f},"f" Validation loss {loss_val.item():.4f}")
            

def pdf_mapping_model(pdf_mapping_df, predictions_null_sensor, sensor):
    """Fit a small network mapping predicted quantiles to train quantiles and apply it.

    Parameters
    ----------
    pdf_mapping_df : pandas.DataFrame
        Table with ``predictions_tte`` (network input) and ``train_tte``
        (network target) columns.
    predictions_null_sensor : pandas.DataFrame
        Predictions to remap; must contain ``time_to_eruption``. It is NOT
        modified (the previous version wrote the new column into the caller's
        frame, which was a slice and triggered SettingWithCopyWarning).
    sensor : str
        Sensor name, used only in the progress message.

    Returns
    -------
    pandas.DataFrame
        A copy of ``predictions_null_sensor`` with an added
        ``time_to_eruption_remapped`` column.
    """
    # Inputs, targets and the values to remap are all divided by the same
    # factor (the largest predicted quantile); outputs are multiplied back.
    scale = pdf_mapping_df['predictions_tte'].max()

    inputs = torch.tensor(pdf_mapping_df['predictions_tte'].to_numpy() / scale,
                          dtype=torch.double).unsqueeze(1)
    targets = torch.tensor(pdf_mapping_df['train_tte'].to_numpy() / scale,
                           dtype=torch.double).unsqueeze(1)

    print (f'Building model for {sensor}')
    # Neural Network to remap predictions based on PDF
    seq_model = nn.Sequential(
        OrderedDict([('hidden_linear1', nn.Linear(1, 8)),
                     ('hidden_activation1', nn.Tanh()),
                     ('hidden_linear2', nn.Linear(8, 8)),
                     ('hidden_activation2', nn.Tanh()),
                     ('output_linear', nn.Linear(8, 1))])
    ).double()
    optimizer = optim.SGD(seq_model.parameters(), lr=1e-2)

    # The "validation" tensors are the training tensors: the reported
    # validation loss is therefore just the training loss before the step.
    training_loop(
        n_epochs = 20000,
        optimizer = optimizer,
        model = seq_model,
        loss_fn = nn.MSELoss(),
        t_u_train = inputs,
        t_u_val = inputs,
        t_c_train = targets,
        t_c_val = targets
    )

    # Results
    to_remap = torch.tensor(predictions_null_sensor['time_to_eruption'].to_numpy() / scale,
                            dtype=torch.double).unsqueeze(1)
    with torch.no_grad():
        # squeeze(1) turns the (n, 1) network output into a flat column.
        remapped = seq_model(to_remap).squeeze(1).numpy() * scale

    remapped_predictions = predictions_null_sensor.copy()
    remapped_predictions['time_to_eruption_remapped'] = remapped

    return remapped_predictions

# Fit and apply the remapping for the sensor-2-null predictions.
predictions_null_sensor_2 = pdf_mapping_model(
    pdf_mapping_df=pdf_mapping_df_sensor_2,
    predictions_null_sensor=predictions_null_sensor_2,
    sensor='sensor_2',
)

In [None]:
# Sensor 2: remapped predictions, original predictions, then train values, in that order.
for sensor_2_series in (
    predictions_null_sensor_2['time_to_eruption_remapped'],
    predictions_null_sensor_2['time_to_eruption'],
    train_predictions_null_sensor_2['time_to_eruption'],
):
    sns.distplot(sensor_2_series)

In [None]:
# Sensor 5: select null-sensor rows, build the quantile table, fit and apply the remap.
predictions_null_sensor_5, train_predictions_null_sensor_5 = segments_by_null_sensors(
    predictions_gt_80, test, train, sensor='sensor_5')
pdf_mapping_df_sensor_5 = get_regression_dataset(
    predictions_null_sensor_5, train_predictions_null_sensor_5)
predictions_null_sensor_5 = pdf_mapping_model(
    pdf_mapping_df_sensor_5, predictions_null_sensor_5, sensor='sensor_5')

# Sensor 9: same pipeline.
predictions_null_sensor_9, train_predictions_null_sensor_9 = segments_by_null_sensors(
    predictions_gt_80, test, train, sensor='sensor_9')
pdf_mapping_df_sensor_9 = get_regression_dataset(
    predictions_null_sensor_9, train_predictions_null_sensor_9)
predictions_null_sensor_9 = pdf_mapping_model(
    pdf_mapping_df_sensor_9, predictions_null_sensor_9, sensor='sensor_9')

# Sensor 10: same pipeline, except that the first train row is left out of the
# quantile table (NOTE(review): the reason for skipping it is not visible here -- confirm).
predictions_null_sensor_10, train_predictions_null_sensor_10 = segments_by_null_sensors(
    predictions_gt_80, test, train, sensor='sensor_10')
train_sensor_10_without_first = train_predictions_null_sensor_10.reset_index(drop=True).iloc[1:]
pdf_mapping_df_sensor_10 = get_regression_dataset(
    predictions_null_sensor_10, train_sensor_10_without_first)
predictions_null_sensor_10 = pdf_mapping_model(
    pdf_mapping_df_sensor_10, predictions_null_sensor_10, sensor='sensor_10')

In [None]:
# Sensor 10: remapped predictions, original predictions, then the train values
# with the first row left out (as in the fit).
for sensor_10_series in (
    predictions_null_sensor_10['time_to_eruption_remapped'],
    predictions_null_sensor_10['time_to_eruption'],
    train_predictions_null_sensor_10.reset_index(drop=True).iloc[1:]['time_to_eruption'],
):
    sns.distplot(sensor_10_series)

In [None]:
def _tag_remapped_column(frame, sensor_no):
    """Rename the generic remapped column so that it names its sensor."""
    return frame.rename(
        columns={'time_to_eruption_remapped': f'time_to_eruption_remapped_sensor_{sensor_no}'}
    )


predictions_null_sensor_2 = _tag_remapped_column(predictions_null_sensor_2, 2)
predictions_null_sensor_5 = _tag_remapped_column(predictions_null_sensor_5, 5)
predictions_null_sensor_9 = _tag_remapped_column(predictions_null_sensor_9, 9)
predictions_null_sensor_10 = _tag_remapped_column(predictions_null_sensor_10, 10)

# Left-join each sensor's remapped values onto the full prediction table;
# segments without a remapped value for a sensor get NaN in that column.
for sensor_no, remapped_frame in (
    (2, predictions_null_sensor_2),
    (5, predictions_null_sensor_5),
    (9, predictions_null_sensor_9),
    (10, predictions_null_sensor_10),
):
    remapped_column = f'time_to_eruption_remapped_sensor_{sensor_no}'
    predictions = predictions.merge(
        remapped_frame[['segment_id', remapped_column]],
        on=['segment_id'],
        how='left',
    )
predictions.head(10)

In [None]:
# predictions['substitute_time_to_eruption'] = np.where(predictions['time_to_eruption_remapped_sensor_10'].notnull(),
#                                                       predictions['time_to_eruption_remapped_sensor_10'],
#                                                       predictions['time_to_eruption'])

# Use the sensor-10 remapped value where one exists, otherwise keep the original prediction.
remapped_sensor_10 = predictions['time_to_eruption_remapped_sensor_10']
predictions['new_time_to_eruption'] = remapped_sensor_10.where(
    remapped_sensor_10.notnull(), predictions['time_to_eruption']
)

# Full table with every mapping, plus a submission-style file holding only the final value.
predictions.to_csv('./predictions_all_mappings.csv', header=True, index=False)
new_submission = predictions[['segment_id', 'new_time_to_eruption']].rename(
    columns={'new_time_to_eruption' : 'time_to_eruption'}
)
new_submission.to_csv('./new_predictions.csv', header=True, index=False)

In [None]:
plt.figure(figsize=(20, 10))

# Every series carries a label so that plt.legend() has entries to display.
# The first series is the remapped prediction for ALL test segments, so the
# title no longer calls it "Test identifiable".
sns.distplot(predictions['new_time_to_eruption'], 50, label='Test remapped')
sns.distplot(predictions['time_to_eruption'], 50, label='Test all')
sns.distplot(predictions_lt_80['time_to_eruption'], 50, label='Test unidentifiable')
sns.distplot(train['time_to_eruption'], 50, label='Train time to eruption')
plt.title('Distribution of predictions for Test segments - Blue: Test remapped, Orange: Test all, Green: Test unidentifiable, Pink: Train time to eruption')
plt.legend()

# Final Results
Note that our prediction distribution above is much flatter and closer to the `time_to_eruption` distribution for the unidentifiable test and the train. Using this distribution remapping for the identifiable test segments I reduce the score from `6e6` -> `5.8e6`. Again, please note I am only using the features from my original notebook.

# Euclidean Distances between Train Segments

It is clear that there are overlapping segments that are not similar. Let's assume that there are then multiple eruptions; if this is true, we should be able to find segments that came from the same eruption and segments that do not come from the same eruption. We can try and find these segments by first finding segments that overlap; these cannot be from the same eruption. Second, we find the 10 nearest (in time to eruption) of the non-overlapping segments. From there we calculate the Euclidean distance between those segments, and finally we construct a weighted graph of the train segments, where the weight is the Euclidean distance between two segments.

The graph is saved in a pickle file in the output of this notebook.

In [None]:
# NOTE(review): 60000 is presumably the length of one segment -- confirm.
train = train.assign(start_time=train['time_to_eruption'] + 60000)

In [None]:
# Order the segments by time to eruption so that row position i is also the
# chronological rank; the index is rebuilt to 0..n-1 to match.
train = train.sort_values(by='time_to_eruption').reset_index(drop=True)

# Both maps are keyed by segment id and filled by the loop further down.
overlapping_segments = dict()
ten_closest_non_overlapping_segments = dict()

def go_backwards(
    i, train,
    segments_overlapping_segments,
    max_gap=60001):
    """Collect the segments just before row ``i`` that overlap it in time.

    Walks from position ``i - 1`` towards position 0 and appends the
    ``segment_id`` of every row whose ``time_to_eruption`` is within
    ``max_gap`` of row ``i``'s. The walk stops at the first row outside that
    window, so ``train`` is expected to be sorted by ``time_to_eruption``.

    Args:
        i: Positional index of the reference row in ``train``.
        train: DataFrame with ``segment_id`` and ``time_to_eruption`` columns.
        segments_overlapping_segments: List to extend; it is mutated in place.
        max_gap: Largest absolute ``time_to_eruption`` difference that still
            counts as overlapping. Defaults to the previously hard-coded 60001.

    Returns:
        The same list object that was passed in, nearest neighbours first.
    """
    # Nothing precedes the first row.
    if i <= 0:
        return segments_overlapping_segments

    # Read the two columns directly instead of building a full row Series with
    # `train.iloc[...]` for every comparison: that costs O(columns) per access
    # and upcasts `segment_id` to float when the frame mixes int and float.
    tte = train['time_to_eruption'].to_numpy()
    segment_ids = train['segment_id'].to_numpy()
    anchor = tte[i]

    j = i - 1
    while j >= 0 and abs(tte[j] - anchor) <= max_gap:
        segments_overlapping_segments.append(segment_ids[j])
        j -= 1
    return segments_overlapping_segments


def go_forwards(
    i, train,
    segments_overlapping_segments,
    max_gap=60001):
    """Collect the segments just after row ``i`` that overlap it in time.

    Walks from position ``i + 1`` towards the last row and appends the
    ``segment_id`` of every row whose ``time_to_eruption`` is within
    ``max_gap`` of row ``i``'s. The walk stops at the first row outside that
    window, so ``train`` is expected to be sorted by ``time_to_eruption``.

    Args:
        i: Positional index of the reference row in ``train``.
        train: DataFrame with ``segment_id`` and ``time_to_eruption`` columns.
        segments_overlapping_segments: List to extend; it is mutated in place.
        max_gap: Largest absolute ``time_to_eruption`` difference that still
            counts as overlapping. Defaults to the previously hard-coded 60001.

    Returns:
        The same list object that was passed in, nearest neighbours first.
    """
    # Read the two columns directly instead of building a full row Series with
    # `train.iloc[...]` for every comparison: that costs O(columns) per access
    # and upcasts `segment_id` to float when the frame mixes int and float.
    tte = train['time_to_eruption'].to_numpy()
    n_rows = len(tte)

    j = i + 1
    # Nothing follows the last row.
    if j >= n_rows:
        return segments_overlapping_segments

    segment_ids = train['segment_id'].to_numpy()
    anchor = tte[i]

    while j < n_rows and abs(tte[j] - anchor) <= max_gap:
        segments_overlapping_segments.append(segment_ids[j])
        j += 1
    return segments_overlapping_segments


# For every train segment, record
#   (a) the segments whose time window overlaps it, and
#   (b) the ten non-overlapping segments nearest to it in time to eruption.
#
# Rows are addressed by position (`train` has a fresh 0..n-1 index here), and
# the two values needed per row are read straight from their columns. Building
# a full row Series via `iterrows()` / `train.iloc[i]` is slow and turns
# `segment_id` into a float whenever the frame mixes int and float columns.
for i in range(train.shape[0]):
    current_segment_id = train['segment_id'].iat[i]
    current_tte = train['time_to_eruption'].iat[i]

    # Overlapping neighbours on both sides of row i (backwards first).
    segments_overlapping_segments = go_backwards(i, train, [])
    segments_overlapping_segments = go_forwards(
        i, train, segments_overlapping_segments
    )

    # Candidates: everything that neither overlaps row i nor is row i itself.
    is_candidate = (
        ~train.segment_id.isin(segments_overlapping_segments)
        & (train.segment_id != current_segment_id)
    )
    ten_closest = train[is_candidate].copy()
    ten_closest['closeness_by_tte'] = np.abs(
        (ten_closest['time_to_eruption'] - current_tte).to_numpy()
    )
    segments_ten_closest_non_overlapping = list(
        ten_closest.sort_values(['closeness_by_tte']).head(10)['segment_id']
    )

    overlapping_segments[current_segment_id] = segments_overlapping_segments
    ten_closest_non_overlapping_segments[current_segment_id] = segments_ten_closest_non_overlapping

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
import networkx as nx


# Table indexed by segment_id; column k (1-based) receives the Euclidean
# distance to that segment's k-th closest non-overlapping neighbour.
train_segment_similarity = pd.DataFrame()
train_segment_similarity['segment_id'] = train.segment_id
train_segment_similarity = train_segment_similarity.set_index('segment_id')

# Feature rows indexed by segment_id, with the two time columns removed.
# Built once up front: re-indexing `train` inside the loop repeats the same
# O(n) work for every (segment, neighbour) pair.
segment_features = (
    train.set_index('segment_id')
    .drop(columns=['time_to_eruption', 'start_time'])
)

train_segment_similarity_graph = nx.Graph()
# Iterate over `segment_features.index` (same ids, same order) rather than over
# the table that is being enlarged inside the loop.
for i in segment_features.index:
    # The reference segment's feature vector is the same for all its neighbours.
    x = segment_features.loc[i].to_numpy().reshape(1, -1)
    for e, j in enumerate(ten_closest_non_overlapping_segments[i]):
        y = segment_features.loc[j].to_numpy().reshape(1, -1)
        distance = euclidean_distances(x, y).flatten()[0]
        train_segment_similarity.loc[i, e + 1] = distance

        # Adding a weighted edge also creates any endpoint that is not yet in
        # the graph, so no explicit add_node calls are needed.
        train_segment_similarity_graph.add_weighted_edges_from(
            [(i, j, distance)]
        )

In [None]:
# Preview the first five rows of the per-segment similarity table.
train_segment_similarity.head(5)

In [None]:
# Open a 20x15 figure before drawing; it becomes pyplot's current figure.
plt.figure(figsize=(20, 15))

# Draw the segment-similarity graph without node labels.
nx.draw(train_segment_similarity_graph, with_labels=False) 

In [None]:
# `nx.write_gpickle` was deprecated in NetworkX 2.6 and removed in 3.0; the
# documented replacement is the standard-library `pickle` module, which writes
# the same kind of file.
import pickle

with open("train_segment_similarity_graph.gpickle", "wb") as graph_file:
    pickle.dump(train_segment_similarity_graph, graph_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# graph object's repr.
train_segment_similarity_graph

# Final Thoughts

- If there are multiple eruptions the holdout set may very well contain an eruption sequence we have not seen in the train data, one where sensor 10 failed for a significant amount of time. This might then also explain the overfitting we are seeing. We overfit on individual eruption sequences that are hidden in our train data. Once we are presented with a new eruption sequence in test our scores go from `2.6e6` -> `6e6`. My guess is that there are about 6-7 individual eruptions in train.

- If you can identify the segments that belong to the same eruption sequence you should CV by holding out an eruption sequence or, even better, time series cross-validate over each subsequent eruption sequence. However, the latter might be difficult because you would then have to determine ordering of the identified eruption sequences

- A good CV model should have a local CV close to the LB CV and it should be higher than `2.6e6` but I imagine that the LB scores will not directly match because the test set does have those segments that are quite unusual. It will be interesting to see how people manage to deal with those.