# Train and test don't overlap

After the experience with the November competition, I wanted to know whether train and test have the same distribution in the December competition. It turns out that a LightGBM classifier can easily separate the two with high accuracy, which means that there is little to no overlap between the regions of the feature space occupied by train and by test.

The decision surface is not linear. We see this because a linear SVM cannot separate train from test: it reaches an accuracy of only 80 %, which is trivial to achieve because 80 % of the whole data is training data (always predicting "train" already scores 80 %).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm
from cycler import cycler
from datetime import datetime
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc
from sklearn.svm import LinearSVC


In [None]:
# Load the competition's train and test tables from the input directory
train_df = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')

# Model inputs: every test column except the row identifier and the label
non_feature_columns = ('Id', 'Cover_Type')
features = [col for col in test_df.columns if col not in non_feature_columns]


Now we try to classify the data into training and test, using two different classifiers.

In [None]:
%%time
# Start by defining the classification target, 'istest'
train_df['istest'] = False
test_df['istest'] = True

# Concatenate the two datasets and shuffle
both_df = pd.concat([train_df, test_df])
both_df = both_df.sample(frac=1)

# Train and validate the two classifiers (LightGBM and LinearSVC)
score_list = []
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, val_idx) in enumerate(kf.split(both_df, y=both_df.istest)):
    print(f"Fold {fold}")
    X_tr = both_df.iloc[train_idx]
    X_va = both_df.iloc[val_idx]
    y_tr = X_tr.istest
    y_va = X_va.istest
    X_tr = X_tr[features]
    X_va = X_va[features]

    model = lightgbm.LGBMClassifier(n_estimators=600, learning_rate=0.7)
    model.fit(X_tr, y_tr)
    
    y_va_pred = model.predict_proba(X_va)[:,1]
    acc = accuracy_score(y_va, y_va_pred > 0.5)
    auc_ = roc_auc_score(y_va, y_va_pred)
    print(f"Accuracy {acc:.5f}  AUC {auc_:.5f} {model}")
    score_list.append((acc, auc_))
    
    plt.figure(figsize=(8, 8))
    fpr, tpr, _ = roc_curve(y_va, y_va_pred)
    plt.plot(fpr, tpr, color='r', lw=1, label=f"LGBM (auc = {auc(fpr, tpr):.5f})") # roc curve
    plt.plot([0, 1 - (acc*len(both_df) - len(test_df)) / len(train_df)], # iso-accuracy line
             [(acc*len(both_df) - len(train_df)) / len(test_df), 1], color='r', lw=1, linestyle=':')

    model2 = make_pipeline(StandardScaler(), LinearSVC(dual=False))
    model2.fit(X_tr, y_tr)
    y_va_pred2 = model2.decision_function(X_va) + 0.5 # for LinearSVC
    acc2 = accuracy_score(y_va, y_va_pred2 > 0.5)
    print(f"Accuracy {acc2:.5f}  AUC {roc_auc_score(y_va, y_va_pred2):.5f} {model2}")
    fpr, tpr, _ = roc_curve(y_va, y_va_pred2)
    plt.plot(fpr, tpr, color='orange', lw=1, label=f"LinearSVC (auc = {roc_auc_score(y_va, y_va_pred2):.5f})") # roc curve
    plt.plot([0, 1 - (acc2*len(both_df) - len(test_df)) / len(train_df)], # iso-accuracy line
             [(acc2*len(both_df) - len(train_df)) / len(test_df), 1], color='orange', lw=1, linestyle=':')
    del model2

    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.gca().set_aspect('equal')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic")
    plt.legend(loc="lower right")
    plt.show()
    
    break # One fold is enough
    
train_df['weight'] = model.predict_proba(train_df[features])[:,1]


The histogram shows that there is very little overlap between train and test data (if the regions overlapped in feature space, the histogram areas would overlap as well). The blue area is larger than the other one because there is four times as much training data as test data:

In [None]:
# Build a colour cycle from the default colours with '80' appended to each
# (a hex alpha suffix), so that the two histograms stay visible where they overlap
prop_cycle = plt.rcParams['axes.prop_cycle']
default_colors = prop_cycle.by_key()['color']
custom_cycler = cycler(color=[col + '80' for col in default_colors])
ax = plt.gca()
ax.set_prop_cycle(custom_cycler)

# One histogram of predicted 'istest' probabilities per true origin of the
# validation rows: training rows first, then test rows
for origin_mask, origin_label in ((~y_va, 'train'), (y_va, 'test')):
    origin_proba = model.predict_proba(X_va[features][origin_mask])[:,1]
    ax.hist(origin_proba, bins=100, label=origin_label)
ax.legend()
plt.show()


The first tree of the model starts at Wilderness_Area3, where the difference between train and test is most conspicuous, and this feature gets the highest importance:

In [None]:
# Draw a tree of the fitted LightGBM booster (plot_tree's default tree) on a
# large canvas so that the split labels stay readable
plt.figure(figsize=(24, 24))
lightgbm.plot_tree(model.booster_, orientation='vertical', ax=plt.gca())
# pyplot.show accepts only the keyword argument `block`; call it without
# positional arguments so the cell works on every backend
plt.show()

# Rank the features by gain-based importance
plt.figure(figsize=(12, 14))
lightgbm.plot_importance(model.booster_, importance_type='gain', ax=plt.gca())
plt.show()
    
