In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the "../input" directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'sample_submission.csv', 'test.csv']


The statistics of training set and test set are very similar.

However, one thing that caught my eye was the fact that the distribution of the number of unique values (across features) is significantly different between training set and test set.

It seems that the test set consists of real samples as well as synthetic samples that were generated by sampling the real samples' feature distributions (these are probably the "rows which are not included in scoring").

If this is correct, then finding out which samples are synthetic and which are real should be a relatively easy task:

Given a sample, we can go over its features and check if the feature value is unique.
If at least one of the sample's features is unique, then the sample must be a real sample.
It turns out that if a given sample has no unique values then it is a synthetic sample.
(It doesn't have to be like that, but in this dataset the probability that this would not be the case seems to be too low.)



In [2]:
test_path = '../input/test.csv'

# Load the test features as a plain NumPy array. The ID column is dropped
# because it is an identifier, not a feature. Chaining the drop avoids mutating
# a DataFrame in place, so the cell can be re-run safely.
df_test = pd.read_csv(test_path).drop(columns=['ID_code']).values

# unique_count[i, j] is 1 when row i holds the only occurrence of its value in
# feature j, and 0 otherwise. A 0/1 flag fits in uint8, so the matrix no longer
# inherits the (presumably float64) dtype of df_test as zeros_like would.
unique_count = np.zeros(df_test.shape, dtype=np.uint8)
for feature in tqdm(range(df_test.shape[1])):
    # index_ holds the row of the first occurrence of each distinct value; for
    # a value that appears exactly once, that is the only row containing it.
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] = 1

# Number of features in which each sample holds a unique value (computed once
# and reused for both index arrays below).
unique_per_sample = unique_count.sum(axis=1)

# Samples with at least one unique value are treated as real; the others as synthetic.
real_samples_indexes = np.flatnonzero(unique_per_sample > 0)
synthetic_samples_indexes = np.flatnonzero(unique_per_sample == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


100000
100000



If the split between private and public LB sets was done before the resampling process of generating synthetic samples, then it's also possible to regenerate the two different sets.
For each synthetic sample, we can go over its features and capture those features whose value has exactly one matching instance in the real samples set; that instance has to be one of the sample's generators.



In [3]:
# Split the test matrix into the rows flagged as real and the rows flagged as
# synthetic. Indexing with an integer index array already returns a new array
# (not a view), so no extra .copy() is needed.
df_test_real = df_test[real_samples_indexes]
df_test_unreal = df_test[synthetic_samples_indexes]

In [4]:
# Row indexes are integers: write them with '%d' instead of savetxt's default
# '%.18e', which would store e.g. 3 as 3.000000000000000000e+00.
np.savetxt('real_samples_indexes_test.csv', real_samples_indexes, delimiter=',', fmt='%d')
np.savetxt('synthetic_samples_indexes_test.csv', synthetic_samples_indexes, delimiter=',', fmt='%d')

# The feature matrices keep savetxt's default full-precision float format.
np.savetxt('df_test_real.csv', df_test_real, delimiter=',')
np.savetxt('df_test_unreal.csv', df_test_unreal, delimiter=',')

In [5]:
train_path = '../input/train.csv'

# Load the training features as a plain NumPy array. The target and ID columns
# are dropped so that only feature columns remain. Chaining the drop avoids
# mutating a DataFrame in place, so the cell can be re-run safely.
df_train = pd.read_csv(train_path).drop(columns=['target', 'ID_code']).values

# unique_count[i, j] is 1 when row i holds the only occurrence of its value in
# feature j, and 0 otherwise. A 0/1 flag fits in uint8, so the matrix no longer
# inherits the (presumably float64) dtype of df_train as zeros_like would.
unique_count = np.zeros(df_train.shape, dtype=np.uint8)
for feature in tqdm(range(df_train.shape[1])):
    # index_ holds the row of the first occurrence of each distinct value; for
    # a value that appears exactly once, that is the only row containing it.
    _, index_, count_ = np.unique(df_train[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] = 1

# Number of features in which each sample holds a unique value (computed once
# and reused for both index arrays below).
unique_per_sample = unique_count.sum(axis=1)

# Samples with at least one unique value are treated as real; the others as synthetic.
real_samples_indexes = np.flatnonzero(unique_per_sample > 0)
synthetic_samples_indexes = np.flatnonzero(unique_per_sample == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


200000
0


In [6]:
# Split the training matrix into the rows flagged as real and the rows flagged
# as synthetic. Indexing with an integer index array already returns a new
# array (not a view), so no extra .copy() is needed.
df_train_real = df_train[real_samples_indexes]
df_train_unreal = df_train[synthetic_samples_indexes]

In [7]:
# Row indexes are integers: write them with '%d' instead of savetxt's default
# '%.18e', which would store e.g. 3 as 3.000000000000000000e+00.
np.savetxt('real_samples_indexes_train.csv', real_samples_indexes, delimiter=',', fmt='%d')
np.savetxt('synthetic_samples_indexes_train.csv', synthetic_samples_indexes, delimiter=',', fmt='%d')

# The feature matrices keep savetxt's default full-precision float format.
np.savetxt('df_train_real.csv', df_train_real, delimiter=',')
np.savetxt('df_train_unreal.csv', df_train_unreal, delimiter=',')