# Improve the train-test split with a hashing function

## Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from zlib import crc32

## Illustrating the issue

In [2]:
# Seeded generator so the synthetic data is identical on every
# Restart & Run All (the global np.random state is never seeded in this notebook).
SEED = 42
rng = np.random.default_rng(SEED)

# generate the first DataFrame (1000 observations)
X_1 = pd.DataFrame(data={"variable": rng.normal(size=1000)})

# apply the train-test split
X_1_train, X_1_test = train_test_split(X_1, test_size=0.2, random_state=SEED)

# add new observations to the DataFrame; ignore_index renumbers the rows
# 0..1499, so labels 0..999 still identify the original observations
X_new = pd.DataFrame(data={"variable": rng.normal(size=500)})
X_2 = pd.concat([X_1, X_new], ignore_index=True)

# again, apply the train-test split (same random_state) to the updated DataFrame
X_2_train, X_2_test = train_test_split(X_2, test_size=0.2, random_state=SEED)

# see what is the overlap of indices: a stable split would keep all 800 train
# rows and all 200 test rows of the first split in the same set the second time
print(f"Train set: {len(set(X_1_train.index).intersection(set(X_2_train.index)))}")
print(f"Test set: {len(set(X_1_test.index).intersection(set(X_2_test.index)))}")


Train set: 669
Test set: 59


## Solving the issue

In [3]:
def _identifier_crc32(identifier):
    """Return the unsigned 32-bit CRC-32 checksum of one identifier value."""
    try:
        # Anything NumPy can convert to int64 is hashed through its 8-byte
        # int64 representation.
        return crc32(np.int64(identifier))
    except (TypeError, ValueError, OverflowError):
        # Identifiers that are not integer-like (e.g. "user-17", UUID strings)
        # are hashed through their raw bytes instead.
        if isinstance(identifier, str):
            return crc32(identifier.encode("utf-8"))
        if isinstance(identifier, (bytes, bytearray)):
            return crc32(bytes(identifier))
        raise


def hashed_train_test_split(df, index_col, test_size=0.2):
    """
    Train-test split based on the hash of the unique identifier.

    A row goes to the test set when the CRC-32 checksum of its identifier is
    below ``test_size * 2**32``. The assignment depends only on the identifier
    value, so a row keeps its set when more rows are added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    index_col : str
        Name of the column holding the identifier of each row. Identifiers
        should be unique and immutable. Integer-like values and, as a
        fallback, ``str`` / ``bytes`` values are supported.
    test_size : float, default 0.2
        Target fraction of rows assigned to the test set, between 0 and 1.
        The realised fraction is only approximately equal to this value,
        because it depends on how the hashes are distributed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` — the rows of ``df`` outside and inside the test
        set, in their original order and with their original row labels.

    Raises
    ------
    ValueError
        If ``test_size`` is not within [0, 1].
    """
    # An absolute row count (e.g. 200) would silently send every row to the
    # test set, so reject anything that is not a fraction.
    if not 0 <= test_size <= 1:
        raise ValueError(
            f"test_size must be a fraction between 0 and 1, got {test_size!r}"
        )

    # crc32 yields values in [0, 2**32), so this cut-off selects roughly a
    # `test_size` share of the hash range.
    threshold = test_size * 2**32
    hashes = df[index_col].apply(_identifier_crc32)
    in_test = (hashes < threshold).astype(bool)

    return df.loc[~in_test], df.loc[in_test]


In [4]:
# create an index column (should be immutable and unique)
# The guard keeps this cell idempotent: calling reset_index again on a frame
# that already has an "index" column would add further "level_0"-style columns
# on a re-run instead of leaving the frame unchanged.
X_1, X_2 = (
    frame if "index" in frame.columns else frame.reset_index(drop=False)
    for frame in (X_1, X_2)
)

# apply the improved train-test split
X_1_train_hashed, X_1_test_hashed = hashed_train_test_split(X_1, "index")
X_2_train_hashed, X_2_test_hashed = hashed_train_test_split(X_2, "index")

# every original observation must keep its assignment after new data arrives
assert set(X_1_train_hashed["index"]) <= set(X_2_train_hashed["index"])
assert set(X_1_test_hashed["index"]) <= set(X_2_test_hashed["index"])

# see what is the overlap of indices
print(f"Train set: {len(set(X_1_train_hashed.index).intersection(set(X_2_train_hashed.index)))}")
print(f"Test set: {len(set(X_1_test_hashed.index).intersection(set(X_2_test_hashed.index)))}")

Train set: 800
Test set: 200
