To avoid leakage, every occurrence of the same text must be placed in the same fold.  
For a single document this is easy, but keeping both documents of a pair in the same fold is trickier, because a text can appear in several pairs.  

This simple notebook follows the text pairs recursively to group connected texts, and then uses those groups to build a leak-free fold split.

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

In [None]:
# Number of folds handed to GroupKFold.
n_splits = 5
# Row limit forwarded to pd.read_csv; None reads the whole file.
nrows = None

In [None]:
# Load the comparison pairs and give every distinct text an integer id.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv", nrows=nrows)

# dict.fromkeys de-duplicates while keeping first-appearance order, so the ids
# (and the group ids derived from them) are identical on every run.
# Iterating a set of strings is not reproducible, because str hashing is
# randomized per interpreter process.
texts = list(dict.fromkeys(df.less_toxic.to_list() + df.more_toxic.to_list()))
text2id = {text: idx for idx, text in enumerate(texts)}

df['less_id'] = df['less_toxic'].map(text2id)
df['more_id'] = df['more_toxic'].map(text2id)
df

In [None]:
# Square boolean matrix over text ids that records which texts were paired.
# Each pair is stored once, at [larger id, smaller id].
len_ids = len(text2id)
idarr = np.zeros((len_ids, len_ids), dtype=bool)

for first, second in df[['less_id', 'more_id']].to_numpy():
    if first >= second:
        row, col = first, second
    else:
        row, col = second, first
    idarr[row, col] = True

In [None]:
def add_ids(i, this_list):
    """Collect every text id connected to ``i`` through pair relations.

    Starting at ``i``, follows the pairs recorded in the module-level
    ``idarr`` matrix, appends each newly reached id to ``this_list`` and sets
    every visited ``idarr`` entry to False so that no pair is followed twice.

    The traversal uses an explicit stack rather than recursion, so a long
    chain of pairs cannot raise ``RecursionError``.

    Args:
        i: id of the text to start from.
        this_list: ids collected so far (normally ``[i]``); extended in place.

    Returns:
        ``this_list``; every id reachable from ``i`` that was not already in
        it has been appended once.
    """
    seen = set(this_list)
    seen.add(i)
    pending = [i]
    while pending:
        node = pending.pop()
        # Pairs are stored at idarr[larger id, smaller id]: partners found in
        # row `node` come from the row scan, partners with a larger id from
        # the part of column `node` below the diagonal.
        in_row = np.flatnonzero(idarr[node, :])
        in_col = np.flatnonzero(idarr[node + 1:, node]) + node + 1
        idarr[node, in_row] = False
        idarr[in_col, node] = False
        for partner in np.concatenate((in_row, in_col)).tolist():
            if partner not in seen:
                seen.add(partner)
                this_list.append(partner)
                pending.append(partner)
    return this_list

# Build one group per connected set of texts.
# add_ids clears the idarr entries it visits, so an id that still has an entry
# in its column has not been assigned to a group yet.
group_list = []
for i in tqdm(range(len_ids)):
    # Start the column scan at the diagonal so that a text paired only with
    # itself still gets a group (otherwise its group id would be NaN).
    if idarr[i:, i].any():
        group_list.append(add_ids(i, [i]))

# Map every text id to the index of the group that contains it.
id2groupid = {}
for gid, ids in enumerate(group_list):
    for text_id in ids:
        id2groupid[text_id] = gid

df['less_gid'] = df['less_id'].map(id2groupid)
df['more_gid'] = df['more_id'].map(id2groupid)
df

In [None]:
# Report the number of distinct texts and the number of groups built from them.
for label, count in (
    ('unique text counts:', len_ids),
    ('grouped text counts:', len(group_list)),
):
    print(label, count)

In [None]:
# Assign each row to a validation fold, using the text group id as the
# GroupKFold group so that a group is never split across folds.
group_kfold = GroupKFold(n_splits=n_splits)

# less_gid and more_gid are expected to be equal on every row (paired texts are
# placed in the same group), so less_gid alone is used as the group label.
fold_splits = group_kfold.split(X=df, y=df, groups=df["less_gid"])
for fold_no, (_, val_rows) in enumerate(fold_splits):
    df.loc[val_rows, "fold"] = fold_no

# The column is created by partial assignment, so cast it to int afterwards.
df["fold"] = df["fold"].astype(int)
df