This notebook is a reproducible version of [columbia2131](https://www.kaggle.com/columbia2131)'s leak-free CV strategy.  
I found that the [original code](https://www.kaggle.com/columbia2131/jigsaw-cv-strategy-by-union-find) does not split the data into the same folds on every run, because it relies on `set()`, whose iteration order is not stable between runs.  
For reproducibility, this notebook uses `pd.Series.unique()` and `np.unique()` instead.

Reference (the original authors):
* https://www.kaggle.com/columbia2131/jigsaw-cv-strategy-by-union-find
* https://www.kaggle.com/its7171/jigsaw-cv-strategy

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

In [None]:
# Fixed random seed, passed as `random_state` when shuffling the combined data
# so that the row order (and therefore the fold split) is repeatable.
SEED = 42

In [None]:
class UnionFind():
    """Disjoint-set forest over the integers ``0 .. n-1``.

    ``parents[i]`` is the parent index of ``i`` when it is non-negative.
    A negative entry marks ``i`` as the root of its set, and the magnitude
    of that entry is the number of elements in the set.
    """

    def __init__(self, n):
        """Create ``n`` singleton sets (every element is its own root)."""
        self.n = n
        self.parents = [-1] * n

    def find(self, x):
        """Return the root of the set containing ``x``.

        Every node visited on the way up is re-pointed directly at the
        root (path compression), so later lookups are shorter.
        """
        # First pass: climb to the root.
        root = x
        while self.parents[root] >= 0:
            root = self.parents[root]

        # Second pass: re-point each node on the path straight at the root.
        node = x
        while self.parents[node] >= 0:
            above = self.parents[node]
            self.parents[node] = root
            node = above

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already merged)."""
        keep, absorb = self.find(x), self.find(y)
        if keep != absorb:
            # Sizes are stored negated, so the larger set has the smaller
            # value; make sure the larger set is the one that is kept.
            if self.parents[keep] > self.parents[absorb]:
                keep, absorb = absorb, keep
            self.parents[keep] += self.parents[absorb]
            self.parents[absorb] = keep


def get_group_unionfind(train: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``train`` with a leak-free ``group`` column.

    Each row pairs a ``less_toxic`` text with a ``more_toxic`` text. Two
    texts end up in the same group when they are connected through any
    chain of such pairs, so grouping a CV split on ``group`` keeps every
    text on one side of the split.

    Args:
        train: Frame with string columns ``less_toxic`` and ``more_toxic``.

    Returns:
        A new DataFrame with the same rows, index and columns as ``train``
        plus an integer ``group`` column. The input frame is not modified.
    """
    less_text = train['less_toxic']
    more_text = train['more_toxic']

    # np.unique returns a sorted array, so the text -> id mapping (and hence
    # the resulting group ids) is identical on every run.
    unique_text = np.unique(
        np.hstack([less_text.unique(), more_text.unique()])
    ).tolist()
    text2num = {text: i for i, text in enumerate(unique_text)}

    # Link the two texts of every pair. The ids are kept in local Series
    # rather than written onto the caller's frame as helper columns.
    uf = UnionFind(len(unique_text))
    for less_id, more_id in zip(less_text.map(text2num), more_text.map(text2num)):
        uf.union(less_id, more_id)

    # Both texts of a row share a root, so looking up either one is enough.
    text2group = {text: uf.find(i) for text, i in text2num.items()}

    grouped = train.copy()
    grouped['group'] = less_text.map(text2group)
    return grouped

In [None]:
# Load the competition validation pairs and a second file of pairs
# (presumably back-translated augmentations of the same data, judging by
# the dataset path -- confirm against the dataset description).
train = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
train_backtranslated = pd.read_csv("../input/back-translation-offline-for-data-augmentation/validation_data_bt.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame.
train = pd.concat([train, train_backtranslated], ignore_index=True)
train.head()

# Prepare Ruddit Data

In [None]:
df_ruddit = pd.read_csv("/kaggle/input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
# Drop placeholder rows. The explicit .copy() makes the filtered frame
# independent, so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
df_ruddit = df_ruddit[df_ruddit["txt"] != "[deleted]"].copy()

# Min-max scale the offensiveness score to the range [0, 1].
score = df_ruddit["offensiveness_score"]
df_ruddit["offensiveness_score"] = (score - score.min()) / (score.max() - score.min())

# For every comment, pair it with two comments that are at least 0.3 less
# offensive and two that are at least 0.3 more offensive (when at least four
# such candidates exist). Four candidates are sampled and the first two are
# used; the sampling seed is derived from the row's index label so the
# result is repeatable.
comment_pairs = []
for index, row in df_ruddit.iterrows():
    low_toxic_df = df_ruddit[df_ruddit["offensiveness_score"] <= (row["offensiveness_score"] - 0.3)]
    if len(low_toxic_df) >= 4:
        low_toxic = low_toxic_df.sample(n=4, random_state=index + 1).reset_index(drop=True)
        comment_pairs.append((low_toxic["txt"][0], row["txt"]))
        comment_pairs.append((low_toxic["txt"][1], row["txt"]))
    more_toxic_df = df_ruddit[df_ruddit["offensiveness_score"] >= (row["offensiveness_score"] + 0.3)]
    if len(more_toxic_df) >= 4:
        more_toxic = more_toxic_df.sample(n=4, random_state=index + 2).reset_index(drop=True)
        comment_pairs.append((row["txt"], more_toxic["txt"][0]))
        comment_pairs.append((row["txt"], more_toxic["txt"][1]))

# Each tuple is (less_toxic text, more_toxic text).
df_ruddit_final = pd.DataFrame(comment_pairs, columns=["less_toxic", "more_toxic"])
df_ruddit_final

# Prepare toxic classification data

In [None]:
# df_classification = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv")
# df_classification.head()

In [None]:
## Overlapping comments

### Total unique comments in severity data
# df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# print(df_val.shape)
# tot_unique_comments = np.unique(np.concatenate([df_val["less_toxic"], df_val["more_toxic"]]))
# print("total unique: ", len(tot_unique_comments))


# # Find cases already present in toxic data

# df_val_1 = pd.merge(df_val, df_classification.loc[:,['comment_text']], 
#                   left_on = 'less_toxic', 
#                   right_on = 'comment_text', how='inner')
# # print(df_val_1.shape)

# df_val_2 = pd.merge(df_val, df_classification.loc[:,['comment_text']], 
#                   left_on = 'more_toxic', 
#                   right_on = 'comment_text', how='inner')
# # print(df_val_2.shape)

# tot_unique_common = np.unique(np.concatenate([df_val_1["comment_text"], df_val_2["comment_text"]]))
# print("total common: ", len(tot_unique_common))

# # Removing those cases
# df_classification_u = df_classification[~df_classification["comment_text"].isin(tot_unique_common)]
# print("total uncommon :", len(df_classification_u) )

In [None]:
# df_classification_u["neutral"] = 1 - df_classification_u[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].max(axis=1)
# more_toxic = df_classification_u[df_classification_u[["severe_toxic","threat", "toxic"]].max(axis=1)>=2]["comment_text"]
# less_toxic = df_classification_u[df_classification_u["neutral"]==1].sample(n = 10*len(more_toxic), random_state = SEED)
# len(less_toxic), len(more_toxic)

In [None]:
# more_toxic = more_toxic.repeat(5)

# for l_t, m_t in zip(less_toxic, more_toxic):
#     comment_pairs.append((l_t,m_t))

# Prepare Jigsaw Unintended Bias Data

In [None]:
# pd.set_option("display.max_columns",500)
# df_multi = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
# df_multi.head()

In [None]:
# df_multi["identity_associated"] = df_multi.iloc[:,8:-13].sum(axis=1)

In [None]:
# df_multi["neutral"] = df_multi[["toxic","severe_toxicity","obscene","threat","insult","identity_attack"]].sum(axis=1)==0

# more_toxic_1 = df_multi[df_multi[["severe_toxicity","threat"]].sum(axis=1)>0.2]["comment_text"]
# more_toxic_2 = df_multi[df_multi["toxic"]>=0.8]["comment_text"]
# more_toxic_3 = df_multi[df_multi["identity_attack"]>=0.8]["comment_text"]
# more_toxic = np.unique(np.concatenate([more_toxic_1, more_toxic_2, more_toxic_3]))

# less_toxic_1 = df_multi.loc[((df_multi["neutral"]==1) & (df_multi["identity_associated"]==0)),:].sample(n = 4*len(more_toxic), random_state = SEED)["comment_text"]
# less_toxic_2 = df_multi.loc[((df_multi["neutral"]==1) & (df_multi["identity_associated"]>0)),:].sample(n = len(more_toxic), random_state = SEED)["comment_text"]

# less_toxic = np.concatenate([less_toxic_1, less_toxic_2])
# len(less_toxic), 5*len(more_toxic)

In [None]:
# more_toxic = more_toxic.repeat(5)

# for l_t, m_t in zip(less_toxic, more_toxic):
#     comment_pairs.append((l_t,m_t))

In [None]:
# Number of (less_toxic, more_toxic) pairs collected so far.
len(comment_pairs)

# Combining all Together

In [None]:
# Stack the loaded validation pairs and the generated pairs into one frame
# that has only the two text columns, then print a summary of it.
pair_columns = ["less_toxic", "more_toxic"]
df_2 = pd.DataFrame(comment_pairs, columns=pair_columns)
combined_data = pd.concat(
    [train[pair_columns], df_2],
    ignore_index=True,
)
combined_data.info()

In [None]:
# Shuffle all rows reproducibly. NOTE: sample() keeps the original index
# labels, so after this line the index is a permutation of 0..n-1 and row
# labels no longer match row positions.
train = combined_data.sample(frac=1, random_state=SEED)

In [None]:
# Preview the first rows of the shuffled frame.
train.head()

In [None]:
%%time
# Add a `group` column: comments linked through any chain of
# (less_toxic, more_toxic) pairs receive the same group id.
train = get_group_unionfind(train)

In [None]:
# Preview the first rows, now including the `group` column.
train.head()

In [None]:
group_kfold = GroupKFold(n_splits=7)

# GroupKFold.split yields *positional* row indices. `train` is shuffled
# earlier in this notebook without resetting its index, so its index labels
# do not match row positions; a label-based `train.loc[val_idx, ...]` would
# therefore write each fold onto the wrong rows and split groups across
# folds. Collect the fold ids in a positional array instead.
fold_of_row = np.zeros(len(train), dtype=int)
for fold, (_, val_idx) in enumerate(group_kfold.split(train, groups=train['group'])):
    # Every row is in exactly one validation fold, so each slot is set once.
    fold_of_row[val_idx] = fold

# Assigning an array attaches the values by position, independent of the index.
train["fold"] = fold_of_row
train.to_csv('train_noleak.csv', index=False)
display(train)