In [None]:
import os
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
import numpy as np

# Set the WANDB_DISABLED environment variable to "true" for this process.
# NOTE(review): presumably this switches off Weights & Biases logging in
# libraries that honor the variable — nothing in this notebook reads it; confirm.
os.environ["WANDB_DISABLED"] = "true"

# Configurations

In [None]:
class CFG:
    """Static settings shared by the cells of this notebook."""

    # Directory prefix for the competition data; 'train.csv' is read from here.
    # The trailing slash matters because paths are built by plain concatenation.
    input_path = "../input/us-patent-phrase-to-phrase-matching/"

    # Directory prefix for the CPC code metadata; 'titles.csv' is read from here.
    context_path = "../input/cpc-codes/"

    # Number of cross-validation folds handed to create_folds.
    num_fold = 5

# Preprocessing

In [None]:
# Load the training pairs and the CPC code titles.
train = pd.read_csv(CFG.input_path + "train.csv")
titles = pd.read_csv(CFG.context_path + "titles.csv")

# Attach the CPC metadata to each training row by matching the row's
# `context` against the titles' `code` (default inner join).
train = pd.merge(train, titles, left_on="context", right_on="code")

# Build the text input: CPC title and anchor joined by a literal '[SEP]' marker.
train['input'] = train['title'] + '[SEP]' + train['anchor']

# Drop the columns that are no longer needed once `input` exists.
train = train.drop(
    [
        "context", "code", "class", "subclass", "group",
        "main_group", "anchor", "title", "section",
    ],
    axis=1,
)

# Replicative sampling (upsampling with replacement)

In [None]:
def up_sample(dataset, target, factor, seed=None):
    """Replicate the rows whose ``score`` equals ``target``.

    Rows matching ``target`` are drawn at random with replacement from
    ``dataset`` itself and appended after the original rows. The input frame
    is left untouched; a new frame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``score`` column. Any index is accepted because rows are
        selected by position, not by label.
    target : float
        Score value whose rows are replicated (compared with ``==``).
    factor : float
        Growth factor for the matching rows, must be >= 1. Exactly
        ``int(n_matching * (factor - 1))`` extra rows are appended.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, the
        global ``numpy.random`` state is used (so ``np.random.seed`` applies).

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the sampled duplicates, re-indexed
        ``0 .. len - 1``.
    """
    assert factor >= 1, "factor must be >= 1"

    # Positions (not labels) of the matching rows, so .iloc below is always valid.
    positions = np.flatnonzero((dataset['score'] == target).to_numpy())
    n_extra = int(len(positions) * (factor - 1))

    # Nothing to add: no matching rows, or the factor rounds down to zero extras.
    if n_extra == 0:
        return dataset.reset_index(drop=True)

    rng = np.random if seed is None else np.random.RandomState(seed)
    chosen = rng.choice(positions, n_extra)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([dataset, dataset.iloc[chosen]], ignore_index=True)

# Seed NumPy's global RNG so the random draws made inside up_sample are the
# same on every Restart & Run All.
np.random.seed(42)

# Replicate the score == 1.0 rows with factor 3, then the score == 0.75 rows
# with factor 1.5.
train = up_sample(train, 1.0, 3)
train = up_sample(train, 0.75, 1.5)

# K-folds

In [None]:
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Assign each row to a stratified cross-validation fold.

    The continuous ``score`` column is cut into ``num_bins`` equal-width bins
    and those bin labels are used as the stratification target, so every fold
    receives a similar score distribution. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``score`` column. Any index is accepted.
    num_splits : int
        Number of folds.
    num_bins : int, optional
        Number of equal-width score bins used for stratification (default 5).
    seed : int, optional
        ``random_state`` passed to ``StratifiedKFold`` (default 42).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an integer ``fold`` column holding the index
        (``0 .. num_splits - 1``) of the fold in which the row is validation data.
    """
    # Work on a copy so the caller's frame gains no 'fold' or 'bins' column.
    data = data.copy()

    # Stratify on binned scores rather than on the raw targets.
    bins = pd.cut(data["score"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # split() yields positional indices, so collect the folds in a plain array
    # instead of going through the label-based .loc indexer.
    fold = np.full(len(data), -1, dtype=np.int64)
    for fold_id, (_, valid_pos) in enumerate(kf.split(X=data, y=bins.to_numpy())):
        fold[valid_pos] = fold_id

    data["fold"] = fold
    return data

# Assign every row to one of CFG.num_fold stratified folds.
train = create_folds(train, CFG.num_fold)

# Derive the filename from the configured fold count so it cannot drift from
# CFG.num_fold (this is 'train_5_folds.csv' for the current setting).
# The DataFrame index is written as the first, unnamed CSV column.
train.to_csv(f'train_{CFG.num_fold}_folds.csv')