# Get splits for model training

Here we split both of the cleaned MuMiN medium and large subsets in preparation for model training. We'll create two types of splits:

- Hold out, i.e., Train/test/validation splits (80:10:10)
- Cross validation, i.e., Train/test splits (85:15)

The reasons for the split ratios are as follows:

- For the hold-out train/test/validation sets, this is the ratio used in the code written to train the baseline models for the Tweet classification task on
  random splits.
- For the cross-validation train/test sets, this ratio ensures that a train/validation/test ratio similar to that of the regular hold-out sets is maintained
  throughout model training when using 5-fold cross-validation.

As stated, the train/test splits will be used for K-Fold Cross-validation. The expectation here is that cross-validation will be much more effective for the subset obtained from the medium version of the MuMiN dataset since it has fewer records compared to the subset obtained from the large version.

## Load the datasets

In [1]:
from pathlib import Path
import warnings
import numpy as np
import pandas as pd

# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")

# Where the cleaned datasets live, and the column names applied to each of them
data_dir = Path("../data")
new_colnames = ["text", "label", "lang"]


def _load_matching(name_fragment):
    """Read each entry of ``data_dir`` whose file name contains ``name_fragment``.

    Every match is parsed with ``pd.read_csv`` and its columns are renamed to
    ``new_colnames``. Frames are returned in the order ``iterdir()`` yields them.
    """
    frames = []
    for path in data_dir.iterdir():
        if name_fragment in path.name:
            frames.append(pd.read_csv(path).set_axis(new_colnames, axis="columns"))
    return frames


# NOTE(review): the unpacking below relies on the directory listing order to
# decide which file lands in which variable - confirm it matches the intended
# IndoBERTweet / hashtag / IndoBERT assignment on the target filesystem.
mumin_med_ibertweet, mumin_med_ibert_hashtags, mumin_med_ibert = _load_matching("mumin_medium-id_trans-indo")
mumin_large_ibertweet, mumin_large_ibert_hashtags, mumin_large_ibert = _load_matching("mumin_large-trans-indo")

## Hold-out

Note that for each type of split we will create separate splits for each of the cleaned datasets. As a reminder, there are three different datasets:

- IndoBERT with hashtags left intact
- IndoBERT with hashtags converted to a generic tag
- IndoBERTweet


### Get label ratios for each dataset

In [2]:
# Print the mean of the label column, as a percentage, for each medium dataset
model_labels = ["IndoBERTweet", "IndoBERT", "IndoBERT with hashtags"]
mumin_medium = [mumin_med_ibertweet, mumin_med_ibert, mumin_med_ibert_hashtags]
for label, df in zip(model_labels, mumin_medium):
    pct = round(100 * df["label"].mean(), 2)
    print(f"Label ratio for {label} medium: {pct}%")

Label ratio for IndoBERTweet medium: 94.98%
Label ratio for IndoBERT medium: 94.98%
Label ratio for IndoBERT with hashtags medium: 94.98%


In [3]:
# Print the mean of the label column, as a percentage, for each large dataset
mumin_large = [mumin_large_ibertweet, mumin_large_ibert, mumin_large_ibert_hashtags]
for label, df in zip(model_labels, mumin_large):
    pct = round(100 * df["label"].mean(), 2)
    print(f"Label ratio for {label} large: {pct}%")

Label ratio for IndoBERTweet large: 95.31%
Label ratio for IndoBERT large: 95.31%
Label ratio for IndoBERT with hashtags large: 95.31%


### Get splits

In [4]:
from sklearn.model_selection import train_test_split

# Define function to get stratified hold-out splits
def strat_holdout(df, features, target, test_size=0.3, random_state=1):
    """Split ``df`` into stratified train, test and validation frames.

    ``test_size`` of the rows is held out first; that held-out part is then
    divided in half between the test and validation sets. Both steps stratify
    on the target column and reuse the same ``random_state``.

    Args:
        df: DataFrame containing the feature and target columns.
        features: List of feature column names to keep (a list, so that the
            selection is a DataFrame the target can be re-attached to).
        target: Name of the column used for stratification.
        test_size: Portion of ``df`` held out for test and validation together.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, test, val)`` tuple of DataFrames, each holding the feature
        columns plus the target column under its original name.
    """
    X, y = df[features], df[target]

    # First cut: training rows vs. everything that is held out
    X_train, X_held, y_train, y_held = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Second cut: halve the held-out rows into test and validation
    X_test, X_val, y_test, y_val = train_test_split(
        X_held, y_held, test_size=0.5, stratify=y_held, random_state=random_state
    )

    # Re-attach the target under its own name instead of a fixed "label" column
    train, test, val = (
        X_part.assign(**{target: y_part})
        for X_part, y_part in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
    )
    return train, test, val

# Get splits and write them to csv files
features = ["text"]
target = "label"
test_size = 0.2  # portion held out; strat_holdout halves it into test and validation
seed = 42
output_dir = Path("../data/splits/hold_out")
# NOTE(review): the third dataset is printed as "IndoBERT with hashtags" in the
# label-ratio cells but is written out as "indobert-no_hashtags" here - confirm
# which description is the intended one.
model_labels = ["indobertweet", "indobert", "indobert-no_hashtags"]
data_labels = ["train", "test", "val"]
for label, mmed, mlarg in zip(model_labels, mumin_medium, mumin_large):
    # Same procedure for the medium and the large dataset of this model
    for size_name, dataset in (("medium", mmed), ("large", mlarg)):
        size_dir = output_dir.joinpath(size_name)
        # to_csv does not create missing folders, so make sure the target exists
        size_dir.mkdir(parents=True, exist_ok=True)

        splits = strat_holdout(dataset, features, target, test_size, seed)
        for data_lab, split in zip(data_labels, splits):
            output_file = size_dir.joinpath(f"{label}-{data_lab}.csv")
            split.to_csv(output_file, index=False)

### Compare the label ratio of each split to the original dataset

In [5]:
import re

# Get medium splits
splits_dir = Path("../data/splits/hold_out/")
# iterdir() yields entries in arbitrary order, so sort them first: for each
# model the files then run "...-test" < "...-train" < "...-val", which is the
# order split_labels below expects.
medium_files = sorted(splits_dir.joinpath("medium").iterdir())
# Match on the file name only, so directory names can never satisfy a pattern
med_ibert = [pd.read_csv(f) for f in medium_files if re.search(r"indobert-(?!no)", f.name)]
med_ibert_nohash = [pd.read_csv(f) for f in medium_files if re.search(r"indobert-(?:no)", f.name)]
med_ibertweet = [pd.read_csv(f) for f in medium_files if re.search(r"tweet", f.name)]

# Ratios for the splits for each model
split_labels = ["test", "train", "validation"]
model_labels = ["IndoBERT", "IndoBERT no hashtags", "IndoBERTweet"]
model_splits = [med_ibert, med_ibert_nohash, med_ibertweet]
for label, split in zip(model_labels, model_splits):
    # Model name as an underlined heading
    print(f"{label}")
    print("=" * len(label))
    print()
    # Mean of the label column, as a percentage, for every split of this model
    for slabel, s in zip(split_labels, split):
        print(f"{slabel.title()}: {round(100 * s.label.mean(), 2)}%")
    print()

IndoBERT

Test: 95.15%
Train: 94.98%
Validation: 94.82%

IndoBERT no hashtags

Test: 95.15%
Train: 94.98%
Validation: 94.82%

IndoBERTweet

Test: 95.15%
Train: 94.98%
Validation: 94.82%



In [6]:
# Get large splits
splits_dir = Path("../data/splits/hold_out/")
# iterdir() yields entries in arbitrary order, so sort them first: for each
# model the files then run "...-test" < "...-train" < "...-val", which is the
# order split_labels expects.
large_files = sorted(splits_dir.joinpath("large").iterdir())
# Match on the file name only, so directory names can never satisfy a pattern
large_ibert = [pd.read_csv(f) for f in large_files if re.search(r"indobert-(?!no)", f.name)]
large_ibert_nohash = [pd.read_csv(f) for f in large_files if re.search(r"indobert-(?:no)", f.name)]
large_ibertweet = [pd.read_csv(f) for f in large_files if re.search(r"tweet", f.name)]

# Ratios for the splits for each model
model_splits = [large_ibert, large_ibert_nohash, large_ibertweet]
for label, split in zip(model_labels, model_splits):
    # Model name as an underlined heading
    print(f"{label}")
    print("=" * len(label))
    print()
    # Mean of the label column, as a percentage, for every split of this model
    for slabel, s in zip(split_labels, split):
        print(f"{slabel.title()}: {round(100 * s.label.mean(), 2)}%")
    print()

IndoBERT

Test: 95.38%
Train: 95.31%
Validation: 95.27%

IndoBERT no hashtags

Test: 95.38%
Train: 95.31%
Validation: 95.27%

IndoBERTweet

Test: 95.38%
Train: 95.31%
Validation: 95.27%



## Cross-validation

### Get splits

Just get the splits since we verified in the previous section that the label ratio of each split is more or less the same.

In [7]:
# Define function to get stratified train/test splits in preparation for cross validation
def strat_cv(df, features, target, test_size=0.3, random_state=1):
    """Split ``df`` into stratified train and test frames.

    A single ``train_test_split`` call holds out ``test_size`` of the rows as
    the test set, stratifying on the target column.

    Args:
        df: DataFrame containing the feature and target columns.
        features: List of feature column names to keep (a list, so that the
            selection is a DataFrame the target can be re-attached to).
        target: Name of the column used for stratification.
        test_size: Portion of ``df`` placed in the test set.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each holding the feature
        columns plus the target column under its original name.
    """
    X, y = df[features], df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Re-attach the target under its own name instead of a fixed "label" column
    train = X_train.assign(**{target: y_train})
    test = X_test.assign(**{target: y_test})
    return train, test
 
# Get splits and write them to csv files
features = ["text"]
target = "label"
test_size = 0.15  # portion of each dataset written out as the test set
seed = 42
output_dir = Path("../data/splits/cross_val")
model_labels = ["indobertweet", "indobert", "indobert-no_hashtags"]
data_labels = ["train", "test"]
for label, mmed, mlarg in zip(model_labels, mumin_medium, mumin_large):
    # Same procedure for the medium and the large dataset of this model
    for size_name, dataset in (("medium", mmed), ("large", mlarg)):
        size_dir = output_dir.joinpath(size_name)
        # to_csv does not create missing folders, so make sure the target exists
        size_dir.mkdir(parents=True, exist_ok=True)

        # Use the two-way splitter here: strat_holdout would halve the held-out
        # rows and, zipped against two labels, silently drop the second half.
        splits = strat_cv(dataset, features, target, test_size, seed)
        for data_lab, split in zip(data_labels, splits):
            output_file = size_dir.joinpath(f"{label}-{data_lab}.csv")
            split.to_csv(output_file, index=False)