# XOR Train Dev Test

This notebook will show and document a method of how to split former validated Mozilla Common Voice data into train, dev and test datasets so that values of the column "client_id" of one dataset is not intersecting with one of the other datasets. 

In this analysis only german language data will be processed

## Import and prepare data

Data will be imported and prepared according to \_partition_corpus_data() and \_post_process_valid_data()

Reference: https://github.com/mozilla/CorporaCreator/blob/master/src/corporacreator/corpus.py

In [1]:
import csv
import pandas as pd

# disable truncated columns
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 1000)

# use full siza of display / can be enabled on bigger screens
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:90% !important; }</style>"))

# import data
df = pd.read_csv('../corpora/de/clips.tsv', 
        sep="\t",
        parse_dates=False,
        engine="python",
        encoding="utf-8",
        error_bad_lines=False,
        quotechar='"',
        quoting=csv.QUOTE_NONE,)

# filter to german language
df_de = df[df['locale'] == "de"]

# only retain those datasets with at least 2 up or down-votes and more up than down-votes (aka "valid data")
valid = df_de.loc[ lambda df: (df.up_votes + df.down_votes > 1) & (df.up_votes > df.down_votes), : ]

# more power users in train, others in dev or test
speaker_counts = valid["client_id"].value_counts()
speaker_counts = speaker_counts.to_frame().reset_index()
speaker_counts.columns = ["client_id", "user_sentence_count"]

valid = valid.join(speaker_counts.set_index("client_id"), on="client_id")
valid = valid.sort_values(["user_sentence_count", "client_id"])
valid_tmp = valid.groupby("sentence").head(1) # 1 => multiple sentence count command line argument 
valid_tmp.sort_values(["user_sentence_count", "client_id"], ascending=False)
valid_tmp = valid_tmp.drop(columns="user_sentence_count")

## Calculate optimal split-sizes according to current applied "sample theory"

In [2]:
def sample_size(population_size):
    """Calculates the sample size.
    Calculates the sample size required to draw from a population size `population_size`
    with a confidence level of 99% and a margin of error of 1%.
    Args:
      population_size (int): The population size to draw from.
    """
    margin_of_error = 0.01
    fraction_picking = 0.50
    z_score = 2.58 # Corresponds to confidence level 99%
    numerator = (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2)
    denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2 * population_size)
    return numerator / denominator

def calculate_data_set_sizes(total_size):
    # Find maximum size for the training data set in accord with sample theory
    for train_size in range(total_size, 0, -1):
        calculated_sample_size = int(sample_size(train_size))
        if 2 * calculated_sample_size + train_size <= total_size:
            dev_size = calculated_sample_size
            test_size = calculated_sample_size
            break
    return train_size, dev_size, test_size


train_size, dev_size, test_size = calculate_data_set_sizes(len(valid_tmp))
train_size, dev_size, test_size

(2726, 2342, 2342)

## Split dataset via old/current method with regard to calculated sizes only

In [3]:
train_old = valid_tmp.iloc[0:train_size]
dev_old = valid_tmp.iloc[train_size : train_size + dev_size]
test_old = valid_tmp.iloc[train_size + dev_size : train_size + dev_size + test_size]

## Split dataset via new method according to calculated sizes

In [4]:
# calculate continous index per client_id, so we can loop over it
continous_client_index, uniques = pd.factorize(valid_tmp["client_id"])
valid_tmp["continous_client_index"] = continous_client_index

# create empty dataframes with fitting column layout
train_new = pd.DataFrame(columns=valid_tmp.columns)
dev_new = pd.DataFrame(columns=valid_tmp.columns)
test_new = pd.DataFrame(columns=valid_tmp.columns)

# iterate over continous index in reverse order so
# test dataset will be populated with clients with less entries in base dataset first
# dev dataset will be populated second
# train dataset will be populated with clients having the most entries in dataset (i.e. "power users")
for i in range(max(continous_client_index), -1, -1):
    if len(test_new) + len(valid_tmp[valid_tmp["continous_client_index"] == i]) <= test_size:
        test_new = pd.concat([test_new, valid_tmp[valid_tmp["continous_client_index"] == i]])
    elif len(dev_new) + len(valid_tmp[valid_tmp["continous_client_index"] == i]) <= dev_size:
        dev_new = pd.concat([dev_new, valid_tmp[valid_tmp["continous_client_index"] == i]])
    else:
        train_new = pd.concat([train_new, valid_tmp[valid_tmp["continous_client_index"] == i]])
    
train_new_size = len(train_new)
dev_new_size = len(dev_new)
test_new_new = len(test_new)

train_new_size, dev_new_size, test_new_new

(2728, 2342, 2342)

## Calculate dataset intersections to compare old method with new method

In [5]:
def test_client_intersections(train, dev, test):
    def indexing(df):
        df = df["client_id"].drop_duplicates().to_frame().reset_index()
        df = df.set_index("client_id")
        return df.index

    train_index = indexing(train)
    test_index = indexing(test)
    dev_index = indexing(dev)

    inter_train_test = train_index.intersection(test_index)
    inter_train_dev = train_index.intersection(dev_index)
    inter_test_dev = test_index.intersection(dev_index)

    print("{} intersecting client_id in train/test".format(len(inter_train_test)))
    print("{} intersecting client_id in train/dev".format(len(inter_train_dev)))
    print("{} intersecting client_id in test/dev".format(len(inter_test_dev)))
    
    return inter_train_test, inter_train_dev, inter_test_dev


In [6]:
test_client_intersections(train_old, dev_old, test_old)

0 intersecting client_id in train/test
0 intersecting client_id in train/dev
1 intersecting client_id in test/dev


(Index([], dtype='object', name='client_id'),
 Index([], dtype='object', name='client_id'),
 Index(['91a51969a700b958699a7242d420d6d4e62f63034f8872eafcbb1953552568c819e9670c4350bc121446d4d4d356c23f8f710a1850d5292f33907e6ce858f4eb'], dtype='object', name='client_id'))

In [7]:
test_client_intersections(train_new, dev_new, test_new)

0 intersecting client_id in train/test
0 intersecting client_id in train/dev
0 intersecting client_id in test/dev


(Index([], dtype='object', name='client_id'),
 Index([], dtype='object', name='client_id'),
 Index([], dtype='object', name='client_id'))