## 6) Generate Splits
Given the filtered data generated by "Annotated Data Filtering and Tokenization", split the data into train, val, and test sets for predicting question relevance and extracting answers.

In [None]:
import pandas as pd
import numpy as np
import os
from custom_tokenizer import find_start_end

In [None]:
# Load the three filtered dumps; together they make up the unlabelled pool.
first_df, second_df, third_df = (
    pd.read_csv(path, index_col='index')
    for path in (
        '../data/original_data_filtered.csv',
        '../data/second_data_filtered.csv',
        '../data/third_data_filtered.csv',
    )
)
unlabelled_df = pd.concat([first_df, second_df, third_df], sort=True)

# Load the MTurk-annotated rows (expected to be a subset of the pool above).
# first_df / second_df are deliberately rebound to the labelled files here.
first_df, second_df = (
    pd.read_csv(path, index_col='index')
    for path in (
        '../data/initial_MTurk_test_filled.csv',
        '../data/second_MTurk_test_filled.csv',
    )
)
labelled_df = pd.concat([first_df, second_df], sort=True)
print(labelled_df.shape[0])

# Remove every labelled row from the unlabelled pool by index label.
# drop() raises KeyError if a labelled index label is absent from the pool.
unlabelled_df = unlabelled_df.drop(labelled_df.index, axis=0)
print(unlabelled_df.shape[0])

### Write out unlabelled data to be cleaned by the model

In [None]:
# Keep only the rows whose image was found.
unlabelled_df = unlabelled_df[unlabelled_df['img_found'] == True]

# Mark every row whose post_id occurs more than once (keep=False flags all copies).
dups = unlabelled_df.duplicated(subset=['post_id'], keep=False)
print("{} rows were duplicates...dropping...".format(dups.sum()))

# Set the flagged rows aside in `duplicated_rows` (used again when building the
# train split) and remove them from the pool by index label.
duplicated_rows = unlabelled_df.loc[dups.index[dups.values]]
unlabelled_df = unlabelled_df.drop(duplicated_rows.index, axis=0)

unlabelled_df.to_csv('../data/custom_dataset_raw.csv')

### Write out labeled data to train the model

In [None]:
# Gold annotations plus the answer-extraction validation split written earlier.
# NOTE(review): both files are also written by later cells of this notebook,
# so this cell assumes a previous run has produced them -- confirm.
dev_df = pd.read_csv('../data/gold_dev.csv', index_col='index')
val_df = pd.read_csv('../data/r_relevance_val.csv', index_col='index')

# Left join on the index; dev_df columns that clash with val_df get a '2' suffix.
df = val_df.join(dev_df, rsuffix='2')

# Wherever a gold value exists it replaces the original annotation; rows
# without a gold value keep what they had.
df['q_relevant'] = df.apply(
    lambda row: row['q_relevant'] if pd.isnull(row['gold_q_relevant']) else row['gold_q_relevant'],
    axis=1,
)
df['r_relevant'] = df.apply(
    lambda row: row['r_relevant'] if pd.isnull(row['gold_r_relevant']) else row['gold_r_relevant'],
    axis=1,
)
df['answer_intersection_span'] = df.apply(
    lambda row: row['answer_intersection_span'] if pd.isnull(row['gold_answer_span']) else row['gold_answer_span'],
    axis=1,
)

In [None]:
def choose_sample(df, percentage, groupby_cols, seed=2019):
    """Sample roughly ``percentage`` of ``df``, evenly across groups.

    Args:
        df: DataFrame to sample rows from.
        percentage: Fraction of ``len(df)`` to select in total (e.g. ``0.1``).
        groupby_cols: Column names whose value combinations define the groups.
            An empty list samples from the whole frame without grouping.
        seed: ``random_state`` handed to every ``sample`` call.

    Returns:
        A DataFrame holding the selected rows. With grouping, every group
        contributes the same number of rows and the result is ordered by
        index label; without grouping, rows come back in sampled order.

    Raises:
        ValueError: Propagated from pandas when a group holds fewer rows than
            the per-group quota (sampling is without replacement).
    """
    # how many rows should be selected in total
    total_num = len(df) * percentage

    if not groupby_cols:
        # Sample whole rows at once. Sampling column by column only lines up
        # when every column is drawn with the same integer seed.
        return df.sample(int(total_num), random_state=seed)

    # how many groups there can be (product of the distinct values per column)
    # NOTE(review): unique() counts NaN as a value while groupby skips NaN
    # keys by default, so NaNs in a grouping column shrink the per-group quota.
    num_groups = np.prod([len(df[col].unique()) for col in groupby_cols])

    # how many rows each group contributes
    num_per_group = int(total_num / num_groups)

    # Sample each group with the same seed and collect the index labels
    # directly, rather than digging them out of a groupby.apply MultiIndex.
    chosen = []
    for _, group in df.groupby(groupby_cols):
        chosen.extend(group.sample(num_per_group, random_state=seed).index)

    # Sorted, de-duplicated labels keep the row order stable; later positional
    # sampling of this result depends on that order.
    return df.loc[sorted(set(chosen))]

def generate_splits(df, groupby_cols, seed=2019, sizes=(0.1, 0.25, 0.5, 0.75)):
    """Split ``df`` into named test/val/train frames plus smaller subsets.

    Args:
        df: DataFrame to split; it is not modified.
        groupby_cols: Columns passed to ``choose_sample`` so that test, val,
            minival and minitrain contain the same number of rows per group.
        seed: Random seed forwarded to every ``choose_sample`` call.
        sizes: Fractions of the train split to emit as ``train_<size>`` subsets.

    Returns:
        A list of DataFrames in the order test, val, minival, train,
        ``train_<size>`` for each entry of ``sizes``, minitrain. Each frame
        has a plain ``name`` attribute holding its split name.
    """
    splits = []

    # test: 10% of the full frame, evenly sampled per group
    test = choose_sample(df, 0.1, groupby_cols, seed)
    df_remaining = df.drop(test.index)
    test.name = 'test'
    splits.append(test)

    # val: 10% of what is left after removing test
    val = choose_sample(df_remaining, 0.1, groupby_cols, seed)
    df_remaining = df_remaining.drop(val.index)
    val.name = 'val'
    splits.append(val)

    # minival: 1% of val (a subset of val, not removed from it)
    minival = choose_sample(val, 0.01, groupby_cols, seed)
    minival.name = 'minival'
    splits.append(minival)

    # train: everything left, plus the rows set aside in `duplicated_rows`.
    # NOTE(review): `duplicated_rows` is a notebook-level global defined in an
    # earlier cell, so that cell must have run -- confirm this is intended.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    train = pd.concat([df_remaining, duplicated_rows])
    train.name = 'train'
    splits.append(train)

    # fixed-fraction subsets of train, sampled without grouping
    for size in sizes:
        train_subset = choose_sample(train, size, [], seed)
        train_subset.name = 'train_' + str(size)
        splits.append(train_subset)

    # minitrain: 1% of train, evenly sampled per group
    minitrain = choose_sample(train, 0.01, groupby_cols, seed)
    minitrain.name = 'minitrain'
    splits.append(minitrain)

    return splits

In [None]:
# Rows that carry a gold question-relevance label are written out as the gold
# dev set and then removed from df, so they are not passed to the split generation.
gold_dev = df.loc[df['gold_q_relevant'].notnull()]
gold_dev.to_csv('../data/gold_dev.csv', index_label='index')
df = df.drop(gold_dev.index, axis=0)

In [None]:
# Question-relevance dataset: splits are generated grouped on q_relevant.
# NOTE(review): response_filtered is written even though this dataset only
# targets question relevance -- confirm that is intended.
cols = ['q_relevant', 'question', 'response_filtered']
for split in generate_splits(df, ['q_relevant']):
    split.to_csv('../data/q_relevance_{}.csv'.format(split.name), columns=cols, index_label='index')

# Answer-extraction dataset: only rows with a relevant question, with splits
# generated grouped on r_relevant.
cols = ['r_relevant', 'question', 'response_filtered', 'answer_intersection_span']
for split in generate_splits(df.loc[df['q_relevant'] == True], ['r_relevant']):
    split.to_csv('../data/r_relevance_{}.csv'.format(split.name), columns=cols, index_label='index')

# Joint dataset: splits are generated grouped on every (q_relevant, r_relevant)
# combination (TT, TF, FT, FF).
cols = ['q_relevant', 'r_relevant', 'question', 'response_filtered', 'answer_intersection_span']
for split in generate_splits(df, ['q_relevant', 'r_relevant']):
    split.to_csv('../data/q+r_relevance_{}.csv'.format(split.name), columns=cols, index_label='index')