# This notebook generates target samples as described in [this discussion](https://www.kaggle.com/c/commonlitreadabilityprize/discussion/257470)

The notebook will output a csv file with two versions of target samples

To use them in your training notebook:  
1. read in the output csv file instead of the commonlit train.csv  
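2. when you read in the csv file, parse the sample columns back into lists with `pd.read_csv(<path_to_output_csv>, converters={'samples': ast.literal_eval, 'samples2': ast.literal_eval})` (after `import ast`); the columns are stored as stringified lists of floats, and `ast.literal_eval` parses them without executing arbitrary code the way `eval` would  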
3. in the `__getitem__()` method of your PyTorch dataset, after `target = self.target[index]`, add `target = target[np.random.randint(len(target))]` so that a random one of the generated samples (including the first) is used each time
                

In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import random

In [None]:
# helper for "generate_samples" function
def probability_of_value(value, dist):
    """Return the two-sided tail probability of ``value`` under ``dist``.

    The result is twice the probability mass lying beyond ``value`` on its
    own side of the distribution mean: 1.0 when ``value`` sits at the mean of
    a symmetric distribution, shrinking towards 0.0 further out in the tails.

    Args:
        value: The point to score.
        dist: A frozen ``scipy.stats`` distribution (any object exposing
            ``mean()``, ``cdf()`` and ``sf()``).

    Returns:
        The probability as a float. A ``nan`` value falls through to the
        ``cdf`` branch and yields ``nan`` instead of raising.
    """
    if value > dist.mean():
        # sf() is the survival function (1 - cdf) evaluated directly, so the
        # far upper tail does not round to exactly 0.0 the way
        # ``1 - dist.cdf(value)`` does
        return dist.sf(value) * 2

    return dist.cdf(value) * 2

In [None]:
# helper for "sample_with_standard_error_noise" function
def generate_samples(
    target,
    standard_error,
    standard_error_range,
    all_targets,
    sample_num,
    random_noise
):
    """Draw ``sample_num`` noisy stand-in values for one row's target.

    Candidates are existing targets from ``all_targets`` that lie within
    ``standard_error * standard_error_range`` of ``target``. Each candidate is
    kept with the probability given by ``probability_of_value`` under a normal
    distribution centred on ``target`` with scale ``standard_error``. Every
    kept candidate is shifted by uniform noise drawn from
    ``[-random_noise / 2, random_noise / 2)``.

    Args:
        target: The row's target value.
        standard_error: The row's standard error (scale of the normal).
        standard_error_range: Multiplier on ``standard_error`` that sets the
            half-width of the candidate window around ``target``.
        all_targets: pandas Series holding every target in the data set
            (boolean-mask indexing and ``.sample`` are used on it).
        sample_num: Number of samples to return.
        random_noise: Total width of the uniform noise added to each sample;
            0 disables the noise.

    Returns:
        A list of ``sample_num`` floats (empty when ``sample_num <= 0``).

    Raises:
        ValueError: If no target inside the window can ever be accepted, e.g.
            the window is empty or the standard error is negative or NaN.
            Without this check the rejection loop would never finish.
    """
    if sample_num <= 0:
        return []

    # edge case
    if target == 0.0:
        return [0.0] * sample_num

    # a zero standard error gives a degenerate distribution with nothing to
    # sample around the target, so the target itself is the only sensible draw
    if standard_error == 0:
        return [float(target)] * sample_num

    # get the limits, and filter targets to that
    half_width = standard_error * standard_error_range
    bottom_limit = target - half_width
    top_limit = target + half_width
    targets_in_range = all_targets[(all_targets >= bottom_limit) & (all_targets <= top_limit)]

    # create distribution
    dist = scipy.stats.norm(target, standard_error)

    # the loop below only ends if some candidate can be accepted; any() stops
    # at the first such candidate, so this is cheap in the normal case
    if not any(probability_of_value(candidate, dist) > 0 for candidate in targets_in_range):
        raise ValueError(
            f"no usable targets within {standard_error_range} standard errors "
            f"of target {target!r} (standard_error={standard_error!r})"
        )

    # draw a third of the candidates per pass, but always at least one:
    # a batch size of 0 would make every pass empty and loop forever
    num_targets = max(1, len(targets_in_range) // 3)

    # this will hold the samples to return
    valid_samples = []

    # now we create them
    while len(valid_samples) < sample_num:
        # just any samples from the range
        for sample in targets_in_range.sample(num_targets):
            # keep the candidate with probability equal to its score
            if probability_of_value(sample, dist) > random.random():
                # uniform noise in [-random_noise / 2, random_noise / 2)
                noise = random.random() * random_noise - random_noise / 2
                valid_samples.append(sample + noise)

    # the last pass may overshoot, so trim to the requested count
    return valid_samples[:sample_num]

In [None]:
def sample_with_standard_error_noise(
    df,
    standard_error_range,
    sample_num,
    random_noise,
    target_col="target",
    standard_error_col="standard_error",
    samples_col="samples"
):
    """Return a copy of ``df`` with a column of sampled targets per row.

    For every row, ``generate_samples`` is called with that row's target and
    standard error plus the full target column, and the resulting list is
    stored in ``samples_col``. The input frame is not modified.

    Args:
        df: DataFrame containing ``target_col`` and ``standard_error_col``.
        standard_error_range: Passed through to ``generate_samples``.
        sample_num: Number of samples to generate per row.
        random_noise: Passed through to ``generate_samples``.
        target_col: Name of the column holding the targets.
        standard_error_col: Name of the column holding the standard errors.
        samples_col: Name of the column to create (or overwrite) with the
            per-row sample lists.

    Returns:
        A new DataFrame with the extra ``samples_col`` column; an empty input
        yields an empty frame that still has the column.
    """
    all_targets = df[target_col].copy()

    # walk the two needed columns directly rather than using a row-wise
    # DataFrame.apply, which builds a Series per row and does not return a
    # Series of lists for an empty frame
    row_samples = [
        generate_samples(
            target,
            standard_error,
            standard_error_range,
            all_targets,
            sample_num,
            random_noise,
        )
        for target, standard_error in zip(df[target_col], df[standard_error_col])
    ]

    working_df = df.copy()
    # an explicit object Series keeps each list as a single cell and lines it
    # up with the frame's own index
    working_df[samples_col] = pd.Series(row_samples, index=working_df.index, dtype=object)

    return working_df

In [None]:
# load the competition's training set; later cells read its "target" and
# "standard_error" columns
train_csv_path = "../input/commonlitreadabilityprize/train.csv"
train_df = pd.read_csv(train_csv_path)

In [None]:
# add two columns of sampled y values, one per configuration below; the second
# uses a narrower candidate window and less noise than the first
sampling_configs = (
    {
        "standard_error_range": 0.5,
        "sample_num": 100,
        "random_noise": 0.1,
        "samples_col": "samples",
    },
    {
        "standard_error_range": 0.375,
        "sample_num": 100,
        "random_noise": 0.075,
        "samples_col": "samples2",
    },
)
for sampling_config in sampling_configs:
    # each pass returns a new frame carrying the extra column
    train_df = sample_with_standard_error_noise(df=train_df, **sampling_config)

In [None]:
# preview the first row to check the newly added sample columns
train_df.head(1)

In [None]:
# write the augmented frame to the working directory without the index column;
# the list-valued sample columns end up as text in the CSV, which is why the
# notes at the top say to pass converters when reading the file back
train_df.to_csv("train_df_with_target_samples.csv", index=False)