In [12]:
from nlinec import get_negative_data, get_positive_data
import numpy as np
import pandas as pd

In [14]:
# Load the positive samples from the augmented training file.
# NOTE(review): `explode=True` presumably yields one row per (mention, type) pair -- the repeated
# mentions with different `full_type` values in the output suggest this; confirm in nlinec.
positive_data = get_positive_data('augmented_train.json', explode=True)
positive_data

Loading augmented_train.json: 793487it [00:14, 55432.46it/s] 


Unnamed: 0,full_type,mention_span,sentence,granularity,label
0,/other,the history and science of aviation and spacef...,"Located in Washington, D.C., United States, it...",1,2
1,/other,his debut,"The seventh child of the Jackson family, he ma...",1,2
2,/location/country,We,We did not do anything at that time.,2,2
3,/location,We,We did not do anything at that time.,1,2
4,/other,antibody,`` We don't know the effect of our antibody on...,1,2
...,...,...,...,...,...
1865003,/other/art/film,his films,"As a boy, he donated his Bar Mitzvah money to ...",3,2
1865004,/other,his films,"As a boy, he donated his Bar Mitzvah money to ...",1,2
1865005,/other/art,his films,"As a boy, he donated his Bar Mitzvah money to ...",2,2
1865006,/other/art,"Japanese martial arts classes such as Aikido ,...",Sensei is often used to address the teacher in...,2,2


In [15]:
# Load the negative samples for seed 42 (the log output of this cell shows them being read
# from a derived CSV file whose name carries the seed).
negative_data = get_negative_data(random_state=42)
negative_data

Loading negative data from /home/psaegert/Projects/nli-nec/src/nlinec/../../data/derived/negative_data/augmented_train.json_42.csv...


Unnamed: 0,full_type,mention_span,sentence,granularity,label
0,/person,the history and science of aviation and spacef...,"Located in Washington, D.C., United States, it...",1,1
1,/location,his debut,"The seventh child of the Jackson family, he ma...",1,1
2,/organization/transit,We,We did not do anything at that time.,2,1
3,/person,We,We did not do anything at that time.,1,1
4,/organization,antibody,`` We don't know the effect of our antibody on...,1,1
...,...,...,...,...,...
1865003,/other/product/car,his films,"As a boy, he donated his Bar Mitzvah money to ...",3,1
1865004,/organization,his films,"As a boy, he donated his Bar Mitzvah money to ...",1,1
1865005,/location/park,his films,"As a boy, he donated his Bar Mitzvah money to ...",2,1
1865006,/other/health,"Japanese martial arts classes such as Aikido ,...",Sensei is often used to address the teacher in...,2,1


In [5]:
# Fraction of negative rows whose `full_type` is missing (NaN).
negative_data['full_type'].isna().mean()

0.004637513619244528

In [18]:
def combine_positive_negative_data(positive_data: pd.DataFrame, negative_data: pd.DataFrame, frac: float = 0.5, random_state: int = None) -> pd.DataFrame:
    """
    Combine the positive and negative data by randomly replacing a fraction of the positive data with negative data or adding it to the positive data.

    For the replacement mode (``0 <= frac <= 1``) the two frames are matched row by row
    *by position*, so they must have the same number of rows. Rows whose negative
    ``full_type`` is missing are never replaced, which means the effective replaced
    fraction can be slightly lower than ``frac``.

    Parameters
    ----------
    positive_data : pd.DataFrame
        The positive data (entailment). It is not modified; a copy is returned.
    negative_data : pd.DataFrame
        The negative data (not entailment = neutral). Must provide the columns
        ``full_type`` and ``label``.
    frac : float, optional
        The probability with which each positive row is replaced by its negative
        counterpart, by default 0.5. If < 0, the negative data is appended to the
        positive data instead (with a fresh index).
    random_state : int, optional
        Seed for the random mask, by default None. When given, NumPy's global random
        number generator is re-seeded via ``np.random.seed``.

    Returns
    -------
    combined_data : pd.DataFrame
        The combined data.

    Raises
    ------
    ValueError
        If ``frac`` is greater than 1, or if the frames differ in length in replacement mode.
    """
    if frac > 1:
        raise ValueError(f'frac must be <= 1, but is {frac}')

    if frac < 0:
        return pd.concat([positive_data, negative_data], ignore_index=True)

    if len(positive_data) != len(negative_data):
        raise ValueError(
            f'positive_data and negative_data must have the same length, '
            f'but have {len(positive_data)} and {len(negative_data)} rows'
        )

    # Positional mask for the negative data that has a full_type.
    # Converting to numpy keeps the masks independent of the frames' indices.
    negative_type_available = negative_data['full_type'].notna().to_numpy()

    # Deterministic, random mask for the replacement of the positive data
    if random_state is not None:
        np.random.seed(random_state)
    # The probabilities must sum to 1: True with probability frac, False otherwise.
    random_mask = np.random.choice([True, False], size=len(positive_data), p=[frac, 1 - frac])

    replace_mask = random_mask & negative_type_available

    combined_data = positive_data.copy()

    # Nothing selected (e.g. frac == 0 or empty input): skip the assignment entirely so
    # that an empty setitem cannot alter column dtypes.
    if not replace_mask.any():
        return combined_data

    # Replace the positive data with the negative data, one column at a time so that each
    # column keeps its own dtype instead of going through a mixed object array.
    for column in ('full_type', 'label'):
        combined_data.loc[replace_mask, column] = negative_data.loc[replace_mask, column].to_numpy()

    return combined_data

In [22]:
# Mix the two sets: with frac=0.5 about half of the positive rows are swapped for their
# negative counterparts; the fixed random_state makes the selection reproducible.
combine_positive_negative_data(positive_data, negative_data, frac=0.5, random_state=42)

Unnamed: 0,full_type,mention_span,sentence,granularity,label
0,/person,the history and science of aviation and spacef...,"Located in Washington, D.C., United States, it...",1,1
1,/other,his debut,"The seventh child of the Jackson family, he ma...",1,2
2,/location/country,We,We did not do anything at that time.,2,2
3,/location,We,We did not do anything at that time.,1,2
4,/organization,antibody,`` We don't know the effect of our antibody on...,1,1
...,...,...,...,...,...
1865003,/other/art/film,his films,"As a boy, he donated his Bar Mitzvah money to ...",3,2
1865004,/other,his films,"As a boy, he donated his Bar Mitzvah money to ...",1,2
1865005,/other/art,his films,"As a boy, he donated his Bar Mitzvah money to ...",2,2
1865006,/other/health,"Japanese martial arts classes such as Aikido ,...",Sensei is often used to address the teacher in...,2,1


In [11]:
# Mean of the `label` column of `positive_data`.
# NOTE(review): this cell's execution count (In [11]) predates the load of `positive_data`
# in In [14], so the recorded value may stem from an earlier definition of the variable --
# re-run after a kernel restart to confirm.
positive_data['label'].mean()

1.5020133961891853