In [1]:
from nlinec.data.load import get_positive_data, get_all_types, get_ambiguity_index
from nlinec.data.preprocessing import get_granularity
from nlinec.utils import get_data_dir
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# For each granularity level, keep only the types whose own granularity equals
# that level (as computed by get_granularity on the type's full path).
gran_types = []
for i in (1, 2, 3):
    all_types = get_all_types(granularity=i)
    all_types['granularity'] = all_types['full_type'].apply(get_granularity)
    is_exact_level = all_types['granularity'] == i
    gran_types.append(all_types[is_exact_level])

In [16]:
# Load the positive examples from the augmented training file.
# NOTE(review): explode=True presumably yields one row per (mention, type) pair — confirm in nlinec.data.load.
data = get_positive_data('augmented_train.json', explode=True)

Loading augmented_train.json: 793487it [00:12, 63032.99it/s] 


In [6]:
# Load the ambiguity index and display it.
# As rendered below, rows are labelled by mention spans and columns by type paths;
# the cells appear to be occurrence counts (0.0 where a mention never carries the type) — TODO confirm.
ambiguity_index = get_ambiguity_index()
ambiguity_index

Unnamed: 0,/person/artist,/person,/organization/company/news,/location/structure,/location/structure/government,/location,/organization,/organization/company,/organization/sports_league,/person/artist/author,...,/other/living_thing/animal,/organization/transit,/location/park,/other/language/programming_language,/location/structure/hospital,/location/transit/bridge,/location/transit/railway,/other/product/mobile_phone,/location/geograpy/island,/location/geograpy
big,643.0,671.0,643.0,622.0,613.0,622.0,643.0,643.0,30.0,30.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pentagon,48.0,48.0,48.0,117.0,48.0,117.0,48.0,48.0,0.0,48.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The,8553.0,8553.0,2615.0,0.0,0.0,0.0,2615.0,2615.0,0.0,8553.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U.S.,425.0,425.0,53.0,208.0,208.0,208.0,502.0,58.0,0.0,425.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ford,5.0,5.0,0.0,0.0,0.0,0.0,72.0,72.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lake Pend Oreille,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
four minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the built files to the build directory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
His philosophy in life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def get_negative_type_candidates(entity: str, ambiguity_index: pd.DataFrame, granularity: int, size: int = None) -> 'list | str | None':
    """
    Get the negative type candidates for the specified entity at the specified granularity.

    A type is a negative candidate when its column in the ambiguity index holds a value of
    exactly 0 in the row of the entity.

    Parameters
    ----------
    entity : str
        The entity; used as a row label of the ambiguity index.
    ambiguity_index : pd.DataFrame
        The ambiguity index, with entities as row labels and types as column labels.
    granularity : int
        The granularity to sample the negative type candidates at.
    size : int, optional
        By default None (return all the negative type candidates). Otherwise, the number of
        candidates to draw without replacement; it is capped at the number of available
        candidates and only the first drawn type is returned.

    Returns
    -------
    negative_type_candidates : list, str or None
        All negative type candidates (list) if `size` is None, a single sampled type (str) if
        `size` is given, or None if the entity is not in the index or has no negative candidate.
    """
    try:
        # Only the types (columns) at the requested granularity are of interest.
        types_at_granularity = [t for t in ambiguity_index.columns if get_granularity(t) == granularity]

        # Look up the single row directly. Dropping the unwanted columns from the whole
        # frame first would copy the entire index on every call.
        counts = ambiguity_index.loc[entity, types_at_granularity]
    except KeyError:
        # The entity is not part of the ambiguity index.
        return None

    # Keep the types the entity was never observed with (count == 0).
    negative_type_candidates = counts[counts == 0].index.tolist()

    if len(negative_type_candidates) == 0:
        return None

    if size is None:
        return negative_type_candidates

    # Sampling without replacement cannot draw more items than there are candidates.
    n_draws = min(size, len(negative_type_candidates))
    return np.random.choice(negative_type_candidates, size=n_draws, replace=False).tolist()[0]

In [18]:
def get_negative_data(positive_file: str = 'augmented_train.json', random_state: int = None, step: int = 1000) -> pd.DataFrame:
    """
    Sample negative data based on a file full of positive NEC data and cache it in a csv file.

    For every row of the positive data, one type at the same granularity as the row's
    'full_type' is sampled via `get_negative_type_candidates` and written to 'full_type'.
    The result is stored under `<data dir>/derived/negative_data/` and is loaded from there
    on later calls with the same `positive_file` and `random_state`.

    Parameters
    ----------
    positive_file : str, optional
        The name of the positive json file, by default 'augmented_train.json'
    random_state : int, optional
        The random state, by default None. If given, it seeds numpy's global random
        generator and becomes part of the cache file name.
    step : int, optional
        The minimum number of rows between two progress bar updates, by default 1000

    Returns
    -------
    negative_data : pd.DataFrame
        A DataFrame containing the negative data in the 'full_type' column and the sampled
        granularity in the 'granularity' column. 'full_type' is missing (None when freshly
        generated, as parsed by `pd.read_csv` when loaded from the cache) in case no negative
        type candidates were found.
    """
    random_state_suffix = f'_{random_state}' if random_state is not None else ''
    negative_file = os.path.join(get_data_dir(), 'derived', 'negative_data', f'{positive_file}{random_state_suffix}.csv')
    os.makedirs(os.path.dirname(negative_file), exist_ok=True)

    if os.path.exists(negative_file):
        print(f'Loading negative data from {negative_file}...')
        return pd.read_csv(negative_file)

    print(f'Generating negative data from {positive_file}...')
    data = get_positive_data(positive_file, explode=True)
    granularities = data['full_type'].apply(get_granularity)
    ambiguity_index = get_ambiguity_index(positive_file)

    if random_state is not None:
        # get_negative_type_candidates draws from numpy's global generator.
        np.random.seed(random_state)

    # Visit every row exactly once, by position. Label-based chunks (`.loc[i:i+step]`) are
    # end-inclusive, so they would sample the boundary rows twice and depend on a unique
    # 0..n-1 index.
    rows = zip(data['mention_span'], granularities)
    progress = tqdm(rows, total=len(data), desc='Sampling negative data', miniters=max(1, step))
    negative_types = [
        get_negative_type_candidates(mention, ambiguity_index, granularity, size=1)
        for mention, granularity in progress
    ]

    # Build a new frame instead of mutating the loaded positive data.
    negative_data = data.assign(granularity=granularities, full_type=negative_types)

    print(f'Storing negative data in {negative_file}...')
    negative_data.to_csv(negative_file, index=False)

    return negative_data

In [19]:
# Generate the negative data for the default positive file, or load it if a cached csv
# already exists; the returned DataFrame is displayed as the cell output.
get_negative_data()

Generating negative data from augmented_train.json...


Loading augmented_train.json: 793487it [00:13, 58539.73it/s] 
Sampling negative data:   0%|          | 1000/1865008 [00:08<4:21:30, 118.80it/s]