## Library of some utility functions for the [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021) competition    


- Examples:
  1. Notebook [feedback-color-print](https://www.kaggle.com/sentinel1/feedback-color-print) demonstrates:
     - `read_train_csv()` function
     - `color_print_essay()` function
  2. Notebook [Feedback Prize 2021 lib - howto 2](https://www.kaggle.com/sentinel1/feedback-prize-2021-lib-howto-2):
     - `get_train_essay_text()`
     - `calc_word_indices()`
     - `stringify_word_indices()`
  3. Notebook [Feedback Prize 2021 lib - howto 3](https://www.kaggle.com/sentinel1/feedback-prize-2021-lib-howto-3):
     - `calc_PII_offsets()`
     - `get_train_df_with_fixed_PII_offsets()`
  4. Notebook [feedback-prize-2021 K-Fold Split Data](https://www.kaggle.com/sentinel1/feedback-prize-2021-k-fold-split-data):
     - `train_df_k_fold_split()`
      
      
NOTE: Comments are disabled on this notebook. If you have any comments about this library, please post them on whichever notebook from the list above is most relevant to your comment.

In [None]:
from pathlib import Path
# Make sure the `lib` folder exists and that both it and the working directory
# contain an (empty) `__init__.py` marker file.
_lib_dir = Path.cwd() / "lib"
_lib_dir.mkdir(exist_ok=True)
for _package_dir in (_lib_dir, Path.cwd()):
    (_package_dir / "__init__.py").touch()

In [None]:
%%file lib/feedback_util.py
from pathlib import Path
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import string
import sys



# Names exported by a star-import of this module.
__all__ = [
    "get_train_essay_text",
    "color_print_essay",
    "read_train_csv",
    "calc_word_indices",
    "stringify_word_indices",
    # "calc_PII_offsets",
    "get_train_df_with_fixed_PII_offsets",
    "train_df_k_fold_split",
]



# Folder containing one `<essay_id>.txt` file per training essay.
# It is resolved once, relative to the working directory at import time.
TRAIN_FOLDER = (Path.cwd()/'..'/'input'/'feedback-prize-2021'/'train').resolve()

def get_train_essay_text(essay_id: str) -> str:
    """
    Returns the full text of a single training essay.

    Parameters:
     - `essay_id`: id of the essay; the text is read from `TRAIN_FOLDER/<essay_id>.txt`

    Returns:
     - The content of the essay file as a string.

    Raises:
     - `AssertionError` if there is no text file for the given `essay_id`.
    """
    essay_p = TRAIN_FOLDER/f"{essay_id}.txt"
    assert essay_p.is_file(), f"No text file found for the essay with id={essay_id}"

    # Decode explicitly as UTF-8 instead of relying on the platform/locale default:
    # a different default encoding would change (or fail on) non-ASCII characters
    # and thereby shift the character offsets used by the rest of this library.
    return essay_p.read_text(encoding='utf-8')



# Hex color used to render each discourse type.
discourse_type_colors = {
    'Claim': '#e6ab02',
    'Concluding Statement': '#1b9e77',
    'Counterclaim': '#d95f02',
    'Evidence': '#7570b3',
    'Lead': '#a6761d',
    'Position': '#66a61e',
    'Rebuttal': '#e7298a',
}

# HTML legend (one colored label per discourse type, followed by a horizontal rule)
# that is shown above every color-printed essay.
color_codes = ''.join(
    f' <div style="color:{type_color};margin-right:1em;">{type_name}</div> '
    for type_name, type_color in discourse_type_colors.items()
)
color_codes = f'<div style="display:flex;font-weight:bold;">{color_codes}</div><hr>'



def color_print_essay(essay_id: str, df: pd.DataFrame, start_end_indicators: bool = False) -> None:
    """
    Prints single essay with color coded discourse types

    The essay text itself is HTML-escaped, so characters such as `<` or `&` in an essay
    are shown literally instead of being interpreted as markup.

    Parameters:
     - `essay_id`: id of the essay (as given in the `id` column of the `train.csv`)
     - `df`: dataframe (can be an original `train.csv` dataframe or any other compatible dataframe)
             containing the following columns: `id`, `discourse_start`, `discourse_end`, and `discourse_type`
     - `start_end_indicators`: optional boolean flag, defaults to False. If set to True then startings and
                               endings of each discourse are marked with ribbon arrows.

    Raises:
     - `AssertionError` if `df` has no rows for `essay_id` (or if the essay text file is missing).
     - `KeyError` if a row has a `discourse_type` without an entry in `discourse_type_colors`.
    """
    from html import escape  # local import: keeps this block independent of the file-level imports

    essay_df = df.loc[df['id'] == essay_id]
    assert len(essay_df) > 0, "Bad essay_id"
    essay_text = get_train_essay_text(essay_id)

    if start_end_indicators:
        end_indicator = '<span style="color:gray;">&#11184;</span>'
        start_indicator = '<span style="color:gray;">&#11187;</span>'
    else:
        end_indicator = start_indicator = ""

    # The output is assembled back-to-front from escaped text segments and raw markup.
    # `cursor` is the position in the *original* text up to which (exclusive) nothing
    # has been emitted yet, so all offsets always refer to the unmodified essay text.
    pieces = []
    cursor = len(essay_text)
    for _, row in essay_df.sort_values(by='discourse_start', ascending=False).iterrows():
        # Clamp the span to the not-yet-emitted part of the text so that overlapping or
        # out-of-range labels can neither duplicate text nor land inside emitted markup.
        start = min(int(row['discourse_start']), cursor)
        end = max(start, min(int(row['discourse_end']), cursor))
        color = discourse_type_colors[row["discourse_type"]]

        pieces.append(escape(essay_text[end:cursor], quote=False))  # unlabeled text after this discourse
        pieces.append(f'</span>{end_indicator}')
        pieces.append(escape(essay_text[start:end], quote=False))   # the discourse itself
        pieces.append(f'{start_indicator}<span style="color:{color};">')
        cursor = start
    pieces.append(escape(essay_text[:cursor], quote=False))         # unlabeled text before the first discourse

    essay_html = ''.join(reversed(pieces))
    display(HTML(color_codes + essay_html.replace('\n','<br>') + '<hr>'))



def read_train_csv() -> pd.DataFrame:
    """
    Loads the original train.csv as a pandas dataframe, using explicit per-column data types.

    The file is looked up at `../input/feedback-prize-2021/train.csv` relative to the
    current working directory.
    """
    column_dtypes = {
        'id':                 'string',
        'discourse_id':       'Int64',
        'discourse_start':    'UInt16',
        'discourse_end':      'UInt16',
        'discourse_text':     'string',
        'discourse_type':     'category',
        'discourse_type_num': 'string',
        'predictionstring':   'string',
    }
    train_csv_path = (Path.cwd()/'..'/'input'/'feedback-prize-2021'/'train.csv').resolve()
    return pd.read_csv(train_csv_path, dtype=column_dtypes)



def calc_word_indices(full_text: str, discourse_start: int, discourse_end: int) -> list:
    """
    Converts a character span of an essay into the list of whitespace-separated word indices it covers.

    This function is based on the one given by the competition host, with an added guard
    for spans that contain no words.

    Link of the original source:
    https://www.kaggle.com/c/feedback-prize-2021/discussion/297688

    Parameters:
     - `full_text`: full text of the essay
     - `discourse_start`: character offset where the discourse starts
     - `discourse_end`: character offset where the discourse ends (exclusive)

    Returns:
     - List of consecutive word indices; empty if the span contains no words.
    """
    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    output = list(range(start_index, start_index + token_len))
    # Drop the last index if it points past the last word of the essay.
    # The `output and` guard avoids an IndexError for spans without any words.
    if output and output[-1] >= len(full_text.split()):
        output.pop()
    return output



def stringify_word_indices(word_indices: list) -> str:
    """
    Joins the given word indices into a single space-separated string.
    """
    return " ".join(map(str, word_indices))



# Translation table that deletes every character of `string.punctuation`
# (used to make the text comparison in the self test lenient).
punctuation_remover = str.maketrans('', '', string.punctuation)
# Exclusive upper bound for the absolute offset tried by the search (the int8 maximum, 127),
# so that every offset found by the search fits into an int8.
OFFSET_SEARCH_RANGE = np.iinfo(np.int8).max

def _calc_PII_offsets_of_essay(group, self_test = False):
    """
    Computes, for the discourses of one essay, by how many characters the labeled
    `discourse_start` / `discourse_end` have to be shifted to match the essay text file.

    Parameters:
     - `group`: dataframe with the rows of a single essay, as passed by `groupby('id').apply(...)`;
                `group.name` is used as the essay id. The columns `discourse_id`, `discourse_text`,
                `discourse_start` and `discourse_end` are read. The dataframe is modified in place.
     - `self_test`: if True, every discourse text is compared (leniently) with the text found in the
                    essay file at the corrected position.

    Returns:
     - `group` with the added `Int8` columns `discourse_start_PII_offset` and `discourse_end_PII_offset`.

    Raises:
     - `Exception` if no matching offset is found within `OFFSET_SEARCH_RANGE`, or if `self_test`
       detects a mismatch.
    """
    essay_id = group.name
    essay_text = get_train_essay_text(essay_id)
    
    def get_start_offset(target_discourse):
        """Returns the offset closest to zero at which the discourse text matches the essay text."""
        discourse_text = target_discourse['discourse_text']
        # Convert to plain Python ints: the values may be unsigned numpy scalars, and adding
        # a negative offset to those can wrap around or raise instead of going below zero.
        labeled_start = int(target_discourse['discourse_start'])
        labeled_end = int(target_discourse['discourse_end'])
        offset = 0
        for offset_abs in range(OFFSET_SEARCH_RANGE):
            for offset_sign in [1, -1]:
                offset = offset_abs * offset_sign
                discourse_text_in_essay = essay_text[
                    labeled_start + offset:labeled_end + offset
                ]
                comparison_len = min(len(discourse_text)-1, len(discourse_text_in_essay))  # Ignoring some last characters: interchangeably used different whitespaces and extra punctuation
                # NOTE(review): when `comparison_len` is 0 (e.g. an empty slice) the comparison below
                #               is trivially true and the current offset is accepted as a match.

                if discourse_text[:comparison_len] == discourse_text_in_essay[:comparison_len]:
                    break
            else:
                continue
            break
        else:
            raise Exception(f"For the essay_id={essay_id} and discourse_id={target_discourse['discourse_id']} start offset was not found in range of {OFFSET_SEARCH_RANGE}!")
        return offset
    
    first_discourse = group.iloc[0]
    offset = get_start_offset(first_discourse)  # Here we get `offset` for the text preceding the first labeled discourse in the essay.
                                                # So, in most cases this offset applies to the entire essay,
                                                # unless some additional PII masks are present inside the first discourse or after it.
    group['discourse_start_PII_offset'] = offset
    group['discourse_end_PII_offset'] = offset
    
    ##
    # FIXME: We are using shortcut here by taking advantage of the fact
    #        that in this training dataset most of the included PII
    #        masks are inside or before the first discourse and as a 
    #        shortcut we are adjusting offsets with one operation
    #        based on the second discourse instead of doing it for
    #        all the discourses in the loop.
    ##
    if '_NAME' in first_discourse['discourse_text'] and len(group) > 1:  # matches PII masks like: PROPER_NAME, SCHOOL_NAME, LOCATION_NAME, TEACHER_NAME, STUDENT_NAME, etc.
        second_discourse = group.iloc[1]
        new_offset = get_start_offset(second_discourse)  # Here we get `offset` for the second and the rest of discourses
                                                         # for the case when additional PII masks are present in the first discourse
        # Start offset of the first discourse should be preserved (computed before), so skipping the first row.
        # A single positional `.iloc` assignment is used instead of chained indexing
        # (`group.iloc[1:][col] = ...`), which may assign to a temporary copy and leave `group` unchanged.
        start_offset_col = group.columns.get_loc('discourse_start_PII_offset')
        group.iloc[1:, start_offset_col] = new_offset
        group.loc[:,'discourse_end_PII_offset'] = new_offset       # End  offset  of the first discourse needs to be updated due to PII masks being included inside it, NOT skipping the first row
        # FIXME ignoring possibility of PII mask occurring in the background between the first and the second discourse, it seems there is no such case in this data, self_test below will tell.

    ##
    # FIXME: essay 'F91D7BB4277C' currently has the LOCATION_NAME
    # PII mask in the text file but it has actual location name
    # in the label, which breaks the below given self_test
    # for no good reason.
    # Fixing it by PII masking the label in the train dataframe
    # in order to match the actual text in the text file and
    # adjusting offset manually for this particular discourse.
    ##
    if essay_id == 'F91D7BB4277C':
        idx = group['discourse_id'] == 1623258656795
        discourse_text = group.loc[idx,'discourse_text'].item()
        discourse_text_fixed = discourse_text.replace('florida','LOCATION_NAME')
        end_offset_delta = len(discourse_text) - len(discourse_text_fixed)
        group.loc[idx,'discourse_text'] = discourse_text_fixed
        group.loc[idx,'discourse_end_PII_offset'] += end_offset_delta
    
    
    if self_test:
        for discourse_id, row in group.set_index('discourse_id').iterrows():
            discourse_text = row['discourse_text'].translate(punctuation_remover).strip()  # throwing away possible extra punctuations & accounting for newline and space being used interchangeably in the given data
            # Plain ints for the same reason as in `get_start_offset`.
            corrected_start = int(row['discourse_start']) + int(row['discourse_start_PII_offset'])
            corrected_end = int(row['discourse_end']) + int(row['discourse_end_PII_offset'])
            discourse_text_in_essay = essay_text[
                corrected_start:corrected_end
            ].translate(punctuation_remover).strip()
            comparison_len = min(len(discourse_text), len(discourse_text_in_essay))

            if discourse_text[:comparison_len] != discourse_text_in_essay[:comparison_len]:
                raise Exception(f"In the essay_id={essay_id}, with offset={offset} mismatch was found for the discourse_id={discourse_id}!")
    
    # Adjust dtypes to nullable int8 integer:
    group['discourse_start_PII_offset'] = group['discourse_start_PII_offset'].astype('Int8')
    group['discourse_end_PII_offset'] = group['discourse_end_PII_offset'].astype('Int8')
    return group



def calc_PII_offsets(train_df: pd.DataFrame, self_test: bool = False) -> pd.DataFrame:
    """
    Applies `_calc_PII_offsets_of_essay` to the rows of every essay (rows grouped by the `id` column).

    Parameters:
     - `train_df`: dataframe with the discourse labels
     - `self_test`: forwarded to `_calc_PII_offsets_of_essay`
    Returns:
     - Result of the group-wise apply.
    """
    essays = train_df.groupby('id')
    return essays.apply(_calc_PII_offsets_of_essay, self_test = self_test)



# Location of the optional on-disk (feather) cache of the corrected dataframe.
PII_MASK_OFFSETTED_TRAIN_DF_CACHE = (Path.cwd()/".."/"temp"/'PII_mask_offsetted_train_df.feather').resolve()

def get_train_df_with_fixed_PII_offsets(df_with_offsets: pd.DataFrame = None, use_tmp_cache: bool = False) -> pd.DataFrame:
    """
    Corrects discourse_start and discourse_end by adding the PII offset values to them.

    Parameters:
    - `df_with_offsets`: (OPTIONAL) dataframe that already contains the precomputed offset columns.
                         When omitted, the offsets are computed from the original train.csv.
    - `use_tmp_cache`: (OPTIONAL) when True, the result is read from the cache file if that file
                       exists, otherwise it is computed and then written to the cache file.
    Returns:
    - Dataframe with `discourse_start` and `discourse_end` corrected and the offset columns removed.
      If the offset columns are missing, a warning is printed to stderr and the dataframe
      is returned unchanged.
    """
    cache_path = PII_MASK_OFFSETTED_TRAIN_DF_CACHE
    if use_tmp_cache and cache_path.is_file():
        return pd.read_feather(cache_path)

    if df_with_offsets is None:
        df_with_offsets = calc_PII_offsets(read_train_csv())

    offset_columns = ('discourse_start_PII_offset', 'discourse_end_PII_offset',)
    has_all_offsets = all(name in df_with_offsets.columns for name in offset_columns)
    if not has_all_offsets:
        print(f"WARNING: no offset columns ({offset_columns}) found in the df, returning unchanged df.", file=sys.stderr, flush=True)
        return df_with_offsets

    # Work on a copy so that the caller's dataframe stays untouched.
    fixed_df = df_with_offsets.copy()
    fixed_df['discourse_start'] += fixed_df['discourse_start_PII_offset']
    fixed_df['discourse_end'] += fixed_df['discourse_end_PII_offset']
    remaining_columns = fixed_df.columns.difference(offset_columns, sort=False)
    fixed_df = fixed_df[remaining_columns]

    if use_tmp_cache:
        cache_path.parent.mkdir(exist_ok=True)
        fixed_df.to_feather(cache_path)
    return fixed_df



try:
    from sklearn.model_selection import StratifiedGroupKFold
    
    def train_df_k_fold_split(train_df: pd.DataFrame, K: int = 5, display_split_statistics: bool = False) -> pd.DataFrame:
        """
        Splits data into K folds (i.e. adds the 'CV' column containing the fold numbers to the 'train_df' dataframe)

        The split is stratified by `discourse_type` and grouped by `id`. `train_df` is modified in place.

        Parameters:
         - `train_df`: dataframe containing at least the `id` and `discourse_type` columns
         - `K`: number of folds; fold numbers in the `CV` column run from 1 to K
         - `display_split_statistics`: if True, a table with per-fold essay/discourse counts is displayed

        Returns:
         - The same `train_df` object with the `CV` column added (or overwritten).
        """
        sgkf = StratifiedGroupKFold(n_splits=K, random_state=2022, shuffle=True)
        train_df["CV"] = 0
        cv_column_position = train_df.columns.get_loc("CV")

        folds = sgkf.split(X=np.zeros(len(train_df)), y=train_df["discourse_type"], groups=train_df["id"])
        for i, (_, test_idx) in enumerate(folds, 1):
            # `split()` yields *positional* row indices, so they must be applied with `.iloc`;
            # label-based `.loc` would only be correct for a default 0..n-1 index.
            train_df.iloc[test_idx, cv_column_position] = i

        if display_split_statistics:
            fold_stats_df = train_df.groupby('CV')[['id','discourse_type']].apply(lambda x:pd.DataFrame({
                'Essay': len(x['id'].unique()),
                'Discourse': len(x),
                'Claim': len(x[x['discourse_type'] == 'Claim']),
                'Concluding Statement': len(x[x['discourse_type'] == 'Concluding Statement']),
                'Counterclaim': len(x[x['discourse_type'] == 'Counterclaim']),
                'Evidence': len(x[x['discourse_type'] == 'Evidence']),
                'Lead': len(x[x['discourse_type'] == 'Lead']),
                'Position': len(x[x['discourse_type'] == 'Position']),
                'Rebuttal': len(x[x['discourse_type'] == 'Rebuttal']),
            }, index=[x.name])).reset_index('CV').set_index('CV')
            # Every essay must be counted in exactly one fold.
            assert fold_stats_df['Essay'].sum() == len(train_df['id'].unique()), "IDs of essays in different folds must be mutually exclusive!"
            display(fold_stats_df)
        return train_df
    
except ImportError:
    
    def train_df_k_fold_split(*args, **kw):
        """
        Fallback used when `StratifiedGroupKFold` cannot be imported: always raises an `Exception`
        asking to update `scikit-learn` (after printing the installed version, if it can be determined).
        """
        try:
            import sklearn
            print(f"Installed version of the `scikit-learn` package is too old: sklearn.__version__={sklearn.__version__}")
        except Exception:
            # Best-effort diagnostic only: the error raised below is what matters.
            pass
        raise Exception("The `train_df_k_fold_split()` function requires newer version of `scikit-learn`. Please update the scikit-learn package first.")
        