In [None]:
import gc
gc.enable()

import sys
sys.path.append("../input/tez-lib/")

import os
import random
from tqdm import tqdm

import numpy as np
import pandas as pd
from scipy.special import softmax
import pickle

#import tez
import torch
import torch.nn as nn
from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [None]:
class Config:
    """Static settings for inference and span post-processing.

    Groups input/model paths, inference sequence lengths, the checkpoint list,
    and the per-class thresholds read by the post-processing helpers in this
    notebook (`thresh_prob`, `thresh_min_token`, `link_class`).
    """
    input_dir = '../input/feedback-prize-2021'
    
    # Locations of the pretrained backbones.
    model_longformer = '../input/longformerlarge4096/longformer-large-4096'
    model_led = '../input/led-large'
    model_deberta = '../input/deberta-large-download/deberta-large'
    
    # Maximum sequence lengths at test time (presumably in tokens -- confirm against the inference code).
    max_len_test_longformer = 1600
    max_len_test_led = 1024
    
    tokenizer = '../input/deberta-large-download/deberta-large' #'../input/longformerlarge4096/longformer-large-4096'
    num_jobs = 4
    seed = 1
    
    # Checkpoints to use; every entry except the last one is currently disabled.
    model_ckp_path = [
        #  (kind, num_labels, model_path, weight)
        # exp lf-large, lf-base, led-large, bb-large, ...
        
        #('lf-large', 22, '../input/2022021410-lf-bie-bin/model_0.bin', 1.0),
        #('lf-large', 22, '../input/2022021410-lf-bie-bin/model_1.bin', 1.0),
        #('lf-large', 22, '../input/2022021410-lf-bie-bin/model_2.bin', 1.0),
        #('lf-large', 22, '../input/2022021410-lf-bie-bin/model_3.bin', 1.0),
        #('lf-large', 22, '../input/2022021410-lf-bie-bin/model_4.bin', 1.0),
        
        #('lf-large', 15, '../input/2022020906-aug-bin/model_0.bin', 1.0),
        #('lf-large', 15, '../input/2022020906-aug-bin/model_1.bin', 1.0),
        #('lf-large', 15, '../input/2022020906-aug-bin/model_2.bin', 1.0),
        #('lf-large', 15, '../input/2022020906-aug-bin/model_3.bin', 1.0),
        #('lf-large', 15, '../input/2022020906-aug-bin/model_4.bin', 1.0),

        #('led-large', 24, '../input/fb-exp037-led/model_0.bin', 1.0),
        #('led-large', 24, '../input/fb-exp037-led/model_1.bin', 1.0),
        #('led-large', 24, '../input/fb-exp037-led/model_2.bin', 1.0),
        #('led-large', 24, '../input/fb-exp037-led/model_3.bin', 1.0),
        #('led-large', 24, '../input/fb-exp037-led/model_4.bin', 1.0),

        #('led-large', 24, '../input/fb-exp017-led/model_0.bin', 1.0),
        #('led-large', 24, '../input/fb-exp017-led/model_1.bin', 1.0),
        #('led-large', 24, '../input/fb-exp017-led/model_2.bin', 1.0),
        #('led-large', 24, '../input/fb-exp017-led/model_3.bin', 1.0),
        #('deberta-large', 22, '../input/fb-exp045-deberta/model_0.bin', 1.0),
        ('deberta-large', 15, '../input/2022022709-deberta-large-boe-bin/model_0.bin', 1.0),
    ]

    # Per-class minimum span probability: thresh_prob drops spans whose 'prob' is below it.
    # Trailing comments are values left by the author (presumably earlier settings).
    proba_thresh = {
        "Lead": 0.6, # 0.7
        "Position": 0.4, # 0.55
        "Evidence": 0.65,
        "Claim": 0.55,
        "Concluding Statement": 0.6, # 0.7
        "Counterclaim": 0.5,
        "Rebuttal": 0.55,
    }
    # Per-class minimum span length in words: thresh_min_token drops shorter spans.
    min_token_thresh = {
        "Lead": 5, # 9
        "Position": 4, # 5
        "Evidence": 14,
        "Claim": 2,
        "Concluding Statement": 7, # 11
        "Counterclaim": 6,
        "Rebuttal": 4,
    }
    # Passed to link_class as `thresh2`: adjacent spans of the class stop being merged once
    # the next span starts more than this many word indices after the merged span's start.
    link = {
        'Evidence': 40,
        'Counterclaim': 200,
        'Rebuttal': 200,
    }
    
    


# Shared configuration instance read by the post-processing helpers below.
cfg = Config()    
# Token tag -> integer id: a B-/I- pair for each of the seven discourse types, plus "O".
# "PAD" maps to -100 (presumably the loss ignore index used in training -- confirm).
target_id_map = {
    "B-Lead": 0,
    "I-Lead": 1,
    "B-Position": 2,
    "I-Position": 3,
    "B-Evidence": 4,
    "I-Evidence": 5,
    "B-Claim": 6,
    "I-Claim": 7,
    "B-Concluding Statement": 8,
    "I-Concluding Statement": 9,
    "B-Counterclaim": 10,
    "I-Counterclaim": 11,
    "B-Rebuttal": 12,
    "I-Rebuttal": 13,
    "O": 14,
    "PAD": -100,
}
# Inverse lookup: integer id -> token tag.
id_target_map = {v: k for k, v in target_id_map.items()}

# Postprocess

In [None]:
def post_process(samples):
    """Turn per-sample token predictions into the final span table.

    Builds the raw span frame with `create_submission`, applies the
    probability and minimum-length filters, keeps the best span for the
    single-occurrence classes, merges neighbouring Evidence / Counterclaim /
    Rebuttal spans, and returns only `id`, `class` and `predictionstring`.
    """
    spans = create_submission(samples)
    spans = get_max_prob(thresh_min_token(thresh_prob(spans, cfg), cfg))
    # Linking order is fixed: Evidence first, then Counterclaim, then Rebuttal.
    for discourse_type in ('Evidence', 'Counterclaim', 'Rebuttal'):
        spans = link_class(spans, discourse_type, cfg.link[discourse_type])
    return spans.reset_index(drop=True)[['id', 'class', 'predictionstring']]


def create_submission(samples):
    """Convert token-level predictions into one row per predicted discourse span.

    Parameters
    ----------
    samples : iterable of dict
        Each dict provides `id`, `text`, `offset_mapping` (a sequence of
        (start, end) character offsets per token) and the per-token sequences
        `pred_class` / `second_class` / `third_class` (tags such as 'B-Lead',
        'I-Lead', 'O') with `pred_prob` / `second_prob` / `third_prob`.

    Returns
    -------
    pandas.DataFrame
        Columns `id`, `class`, `second_class`, `third_class`,
        `predictionstring`, `prob`, `second_prob`, `third_prob`, `len`.
        'O' phrases and spans covering zero words are omitted. The three
        probability columns are means over the tokens of the span.

    Notes
    -----
    * The input dicts are not modified: the per-token sequences are copied
      before being smoothed by `fix_list_` and padded.
    * A span's end offset always comes from its own tokens, so a one-token
      span covers that token instead of inheriting the end offset of an
      earlier phrase or sample.
    * `third_class` is tested against 'O' on its own tag, not on `pred_class`.
    * The last entry of `offset_mapping` is never read (presumably a trailing
      special token -- confirm against the tokenizer).
    """
    columns = ['id', 'class', 'second_class', 'third_class', 'predictionstring',
               'prob', 'second_prob', 'third_prob']

    def _strip_prefix(tag):
        # 'B-Lead' / 'I-Lead' -> 'Lead'; the outside tag stays 'O'.
        return 'O' if tag == 'O' else tag[2:]

    rows = []
    for sample in samples:
        offset_mapping = sample['offset_mapping']
        sample_id = sample['id']
        sample_text = sample['text']

        # Copy so that neither the smoothing nor the padding below leaks back
        # into the caller's sample.
        pred_class = fix_list_(list(sample['pred_class']))
        second_class = list(sample['second_class'])
        third_class = list(sample['third_class'])
        pred_prob = list(sample['pred_prob'])
        second_prob = list(sample['second_prob'])
        third_prob = list(sample['third_prob'])

        n_tokens = len(offset_mapping)
        # Text longer than the model input (the original note says 4096):
        # there are fewer predictions than tokens, so pad the tail with 'O' / 0.
        if len(pred_class) < n_tokens - 1:
            pred_class += ['O'] * (n_tokens - len(pred_class))
            second_class += ['O'] * (n_tokens - len(second_class))
            third_class += ['O'] * (n_tokens - len(third_class))

            pred_prob += [0] * (n_tokens - len(pred_prob))
            second_prob += [0] * (n_tokens - len(second_prob))
            third_prob += [0] * (n_tokens - len(third_prob))

        total_words = len(sample_text.split())
        last = n_tokens - 1
        idx = 0
        while idx < last:
            first = idx
            start, end = offset_mapping[idx]
            label = _strip_prefix(pred_class[idx])
            second_label = _strip_prefix(second_class[idx])
            third_label = _strip_prefix(third_class[idx])
            matching_label = f'I-{label}' if label != 'O' else 'O'

            # Extend the phrase while the following tokens continue the same label.
            idx += 1
            while idx < last and pred_class[idx] == matching_label:
                _, end = offset_mapping[idx]
                idx += 1

            if label == 'O':
                continue

            # Map character offsets to whitespace-separated word indices.
            word_start = len(sample_text[:start].split())
            word_end = min(word_start + len(sample_text[start:end].split()), total_words)
            ps = " ".join(str(w) for w in range(word_start, word_end))

            n_phrase_tokens = idx - first
            rows.append((
                sample_id, label, second_label, third_label, ps,
                sum(pred_prob[first:idx]) / n_phrase_tokens,
                sum(second_prob[first:idx]) / n_phrase_tokens,
                sum(third_prob[first:idx]) / n_phrase_tokens,
            ))

    sub = pd.DataFrame(rows, columns=columns)
    sub['len'] = [len(ps.split()) for ps in sub['predictionstring']]
    # Spans that map to zero words carry no prediction.
    sub = sub[sub['len'] > 0]
    return sub

def fix_list(pred_list):
    """Fill short interruptions inside a run of one I- tag (or 'O'), in place.

    For each tag in turn: a position is flagged when a different tag from the
    list follows a run of more than two identical tags. From each flagged
    position the widest gap (up to the tag's threshold) that is followed by
    two consecutive tokens of the run tag is overwritten with that tag.

    Returns the same list object that was passed in.
    """
    labels = ["I-Lead", "I-Position", "I-Evidence", "I-Claim",
              "I-Concluding Statement", "I-Counterclaim", "I-Rebuttal", "O"]

    # Largest gap (in tokens) that may be bridged for each tag.
    max_gap = {
        "I-Lead": 2,
        "I-Concluding Statement": 2,
        "I-Evidence": 1,
        "I-Position": 2,
        "I-Claim": 1,
        "I-Counterclaim": 5,
        "I-Rebuttal": 7,
        "O": 1,
    }

    for target in labels:
        rivals = [other for other in labels if other != target]

        # Pass 1: find where a run of more than two `target` tokens is broken.
        flagged = []
        run_length = 0
        for position, tag in enumerate(pred_list):
            if run_length > 2 and tag in rivals:
                flagged.append(position)
            run_length = run_length + 1 if tag == target else 0

        # Pass 2: bridge the gap when the run resumes soon enough.
        gap_limit = max_gap[target]
        for position in flagged:
            if position + gap_limit + 1 >= len(pred_list):
                continue
            for width in range(gap_limit, 0, -1):
                resumes = (pred_list[position + width] == target
                           and pred_list[position + width + 1] == target)
                if resumes:
                    for offset in range(width):
                        pred_list[position + offset] = target
                    break

    return pred_list

def fix_list_(pred_list):
    """Repair single-token interruptions inside a run of one I- tag, in place.

    When a run of more than two identical I- tags is broken by one token
    carrying a *different I- tag* and the next two tokens resume the run, the
    interrupting token is relabelled. 'O' tokens never trigger a repair.

    Returns the same list object that was passed in.
    """
    inside_tags = ["I-Lead", "I-Position", "I-Evidence", "I-Claim",
                   "I-Concluding Statement", "I-Counterclaim", "I-Rebuttal"]

    for target in inside_tags:
        rivals = {tag for tag in inside_tags if tag != target}

        # Positions where a rival I- tag follows more than two `target` tokens.
        breaks = []
        run_length = 0
        for position, tag in enumerate(pred_list):
            if run_length > 2 and tag in rivals:
                breaks.append(position)
            run_length = run_length + 1 if tag == target else 0

        for position in breaks:
            if position + 2 >= len(pred_list):
                continue
            if pred_list[position + 1] == target and pred_list[position + 2] == target:
                pred_list[position] = target

    return pred_list


def jn(pst, start, end):
    """Space-join `pst[start:end]` as strings, skipping the -1 separator entries."""
    kept = (str(value) for value in pst[start:end] if value != -1)
    return " ".join(kept)


def link_class(oof, discourse_type, thresh2):
    """Merge neighbouring spans of one discourse type within each document.

    Spans of `discourse_type` are walked in row order per document. The next
    span is appended to the span being built unless

    * its first word index is more than 1 past the last word collected so far
      (the spans are not adjacent), or
    * its first word index is more than `thresh2` past the first word of the
      span being built (the merged span would become too long).

    Parameters
    ----------
    oof : pandas.DataFrame
        Must contain `id`, `class` and `predictionstring` (space-separated
        word indices). Not modified.
    discourse_type : str
        Class whose spans are merged.
    thresh2 : int
        Maximum distance, in word indices, between the start of a merged span
        and the start of a span appended to it.

    Returns
    -------
    pandas.DataFrame
        Merged spans of `discourse_type` plus all other rows, grouped by
        document in order of first appearance and sorted by first word index
        within each document. Merged rows carry only `id`, `class` and
        `predictionstring`; any other column of `oof` is NaN for them. A
        column named `start`, if present, is dropped (this helper uses that
        name for its sort key). An empty `oof` is returned unchanged.
    """
    if not len(oof):
        return oof

    doc_ids = oof['id'].unique().tolist()
    adjacency = 1  # the next span may start at most this far after the previous word

    is_target = oof['class'] == f"{discourse_type}"
    target_rows = oof[is_target]
    other_rows = oof[~is_target]

    merged_spans = []
    for doc_id, doc_spans in target_rows.groupby('id', sort=False):
        current = []  # word indices of the merged span under construction
        for predictionstring in doc_spans['predictionstring']:
            words = [int(x) for x in predictionstring.split()]
            if not words:
                # An empty span has nothing to link.
                continue
            starts_new_span = bool(current) and (
                words[0] > current[-1] + adjacency
                or words[0] - current[0] > thresh2
            )
            if starts_new_span:
                merged_spans.append((doc_id, discourse_type, " ".join(str(w) for w in current)))
                current = []
            current.extend(words)
        if current:
            merged_spans.append((doc_id, discourse_type, " ".join(str(w) for w in current)))

    roof = pd.DataFrame(merged_spans, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(other_rows, how='outer')

    ordered = []
    for doc_id in doc_ids:
        doc_rows = roof[roof['id'] == doc_id]
        first_word = doc_rows['predictionstring'].apply(lambda x: int(x.split(' ')[0]))
        # assign() builds a new frame, so the filtered slice is never written to.
        doc_rows = doc_rows.assign(start=first_word).sort_values('start').drop('start', axis=1)
        ordered.append(doc_rows)
    return pd.concat(ordered, axis=0)


def thresh_prob(df, cfg):
    """Drop spans whose probability is below their class threshold.

    Two-word Claim spans are penalised first: 0.1 is subtracted from their
    `prob` before thresholding, and the reduced value is what the returned
    frame carries.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `class`, `len` and `prob` columns. Not modified.
    cfg : object
        Provides `proba_thresh`, a mapping of class name to minimum `prob`.

    Returns
    -------
    pandas.DataFrame
        A filtered copy sorted by index. Classes absent from
        `cfg.proba_thresh` are never filtered.
    """
    # Work on a copy: writing into a filtered slice raises SettingWithCopyWarning
    # and relies on pandas' copy semantics.
    df = df.copy()
    short_claim = (df['class'] == 'Claim') & (df['len'] == 2)
    df.loc[short_claim, 'prob'] -= 0.1
    df = df.sort_index()

    for class_name, min_prob in cfg.proba_thresh.items():
        below = (df['class'] == class_name) & (df['prob'] < min_prob)
        # A boolean mask removes exactly the failing rows, even when index labels repeat.
        df = df[~below]
    return df

# add
def thresh_second_prob(df, cfg):
    """Drop spans whose runner-up probability exceeds the class limit.

    Rows are sorted by index; for every class in `cfg.second_proba_thresh`,
    rows of that class with `second_prob` above the limit are removed by
    index label.
    """
    remaining = df.sort_index()
    for class_name, limit in cfg.second_proba_thresh.items():
        exceeds = (remaining['class'] == class_name) & (remaining['second_prob'] > limit)
        remaining = remaining.drop(remaining[exceeds].index)
    return remaining

# add
def thresh_third_prob(df, cfg):
    """Drop spans whose third-ranked probability exceeds the class limit.

    Rows are sorted by index; for every class in `cfg.second_proba_thresh`,
    rows of that class with `third_prob` above the limit are removed by
    index label.
    """
    # NOTE(review): the limits come from `second_proba_thresh`, the same mapping
    # thresh_second_prob uses -- confirm a dedicated third-rank mapping was not intended.
    remaining = df.sort_index()
    for class_name, limit in cfg.second_proba_thresh.items():
        exceeds = (remaining['class'] == class_name) & (remaining['third_prob'] > limit)
        remaining = remaining.drop(remaining[exceeds].index)
    return remaining


def thresh_min_token(df, cfg):
    """Drop spans shorter than the per-class minimum word count.

    Side effect: the `len` column of the frame passed in is (re)computed in
    place as the number of space-separated entries of `predictionstring`.
    Rows are then removed, by index label, for every class listed in
    `cfg.min_token_thresh` whose `len` is below that class's minimum.
    """
    df['len'] = df['predictionstring'].map(lambda ps: len(ps.split(' ')))
    remaining = df
    for class_name, min_words in cfg.min_token_thresh.items():
        too_short = (remaining['class'] == class_name) & (remaining['len'] < min_words)
        remaining = remaining.drop(remaining[too_short].index)
    return remaining


def get_max_prob(sub):
    """Keep only the most probable span per document for single-occurrence classes.

    For 'Lead', 'Position' and 'Concluding Statement', only the row with the
    highest `prob` in each (`id`, `class`) group is kept; rows of every other
    class pass through unchanged.

    Side effect: `sub['prob']` is cast to float on the frame passed in.

    Returns
    -------
    pandas.DataFrame
        The selected single-occurrence rows followed by all remaining rows,
        with their original index labels.
    """
    sub['prob'] = sub['prob'].astype(float)
    unique_class = ['Lead', 'Position', 'Concluding Statement']

    is_unique_class = sub['class'].isin(unique_class)
    single_rows = sub[is_unique_class]
    best_labels = single_rows.groupby(['id', 'class'])['prob'].idxmax()
    return pd.concat([single_rows.loc[best_labels, :], sub[~is_unique_class]])

def post_process_sub(sub):
    """Post-process an already-built span frame.

    Applies the probability and minimum-length filters, keeps the best span
    for the single-occurrence classes, merges neighbouring Evidence /
    Counterclaim / Rebuttal spans, and returns `id`, `class`,
    `predictionstring`, `prob` and `len` with a fresh index.
    """
    spans = get_max_prob(thresh_min_token(thresh_prob(sub, cfg), cfg))
    # Linking order is fixed: Evidence first, then Counterclaim, then Rebuttal.
    for discourse_type in ('Evidence', 'Counterclaim', 'Rebuttal'):
        spans = link_class(spans, discourse_type, cfg.link[discourse_type])
    return spans.reset_index(drop=True)[['id', 'class', 'predictionstring', 'prob', 'len']]

# Scoring and adding true-positive labels

In [None]:
def calc_overlap(row):
    """Return the two overlap ratios between a prediction and a ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, both
    space-separated strings. The result is
    `[shared / len(gt tokens), shared / len(pred tokens)]`, where the token
    sets are built by splitting on single spaces.
    """
    pred_tokens = set(row.predictionstring_pred.split(" "))
    gt_tokens = set(row.predictionstring_gt.split(" "))
    shared = len(gt_tokens & pred_tokens)
    return [shared / len(gt_tokens), shared / len(pred_tokens)]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    Score predictions against ground truth for the Kaggle
        Student Writing Competition (micro F1 over the rows given).

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    This code is from Rob Mulla's Kaggle kernel.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Needs `id`, `class`, `predictionstring`.
    gt_df : pandas.DataFrame
        Needs `id`, `discourse_type`, `predictionstring`.

    Returns
    -------
    (float, numpy.ndarray)
        The F1 score and the `pred_id` values (positions in `pred_df` after
        its index is reset) of the predictions counted as true positives.
        The score is 0.0 when there is nothing to count.
    """
    gt_df = gt_df[["id", "discourse_type", "predictionstring"]
                  ].reset_index(drop=True).copy()
    pred_df = pred_df[["id", "class", "predictionstring"]
                      ].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index

    # Step 1. All ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    # Rows without a partner get a blank string so the overlap helper can split them.
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # Step 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth is >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # The two ratios are read straight from the list; no eval(str(...)) round trip.
    joined["overlap1"] = joined["overlaps"].apply(lambda pair: pair[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda pair: pair[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (
        joined["overlap2"] >= 0.5)

    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # Step 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # NOTE(review): as in the original kernel, the NaN id produced by the outer join
    # for unmatched rows is itself counted once as a FP / FN. Kept so that scores
    # stay comparable with earlier runs -- confirm whether it should be excluded.
    pred_ids = joined["pred_id"].unique()
    FP = int((~np.isin(pred_ids, tp_pred_ids)).sum())

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    gt_ids = joined["gt_id"].unique()
    FN = int((~np.isin(gt_ids, matched_gt_ids)).sum())

    TP = len(tp_pred_ids)

    # Micro F1; guard the degenerate case where nothing was counted.
    denominator = TP + 0.5 * (FP + FN)
    my_f1_score = TP / denominator if denominator else 0.0
    return my_f1_score, tp_pred_ids

def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    """Macro F1 over discourse types, plus predictions labelled as TP or not.

    For each `discourse_type` present in `gt_df`, the predictions of that
    class are scored with `score_feedback_comp_micro`; the class score is
    printed and each prediction gets a boolean `is_tp` column.

    Returns `(f1, labelled_predictions)`, or
    `(f1, class_scores, labelled_predictions)` when `return_class_scores` is
    true, where `f1` is the mean of the per-class scores.
    """
    predictions = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()

    class_scores = {}
    labelled_subsets = []
    for discourse_type, gt_subset in gt_df.groupby("discourse_type"):
        of_this_type = predictions["class"] == discourse_type
        pred_subset = predictions.loc[of_this_type].reset_index(drop=True).copy()

        class_score, tp_pred_ids = score_feedback_comp_micro(pred_subset, gt_subset)
        # tp_pred_ids are positions in the freshly re-indexed subset.
        pred_subset['is_tp'] = pred_subset.index.isin(tp_pred_ids)
        print(class_score)

        labelled_subsets.append(pred_subset)
        class_scores[discourse_type] = class_score

    f1 = np.mean(list(class_scores.values()))
    labelled = pd.concat(labelled_subsets)
    if return_class_scores:
        return f1, class_scores, labelled
    return f1, labelled

In [None]:
# Out-of-fold prediction pickles, one per base model. Only the file name is used
# below, to derive the model name of the matching feature file; commented-out
# entries are excluded.
paths_ = [
    #'../input/oof-pred-108/deberta-large-108.pkl',
    '../input/fb-oof-dict/oof_dict_yyama_exp045.pickle',
 #'../input/fb-oof-dict/oof_dict_makotu_021410-lf-bie-bin.pickle',
 '../input/fb-oof-dict/oof_dict_makabe_feedback-101_LSTM_2head.pickle',
 #'../input/fb-oof-dict/oof_dict_makotu_030209-deberta-large-mnli-boe-bin.pickle',
 '../input/fb-oof-dict/oof_dict_022709-deberta-large-boe-bin.pickle',
 '../input/fb-oof-dict/oof_dict_makabe_feedback-096.pickle',
 '../input/convert-fp64-to-fp16-inference98/oof_dict_makabe_feedback-098_LSTM.pickle',
 '../input/fb-oof-dict/oof_dict_yyama_exp058-wo-mlm.pickle',
'../input/fb-oof-dict/oof_dict_yyama_fb-exp037-led.pickle',
'../input/fb-oof-dict-exp062-xlarge-f24/oof_dict_yyama_fb_exp06x_xlarge_fold2-4.pickle']

In [None]:
# The competition's training annotations serve as ground truth for scoring.
test_df = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Alternative source: a single validation fold.
#test_df = pd.read_csv('../input/fb-train-folds/train_folds.csv')
#test_df = test_df[test_df['kfold']==fold_num].reset_index(drop=True)

# All essay ids except one explicitly excluded document.
test_ids = test_df['id'].unique()
test_ids = test_ids[test_ids != 'AD005493F9BF']
# for debug
# test_ids = test_ids[:100]
test_df = test_df[test_df['id'].isin(test_ids)]

In [None]:
# Essay id -> fold number; used below as the 'new_fold' split for the stacking model.
fold_df = pd.read_csv('../input/makabe-fold-csv/makabe_fold.csv', encoding='utf-8-sig')
#fold_df = fold_df[['id', 'kfold']].drop_duplicates()
fold_dict = dict(zip(fold_df['id'], fold_df['fold']))

#fold_dict = dict(zip(test_ids, np.random.permutation(np.arange(len(test_ids))) % 5))

In [None]:
# Second fold assignment (column 'kfold'); attached to the ground truth as 'fold'.
# Note that `fold_df` is rebound here and no longer holds the makabe fold file.
fold_df = pd.read_csv('../input/train-folds/train_folds.csv')
fold_df = fold_df[['id', 'kfold']].drop_duplicates()
fold_dict_2 = dict(zip(fold_df['id'], fold_df['kfold']))
# `test_df` is a filtered slice of the CSV frame; assign() builds a new frame
# instead of writing a column into that slice (avoids SettingWithCopyWarning).
test_df = test_df.assign(fold=test_df['id'].map(fold_dict_2))

In [None]:
# Build the training table for the stacking model: the feature rows of every
# base model, labelled with whether each predicted span is a true positive.
span_key = ['id', 'class', 'predictionstring']
sub_frames = []
for path in tqdm(paths_):
    model_name = path.split('/')[-1].replace(".pickle", "").replace("oof_dict_", "")
    # NOTE(review): read_pickle executes pickled code; only load trusted files.
    sub = pd.read_pickle(f'../input/feats-0315/feats_{model_name}.pkl')

    # Downcast 64-bit columns to save memory (float16 keeps about 3 significant digits).
    for col in sub.columns:
        if sub.dtypes[col] == np.int64:
            sub[col] = sub[col].astype(np.int32)
        if sub.dtypes[col] == np.float64:
            sub[col] = sub[col].astype(np.float16)

    # Score this model's spans against the ground truth and copy the TP flag back by span key.
    _, true_sub = score_feedback_comp(sub[span_key], test_df, return_class_scores=False)
    is_tp_by_span = true_sub.set_index(span_key)['is_tp'].to_dict()
    sub['is_tp'] = sub.set_index(span_key).index.map(is_tp_by_span).astype(int)
    sub['new_fold'] = sub['id'].map(fold_dict)
    sub_frames.append(sub.reset_index(drop=True))
    del sub

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
all_sub = pd.concat(sub_frames) if sub_frames else pd.DataFrame()

In [None]:
# Feature rows for the spans predicted by the ensemble; these are the rows the
# stacking model is validated on.
model_name = 'ensemble_0314'
ensemble_sub = pd.read_pickle(f'../input/feats-0315/feats_{model_name}.pkl')

# Delete this once the column-name fix is finished.
#new_cols = list(ensemble_sub.columns[:12]) + ['model_' + str(i // 75) + '_' + '_'.join(ensemble_sub.columns[i+12].split('_')[-2:]) for i in list(range(450))]
#ensemble_sub.columns = new_cols

# Downcast 64-bit columns to save memory (float16 keeps about 3 significant digits).
for col in ensemble_sub.columns:
    if ensemble_sub.dtypes[col] == np.int64:
        ensemble_sub[col] = ensemble_sub[col].astype(np.int32)
    if ensemble_sub.dtypes[col] == np.float64:
        ensemble_sub[col] = ensemble_sub[col].astype(np.float16)

# Score the ensemble spans against the ground truth of folds 2, 3 and 4 only,
# then copy the resulting true-positive flag back onto each span by its key.
_, true_sub = score_feedback_comp(ensemble_sub[['id', 'class', 'predictionstring']], test_df[test_df.fold.isin([2,3,4])], return_class_scores=False)
ensemble_sub['is_tp'] = ensemble_sub.set_index(['id', 'class', 'predictionstring']).index.map(true_sub.set_index(['id', 'class', 'predictionstring'])['is_tp'].to_dict()).astype(int)
ensemble_sub['new_fold'] = ensemble_sub['id'].map(fold_dict)

In [None]:
# Number of distinct column names in the ensemble feature frame.
len(set(ensemble_sub.columns))

In [None]:
# Agreement features: for each (id, class, <col>) key, the fraction of base
# models in all_sub that produced a span with that same key.
for col in ['predictionstring', 'start', 'end']:
    dic = (all_sub.groupby(['id', 'class', col])['model_name'].nunique() / all_sub['model_name'].nunique()).to_dict()
    all_sub[f'dupli_{col}'] = all_sub.set_index(['id', 'class', col]).index.map(dic)
    # Ensemble spans that no base model produced get NaN here.
    ensemble_sub[f'dupli_{col}'] = ensemble_sub.set_index(['id', 'class', col]).index.map(dic)

In [None]:
# An ensemble span with no match among the base models has an agreement of 0.
for col in ['predictionstring', 'start', 'end']:
    ensemble_sub[f'dupli_{col}'] = ensemble_sub[f'dupli_{col}'].fillna(0)

In [None]:
# Keep one row per distinct span (the first occurrence across the stacked models).
all_sub = all_sub.drop_duplicates(subset=['id', 'class', 'predictionstring'])
# Drop rows with a missing value in the columns at positions 25-29.
# NOTE(review): positional selection -- confirm which features these are.
all_sub = all_sub.dropna(subset=all_sub.columns[25:30])

In [None]:
# Give both frames a 0..n-1 index; the training code later uses these index
# labels to index NumPy arrays.
all_sub = all_sub.reset_index(drop=True)
ensemble_sub = ensemble_sub.reset_index(drop=True)

In [None]:
# Model inputs: every column except identifiers, raw text, fold assignments and the target.
feats = [
    col
    for col in all_sub.columns
    if col not in {'id', 'predictionstring', 'text', 'offset', 'fold',
                   'new_fold', 'is_tp', 'discourse_key', 'model_name'}
]

In [None]:
# Split the training table into span metadata, fold assignments, target and features.
# Note: all_sub is rebound to the feature columns only.
all_sub_df = all_sub[['id', 'class', 'second_class', 'third_class', 'predictionstring', 'prob', 'second_prob', 'third_prob', 'len', 'start', 'end']]
all_sub_folds = all_sub['fold']
all_sub_new_folds = all_sub['new_fold']
all_sub_targets = all_sub['is_tp']
all_sub = all_sub[feats]

In [None]:
# Same split for the ensemble table. Note: ensemble_sub is rebound to the
# feature columns only.
ensemble_sub_df = ensemble_sub[['id', 'class', 'second_class', 'third_class', 'predictionstring', 'prob', 'second_prob', 'third_prob', 'len', 'start', 'end']]
ensemble_sub_folds = ensemble_sub['fold']
ensemble_sub_new_folds = ensemble_sub['new_fold']
ensemble_sub_targets = ensemble_sub['is_tp']
ensemble_sub = ensemble_sub[feats]

In [None]:
# Display the essay id -> fold mapping.
# NOTE(review): this prints the entire dict; consider inspecting a small sample instead.
fold_dict

In [None]:
# Integer code for each discourse type (alphabetical order), and the reverse lookup.
class_dict = {
    name: code
    for code, name in enumerate([
        'Claim', 'Concluding Statement', 'Counterclaim', 'Evidence',
        'Lead', 'Position', 'Rebuttal',
    ])
}

class_dict_inv = {code: name for name, code in class_dict.items()}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plotImp(model, X , num = 28, fig_size = (40, 20)):
    """Show a bar chart of the `num` largest gain importances of a fitted LightGBM model.

    `model` must expose `booster_`; `X` supplies the feature names through
    `X.columns`. Calling this also sets the seaborn font scale to 5 globally.
    """
    gains = model.booster_.feature_importance(importance_type='gain')
    importance = pd.DataFrame({'Value': gains, 'Feature': X.columns})
    top_features = importance.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=fig_size)
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.show()

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import log_loss


def train_and_evaluate(all_sub_df, all_sub, all_sub_folds, all_sub_new_folds, all_sub_targets, ensemble_sub_df, ensemble_sub, ensemble_sub_folds, ensemble_sub_new_folds, ensemble_sub_targets):
    """Train a LightGBM true-positive classifier per fold and score the ensemble spans.

    For each of the 5 folds, a model is fitted on the base-model spans outside
    the fold (`all_sub`) and evaluated on the ensemble spans inside the fold
    (`ensemble_sub`), which also drive early stopping. Each fitted model is
    written to `lgb_fold{fold}.pkl` in the working directory.

    Parameters
    ----------
    all_sub_df, ensemble_sub_df : pandas.DataFrame
        Span metadata; only `class`, `second_class`, `third_class` are read.
    all_sub, ensemble_sub : pandas.DataFrame
        Feature frames. Modified in place: the three class columns are added
        as integer codes.
    all_sub_folds : pandas.Series
        Read only for the (currently constant) sample weights.
    all_sub_new_folds, ensemble_sub_new_folds : pandas.Series
        Fold number per row, used for the train/validation split.
    all_sub_targets, ensemble_sub_targets : array-like
        Binary target per row.
    ensemble_sub_folds : pandas.Series
        Accepted but not used.

    Returns
    -------
    numpy.ndarray
        Out-of-fold probability of the positive class for every ensemble row.

    Notes
    -----
    Index labels are used to index NumPy arrays, so every frame/series must
    carry a 0..n-1 index.
    """
    # Encode the class labels as integer categories; labels missing from
    # class_dict (such as 'O') become 7.
    all_sub['class'] = all_sub_df['class'].map(class_dict).fillna(7).astype(int)
    all_sub['second_class'] = all_sub_df['second_class'].map(class_dict).fillna(7).astype(int)
    all_sub['third_class'] = all_sub_df['third_class'].map(class_dict).fillna(7).astype(int)

    y = np.array(all_sub_targets).astype(int)

    # The multiplier is 0, so every sample currently has weight 1.
    weight = 1 + (all_sub_folds.isin([2, 3,4])) * 0

    ensemble_sub['class'] = ensemble_sub_df['class'].map(class_dict).fillna(7).astype(int)
    ensemble_sub['second_class'] = ensemble_sub_df['second_class'].map(class_dict).fillna(7).astype(int)
    ensemble_sub['third_class'] = ensemble_sub_df['third_class'].map(class_dict).fillna(7).astype(int)
    y_ens = np.array(ensemble_sub_targets).astype(int)

    # oof = out-of-fold: each ensemble row is predicted by the model whose
    # training data excluded that row's fold.
    oof_ens = np.zeros(len(ensemble_sub))
    cv_score = 0

    # Cross-validated evaluation.
    for fold in range(5):
        train_index = all_sub_new_folds[all_sub_new_folds != fold].index
        val_index_ens = ensemble_sub_new_folds[ensemble_sub_new_folds == fold].index

        X_train, y_train,w_train  = all_sub.loc[train_index, :], y[train_index], weight[train_index]
        X_val_ens, y_val_ens = ensemble_sub.loc[val_index_ens, :], y_ens[val_index_ens]

        # Hyperparameters were chosen provisionally, not tuned.
        model = LGBMClassifier(learning_rate=0.03, num_leaves=260, n_estimators=5000, colsample_bytree=0.6, subsample=0.8, subsample_freq=2,
                              min_child_samples=70,max_depth=-1, metrics=['binary_logloss', 'auc'])

        # Early stopping with a patience of 75 rounds.
        model.fit(X_train, y_train,early_stopping_rounds=75,eval_set=[(X_train, y_train), (X_val_ens, y_val_ens)], verbose=25, sample_weight = w_train, categorical_feature=['class', 'second_class', 'third_class'])

        # Feature importance of the fitted LightGBM model.
        plotImp(model, X_train)

        y_pred_ens = model.predict_proba(X_val_ens)[:, 1]
        fold_cv = log_loss(y_val_ens, y_pred_ens)
        oof_ens[val_index_ens] = y_pred_ens
        print(f'fold {fold} BCE: {fold_cv}')
        cv_score += fold_cv / 5

        # Close the file deterministically instead of leaking the handle.
        with open(f'lgb_fold{fold}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)

    print(f'CV: {cv_score}')

    return oof_ens

In [None]:
# First training pass on all features; also writes lgb_fold{0..4}.pkl, which the
# feature-selection cell below reads back.
oof_ens = train_and_evaluate(all_sub_df, all_sub, all_sub_folds, all_sub_new_folds, all_sub_targets, ensemble_sub_df, ensemble_sub, ensemble_sub_folds, ensemble_sub_new_folds, ensemble_sub_targets)

In [None]:
# feature selection


from sklearn.metrics import roc_auc_score
from tqdm import tqdm

# Permutation importance per fold: for every feature, shuffle its column in the
# fold's validation rows and record how much the ROC AUC drops.
# NOTE(review): the shuffles use NumPy's global, unseeded RNG, so the scores
# differ between runs.
dic_list = []
y_ens = np.array(ensemble_sub_targets).astype(int)
for fold in range(5):
    # These pickles were written by the training run above; never unpickle untrusted files.
    with open(f'lgb_fold{fold}.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    val_index_ens = ensemble_sub_new_folds[ensemble_sub_new_folds == fold].index

    X_val_ens, y_val_ens = ensemble_sub.loc[val_index_ens, :], y_ens[val_index_ens]

    base_score = roc_auc_score(y_val_ens, model.predict_proba(X_val_ens)[:, 1])

    # model.feature_name_ is looked up in a frame whose 'Concluding Statement'
    # columns have been renamed with an underscore. Rename once per fold.
    cols = [col.replace('Concluding Statement', 'Concluding_Statement') for col in X_val_ens.columns]
    X_val_renamed = X_val_ens.copy()
    X_val_renamed.columns = cols

    dic = {}
    for col in tqdm(model.feature_name_):
        val2 = X_val_renamed.copy()
        val2[col] = np.random.permutation(val2[col].values)
        val2['pred'] = model.predict_proba(val2[model.feature_name_])[:, 1]
        perm_score = roc_auc_score(y_val_ens, val2['pred'])
        # Positive value: the model relied on this feature.
        dic[col] = base_score - perm_score

    dic_list.append(dic)

In [None]:
# Average each feature's permutation score over the five folds.
# Feature names come from the last model loaded above.
score_dic = dict.fromkeys(model.feature_name_, 0)
for fold_scores in dic_list:
    for feature in score_dic:
        score_dic[feature] += fold_scores[feature] / 5

In [None]:
# How many features fall at or below the 2e-5 selection cut-off.
sum(1 for score in score_dic.values() if score <= 2e-5)

In [None]:
# Display the mean permutation score of every feature.
score_dic

In [None]:
def train_and_evaluate(all_sub_df, all_sub, all_sub_folds, all_sub_new_folds, all_sub_targets, ensemble_sub_df, ensemble_sub, ensemble_sub_folds, ensemble_sub_new_folds, ensemble_sub_targets):
    """Retrain the per-fold LightGBM classifier on the selected features only.

    NOTE: this redefines (and therefore replaces) the `train_and_evaluate`
    defined in an earlier cell. Differences: only features whose mean
    permutation score in the global `score_dic` exceeds 2e-5 are used,
    'Concluding Statement' is replaced by 'Concluding_Statement' in column
    names, and the hyperparameters differ (lower learning rate, fewer leaves).

    For each of the 5 folds, a model is fitted on the base-model spans outside
    the fold (`all_sub`) and evaluated on the ensemble spans inside the fold
    (`ensemble_sub`), which also drive early stopping. Each fitted model is
    written to `lgb_fold{fold}.pkl`, overwriting any existing file.

    Parameters
    ----------
    all_sub_df, ensemble_sub_df : pandas.DataFrame
        Span metadata; only `class`, `second_class`, `third_class` are read.
    all_sub, ensemble_sub : pandas.DataFrame
        Feature frames. Modified in place: the three class columns are added
        as integer codes and the columns of both frames are renamed. Both
        frames receive the names derived from `ensemble_sub`, so they must
        have identical column order.
    all_sub_folds : pandas.Series
        Read only for the (currently constant) sample weights.
    all_sub_new_folds, ensemble_sub_new_folds : pandas.Series
        Fold number per row, used for the train/validation split.
    all_sub_targets, ensemble_sub_targets : array-like
        Binary target per row.
    ensemble_sub_folds : pandas.Series
        Accepted but not used.

    Returns
    -------
    numpy.ndarray
        Out-of-fold probability of the positive class for every ensemble row.

    Notes
    -----
    Index labels are used to index NumPy arrays, so every frame/series must
    carry a 0..n-1 index.
    """
    # Encode the class labels as integer categories; labels missing from
    # class_dict (such as 'O') become 7.
    all_sub['class'] = all_sub_df['class'].map(class_dict).fillna(7).astype(int)
    all_sub['second_class'] = all_sub_df['second_class'].map(class_dict).fillna(7).astype(int)
    all_sub['third_class'] = all_sub_df['third_class'].map(class_dict).fillna(7).astype(int)

    y = np.array(all_sub_targets).astype(int)
    # The multiplier is 0, so every sample currently has weight 1.
    weight = 1 + (all_sub_folds.isin([2, 3,4])) * 0

    ensemble_sub['class'] = ensemble_sub_df['class'].map(class_dict).fillna(7).astype(int)
    ensemble_sub['second_class'] = ensemble_sub_df['second_class'].map(class_dict).fillna(7).astype(int)
    ensemble_sub['third_class'] = ensemble_sub_df['third_class'].map(class_dict).fillna(7).astype(int)
    y_ens = np.array(ensemble_sub_targets).astype(int)

    # oof = out-of-fold: each ensemble row is predicted by the model whose
    # training data excluded that row's fold.
    oof_ens = np.zeros(len(ensemble_sub))
    cv_score = 0

    # Keep only features whose mean permutation score exceeds the cut-off, and
    # align the frames' column names with the names used as score_dic keys.
    selected_feats = [key for key in score_dic.keys() if score_dic[key] > 2e-5]
    cols = [col.replace('Concluding Statement', 'Concluding_Statement') for col in ensemble_sub.columns]
    ensemble_sub.columns = cols
    all_sub.columns = cols

    # Cross-validated evaluation.
    for fold in range(5):
        train_index = all_sub_new_folds[all_sub_new_folds != fold].index
        val_index_ens = ensemble_sub_new_folds[ensemble_sub_new_folds == fold].index

        X_train, y_train,w_train  = all_sub.loc[train_index, selected_feats], y[train_index], weight[train_index]
        X_val_ens, y_val_ens = ensemble_sub.loc[val_index_ens, selected_feats], y_ens[val_index_ens]

        # Hyperparameters were chosen provisionally, not tuned.
        model = LGBMClassifier(learning_rate=0.01, num_leaves=170, n_estimators=5000, colsample_bytree=0.6, subsample=0.8, subsample_freq=1,
                              min_child_samples=70, max_depth=-1, metrics=['binary_logloss', 'auc'])

        # Early stopping with a patience of 75 rounds. A class column is only
        # declared categorical if it survived feature selection.
        model.fit(X_train, y_train,early_stopping_rounds=75,eval_set=[(X_train, y_train), (X_val_ens, y_val_ens)], sample_weight = w_train, verbose=25, categorical_feature=[col for col in ['class', 'second_class', 'third_class'] if col in selected_feats])

        # Feature importance of the fitted LightGBM model.
        plotImp(model, X_train)

        y_pred_ens = model.predict_proba(X_val_ens)[:, 1]
        fold_cv = log_loss(y_val_ens, y_pred_ens)
        oof_ens[val_index_ens] = y_pred_ens
        print(f'fold {fold} BCE: {fold_cv}')
        cv_score += fold_cv / 5

        # Close the file deterministically instead of leaking the handle.
        with open(f'lgb_fold{fold}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)

    print(f'CV: {cv_score}')

    return oof_ens

In [None]:
# Second training pass, using the redefined train_and_evaluate (selected features only).
oof_ens_2 = train_and_evaluate(all_sub_df, all_sub, all_sub_folds, all_sub_new_folds, all_sub_targets, ensemble_sub_df, ensemble_sub, ensemble_sub_folds, ensemble_sub_new_folds, ensemble_sub_targets)

In [None]:
# Filter the ensemble spans with the stacking model's out-of-fold probability
# ('disc_prob'), post-process the survivors and score them on folds 2-4.
pred3 = ensemble_sub_df.copy()
pred3['disc_prob'] = oof_ens_2

# Thresholds for this experiment; they replace the defaults on the shared cfg
# instance. Trailing comments repeat the values from the original cell.
cfg.proba_thresh = {
        "Lead": 0.49, # 0.49
        "Position": 0.32, # 0.29
        "Evidence": 0.54, # 0.54
        "Claim": 0.44, # 0.44
        "Concluding Statement": 0.49, # 0.49
        "Counterclaim": 0.48, # 0.48
        "Rebuttal": 0.44, # 0.44
    }
cfg.min_token_thresh = {
        "Lead": 4, # 4
        "Position": 3, # 3
        "Evidence": 11, # 11
        "Claim": 1, # 1
        "Concluding Statement": 7, # 7
        "Counterclaim": 4, # 4
        "Rebuttal": 3, # 3
    }
cfg.link = {
        'Evidence': 40,
        'Counterclaim': 200,
        'Rebuttal': 200,
    }

# Per-row probability threshold of the span's class. The lookup must use
# cfg.proba_thresh: target_id_map is keyed by 'B-'/'I-' token tags, so mapping
# plain class names through it yields NaN for every row and silently disables
# the margin rule below.
pred3['proba_thresh'] = pred3['class'].map(cfg.proba_thresh)

# A span is kept when the stacking model gives it more than 0.17 AND at least
# one of the following holds:
clears_threshold_margin = pred3['prob'] - pred3['proba_thresh'] > 0.1
stacker_confident = pred3['disc_prob'] > 0.26
clear_counterclaim = pred3['class'].isin(['Counterclaim']) & (pred3['prob'] - pred3['second_prob'] > 0.25)
is_rebuttal = pred3['class'].isin(['Rebuttal'])

keep = (pred3['disc_prob'] > 0.17) & (
    clears_threshold_margin | stacker_confident | clear_counterclaim | is_rebuttal
)
pred3 = pred3.loc[keep, ['id', 'class', 'predictionstring', 'prob', 'len']]

pred3 = post_process_sub(pred3[['id', 'class', 'predictionstring', 'prob', 'len']])

# Prints one F1 per class (inside score_feedback_comp), then the macro F1.
f1, pred3 = score_feedback_comp(pred3[['id', 'class', 'predictionstring']], test_df[test_df.fold.isin([2,3,4])], return_class_scores=False)
print(f1)

In [None]:
# Score log kept for reference (appears to be pasted output of the scoring cell).
# Each group lists eight values: presumably the per-class F1 scores in the order
# Claim, Concluding Statement, Counterclaim, Evidence, Lead, Position, Rebuttal,
# followed by their mean. The comment above a group appears to record the
# threshold settings used for that run.

# 0.19 1 0.28 counterclaim 0.25
0.6933596454784656
0.874198322644302
0.5936626281453867
0.7840398625948435
0.8470567153480871
0.7459136512230865
0.5047579644187009
0.7204269699789817

#
0.692765370354402
0.8741607637819526
0.5951492537313433
0.7838453858760148
0.8472545161867286
0.7457284172661871
0.5045155993431856
0.7204884723628304





In [None]:
# Score log kept for reference (appears to be pasted output of the scoring cell).
# Each group lists eight values: presumably the per-class F1 scores in the order
# Claim, Concluding Statement, Counterclaim, Evidence, Lead, Position, Rebuttal,
# followed by their mean. The comment above a group appears to record the
# threshold settings used for that run.

# 0.19 0.1 0.28

0.6898861768445864
0.8738632162661738
0.5830446672743846
0.7806852452325345
0.8475780409041981
0.741726327164663
0.5096870342771982
0.7180672439948198


# 0.17 0.1 0.26 add counterclaim

0.6901163568276046
0.8740357999630928
0.5869024592428848
0.7805071225715935
0.8472408790500242
0.7415359763074645
0.5097701855720782
0.7185869685049633


# 0.16 0.1 0.27 add counterclaim 0.45

0.6901163568276046
0.8740357999630928
0.5869024592428848
0.7805071225715935
0.8472408790500242
0.7415359763074645
0.5097701855720782
0.7185869685049633

# 0.18 0.1 0.27 feature selection and lr=0.01
0.6900359501024939
0.8744000590711068
0.5874163804013741
0.7809275723427148
0.8477326456049861
0.7418387629282754
0.5109578921447919
0.7190441803708205

In [None]:
# Score log kept for reference (appears to be pasted output of the scoring cell):
# presumably seven per-class F1 scores (Claim, Concluding Statement, Counterclaim,
# Evidence, Lead, Position, Rebuttal) followed by their mean. The settings used
# for this run are not recorded.
0.686601218261361
0.8728387764149549
0.5840014577259475
0.781573323382747
0.8487255165012074
0.7412634635513388
0.5087546239210851
0.7176797685369489

In [None]:
# Reference result kept for comparison: (overall F1, per-class F1 scores).
# NOTE(review): the run that produced it is not recorded here.
(0.7149, {'Claim': 0.6824, 'Concluding Statement': 0.8737, 'Counterclaim': 0.5845, 'Evidence': 0.7774, 'Lead': 0.8445, 'Position': 0.739, 'Rebuttal': 0.5025})