In [None]:
pip install -r drive/MyDrive/SRWS-PSG/config/requirements.txt --quiet

## Packages

In [None]:
import logging
import os
import pickle
import random
import re
from collections import defaultdict
from time import sleep

import pandas as pd
import numpy as np
from adabelief_tf import AdaBeliefOptimizer
from omegaconf import OmegaConf

from sklearn.metrics import fbeta_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import backend as K

import transformers
from transformers import (
    AutoTokenizer,
    AutoConfig,
    TFAutoModel,
)

# Notebook-wide pandas display settings.
# Fully-qualified option names are used on purpose: pd.set_option treats its key
# as a pattern, and short forms such as 'max_columns' / 'max_rows' also match the
# 'styler.render.*' options in newer pandas releases, which raises OptionError.
pd.options.display.precision = 5
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)

# Raise the TensorFlow logger level to ERROR so lower-severity messages are hidden.
tf.get_logger().setLevel('ERROR')

import matplotlib.pyplot as plt
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

## Parameters

In [None]:
# Experiment configuration: a YAML file loaded through OmegaConf. The resulting
# `args` object is what every function below receives.
# NOTE(review): this is a hard-coded absolute path (Google Drive mount); it has to
# be edited by hand to run a different experiment.
config_path = '/content/drive/MyDrive/SRWS-PSG/config/test/2021-09-21_235528.yml'
args = OmegaConf.load(config_path)

In [None]:
# Create the cache and model directories, but only when the config defines both.
required_dirs = ('cache_dir', 'model_dir')
if all(name in args.keys() for name in required_dirs):
    for name in required_dirs:
        os.makedirs(args[name], exist_ok=True)

## DataIO

In [None]:
def dump(target, filepath, protocol=3):
    """Serialize ``target`` with pickle and write it to ``filepath``.

    Args:
        target: Any picklable object.
        filepath: Destination path; the file is opened in binary write mode.
        protocol: Pickle protocol number (defaults to 3).
    """
    with open(filepath, 'wb') as sink:
        pickle.Pickler(sink, protocol).dump(target)

def load(filepath):
    """Read ``filepath`` in binary mode and return the unpickled object.

    Note: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filepath, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
def create_logger(dir_path):
    """Return the notebook's logger, writing to the console and to a log file.

    Messages go to a stream handler and to ``<dir_path>/experiment.log``, both
    formatted as ``[time][LEVEL]:message``. The logger level is set to INFO.

    The same named logger is reused on every call, so handlers attached by a
    previous call are removed *and closed* first; simply dropping them would leave
    the previous log file open. ``dir_path`` is created when it does not exist,
    because the file handler opens the log file immediately.

    Args:
        dir_path: Directory in which ``experiment.log`` is written.

    Returns:
        The configured ``logging.Logger``.
    """
    os.makedirs(dir_path, exist_ok=True)

    logger = logging.getLogger(__name__)

    # Detach and close handlers from earlier calls (e.g. a re-run of the cell) so
    # messages are not duplicated and the old file descriptor is released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter('[%(asctime)s][%(levelname)s]:%(message)s')
    stream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(os.path.join(dir_path, 'experiment.log'))

    for handler in (stream_handler, file_handler):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    logger.setLevel(logging.INFO)
    return logger

In [None]:
def read_data(args):
    """Load the train and test CSV files from ``args.data_dir``.

    After loading, the ``judgement`` value of training rows 2488 and 7708 is
    forced to 0, and missing ``abstract`` values in both frames are replaced
    with an empty string.

    Returns:
        Tuple ``(df_train, df_test)`` of pandas DataFrames.
    """
    train_path = f"{args.data_dir}/train.csv"
    test_path = f"{args.data_dir}/test.csv"
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)

    # Hard-coded label overrides (presumably known mislabeled samples — TODO confirm).
    overridden_rows = [2488, 7708]
    df_train.loc[overridden_rows, 'judgement'] = 0

    for frame in (df_train, df_test):
        frame['abstract'] = frame['abstract'].fillna("")

    return df_train, df_test

## Model

In [None]:
# Probe for a TPU runtime once. `TPU` ends up holding the cluster resolver when
# one is found and None otherwise; initialize_tpu_and_get_model reads it later.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', TPU.cluster_spec().as_dict()['worker'])
except ValueError:
    # A ValueError here is treated as "no TPU attached".
    TPU = None
    print('INFO: Not connected to a TPU runtime')

INFO: Not connected to a TPU runtime


In [None]:
def build_model(args):
    """Assemble the transformer-based binary classifier as a Keras model.

    In ``'train'`` mode the transformer is loaded with pretrained weights
    (``args.model_name``, ``args.cache_dir``, ``args.from_pt``) and the model is
    compiled with AdaBelief and sigmoid focal cross-entropy. In any other mode
    the transformer is only instantiated from its config and the model is left
    uncompiled.

    The model takes ``input_ids`` and ``attention_mask`` (both of length
    ``args.max_length``) and outputs a single sigmoid value per example.
    """
    is_training = args.mode == 'train'

    if is_training:
        backbone = TFAutoModel.from_pretrained(
            args.model_name,
            cache_dir=args.cache_dir,
            from_pt=args.from_pt,
        )
    else:
        backbone = TFAutoModel.from_config(AutoConfig.from_pretrained(args.model_name))

    token_shape = (args.max_length, )
    input_ids = tf.keras.layers.Input(shape=token_shape, dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=token_shape, dtype=tf.int32, name='attention_mask')

    backbone_outputs = backbone(input_ids, attention_mask=attention_mask)
    # First element of the transformer output, sliced at position 0 of axis 1
    # (presumably the first-token representation of the last hidden state — TODO confirm).
    first_token = backbone_outputs[0][:, 0, :]
    regularized = tf.keras.layers.Dropout(args.dropout_rate)(first_token)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(regularized)

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[probability])

    if is_training:
        model.compile(
            optimizer=AdaBeliefOptimizer(**args.optimizer),
            loss=[tfa.losses.SigmoidFocalCrossEntropy()],
        )

    return model


def initialize_tpu_and_get_model(args):
    """Build the model, inside a TPU strategy scope when a TPU was detected.

    When the module-level ``TPU`` resolver is set, the function connects to the
    cluster, initializes the TPU system and builds the model under a
    ``TPUStrategy`` scope. Otherwise it simply returns ``build_model(args)``.
    """
    if TPU:
        tf.config.experimental_connect_to_cluster(TPU)
        tf.tpu.experimental.initialize_tpu_system(TPU)
        strategy = tf.distribute.TPUStrategy(TPU)

        with strategy.scope():
            return build_model(args)

    return build_model(args)

## DataLoader

In [None]:
class DataProvider:
    """Builds tokenized ``tf.data`` pipelines for the train, validation and test data.

    The cross-validation split is computed once in the constructor with
    ``StratifiedKFold`` on ``args.target_col`` and stored in ``self.indices`` as a
    list of ``(train_idx, valid_idx)`` pairs; every ``fold`` argument below is an
    index into that list.
    """

    def __init__(self, df_train, df_test, args):
        self.args = args
        self.df_train = df_train
        self.df_test = df_test
        self.tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.cache_dir)

        splitter = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=args.seed)
        self.indices = [split for split in splitter.split(df_train, df_train[args.target_col])]

    def tokenize(self, texts):
        """Tokenize ``texts`` to fixed-length (``args.max_length``) TF tensors, returned as a dict."""
        encoded = self.tokenizer.batch_encode_plus(
            texts,
            max_length=self.args.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf',
        )
        return dict(encoded)

    def create_dataset(self, X, y=None, batch_size=None, mode='train'):
        """Wrap features (and optional labels) in a batched, prefetched ``tf.data.Dataset``.

        In ``'train'`` mode the examples are shuffled with a buffer of 2048, and the
        dataset repeats indefinitely when ``args.steps_per_epoch`` is truthy.
        """
        is_training = mode == 'train'
        tensors = X if y is None else (X, y)

        pipeline = tf.data.Dataset.from_tensor_slices(tensors)
        if is_training:
            pipeline = pipeline.shuffle(2048)

        pipeline = pipeline.batch(batch_size)
        pipeline = pipeline.prefetch(tf.data.experimental.AUTOTUNE)

        if is_training and self.args.steps_per_epoch:
            pipeline = pipeline.repeat()

        return pipeline

    def _fold_arrays(self, positions):
        """Return ``(tokenized_features, labels)`` for the given training-frame row positions."""
        features = self.df_train[self.args.feature_cols].iloc[positions]
        encoded = self.tokenize(features.values.tolist())
        labels = self.df_train[self.args.target_col].iloc[positions].values
        return encoded, labels

    def create_train_dataset(self, fold):
        """Return the training dataset for ``fold``."""
        train_idx, _ = self.indices[fold]
        encoded, labels = self._fold_arrays(train_idx)
        return self.create_dataset(encoded, labels, self.args.train_batch_size, mode='train')

    def create_valid_dataset(self, fold):
        """Return ``(validation_dataset, validation_labels)`` for ``fold``."""
        _, valid_idx = self.indices[fold]
        encoded, labels = self._fold_arrays(valid_idx)
        dataset = self.create_dataset(encoded, labels, self.args.valid_batch_size, mode='valid')
        return dataset, labels

    def create_test_dataset(self):
        """Return the label-free dataset built from the test frame."""
        features = self.df_test[self.args.feature_cols]
        encoded = self.tokenize(features.values.tolist())
        return self.create_dataset(encoded, batch_size=self.args.test_batch_size, mode='test')

    def create_valid_labels(self, fold):
        """Return only the validation labels of ``fold`` (no tokenization involved)."""
        valid_idx = self.indices[fold][1]
        return self.df_train[self.args.target_col].iloc[valid_idx].values

## Utils

In [None]:
def seed_everything(seed=2021):
    """Seed Python's ``random``, NumPy and TensorFlow, and set ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator (defaults to 2021).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


def optimize_threshold(y_true, y_pred):
    """Grid-search the decision threshold that maximizes the F-beta score (beta=7).

    1001 evenly spaced float32 thresholds in [0, 1] are tried; a prediction is
    positive when it is strictly greater than the threshold. The first threshold
    reaching the highest score is returned. When no threshold scores above 0,
    the integer 0 is returned.
    """
    scores_arr = np.asarray(y_pred)
    candidates = np.linspace(0, 1, 1001, dtype=np.float32)

    fbeta_per_candidate = [
        fbeta_score(y_true, (scores_arr > candidate).astype(int), beta=7)
        for candidate in candidates
    ]

    # argmax returns the first occurrence of the maximum, matching a scan that
    # only updates on a strict improvement.
    top = int(np.argmax(fbeta_per_candidate))
    if fbeta_per_candidate[top] > 0:
        return candidates[top]
    return 0


def calc_score(y_true, y_pred):
    """Compute AUC plus the F-beta score (beta=7) at the optimized threshold.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted scores for the positive class.

    Returns:
        Dict with keys ``'auc'``, ``'threshold'`` and ``'fbeta_score'``.
    """
    score = {}
    score['auc'] = roc_auc_score(y_true, y_pred)

    threshold = optimize_threshold(y_true, y_pred)
    # Binarize with a strict '>' — the same rule optimize_threshold searched with
    # and the one ensemble() applies — so the reported F-beta is exactly the score
    # that the threshold was selected for.
    y_pred = (np.asarray(y_pred) > threshold).astype(int)

    score['threshold'] = threshold
    score['fbeta_score'] = fbeta_score(y_true, y_pred, beta=7)
    return score


def log_metrics(logger, fold, score, epoch=None):
    """Log one comma-separated line with the fold, optional epoch and every metric.

    Each metric in ``score`` is rendered as ``name: value`` with five decimals
    and the line is emitted through ``logger.info``.
    """
    parts = [f'fold: {fold}']

    if epoch is not None:
        parts.append(f'epoch: {epoch}')

    parts.extend(f"{name}: {value:.5f}" for name, value in score.items())
    logger.info(', '.join(parts))


def create_graph(result, args):
    """Plot the training history of the current fold and save it as a PNG.

    The image is written to ``<args.save_dir>/graphs/fold<args.fold>_history.png``;
    the directory is created when missing.

    Args:
        result: Object exposing a ``history`` mapping (as returned by ``model.fit``).
        args: Config providing ``save_dir`` and ``fold``.
    """
    graph_dir = f"{args.save_dir}/graphs"
    os.makedirs(graph_dir, exist_ok=True)

    history_frame = pd.DataFrame(result.history)
    # The y-axis is pinned to a fixed range of [0, 0.02].
    history_frame.plot(ylim=[0, 0.02])
    plt.savefig(f"{graph_dir}/fold{args.fold}_history.png")


def read_threshold(filepath):
    """Average the "best thresholds" lists recorded in an experiment log.

    Every line containing ``best thresholds`` is expected to end with a Python
    list literal (as written by ``CalcMetricAndSaveModel.on_train_end``). The
    lists are rounded to 3 decimals, averaged over all entries and the mean is
    rounded to 5 decimals.

    The text is parsed with ``ast.literal_eval`` rather than ``eval`` so that a
    log file can never execute code. Scalars rendered as ``np.float32(0.1)``
    (the repr used by newer NumPy releases) are unwrapped to plain literals first.

    Args:
        filepath: Path to the log file.

    Returns:
        The averaged threshold; NaN when the log has no matching line.

    Raises:
        ValueError / SyntaxError: if a matching line does not end with a literal.
    """
    import ast  # local import: only this helper needs it

    results = []
    with open(filepath, 'r') as f:
        for line in f:
            match = re.search(r"best thresholds", line)
            if not match:
                continue

            # Skip the ": " separator that follows the marker text.
            literal = line[match.span()[1] + 2:].strip()
            literal = re.sub(r"np\.float\d*\(([^()]*)\)", r"\1", literal)
            results.append(ast.literal_eval(literal))

    return np.round(np.round(results, 3).mean(), 5)

## Trainer

In [None]:
class CalcMetricAndSaveModel(tf.keras.callbacks.Callback):
    """Keras callback that scores the validation set each epoch and keeps the n best weights.

    After every epoch the model predicts on ``valid_dataset``; AUC, the optimized
    threshold and F-beta are computed via ``calc_score`` and merged with the Keras
    ``logs``. ``args.checkpoint`` configures which metric is monitored
    (``monitor``), whether higher or lower is better (``mode``: ``'max'`` or
    ``'min'``), how many checkpoints are kept (``n_best``) and the initial score
    of every slot (``default_score``).

    Whenever the monitored metric beats the *worst* of the currently kept scores,
    the weights are written to that slot's file and the score and threshold of
    the slot are updated.
    """

    def __init__(self, valid_dataset, valid_labels, logger, args):
        super().__init__()

        mode_candidates = {'min': 0, 'max': 1}
        params = args.checkpoint

        self.valid_dataset = valid_dataset
        self.valid_labels = valid_labels
        self.logger = logger
        self.args = args

        self.monitor = params.monitor
        # 1 when a larger metric is better, 0 when a smaller metric is better.
        self.mode = mode_candidates[params.mode]

        self.best_scores = [params.default_score] * params.n_best
        self.best_thresholds = [0.1] * params.n_best

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and save the weights if they enter the top n."""
        preds = self.model.predict(self.valid_dataset).reshape(-1)
        score = calc_score(self.valid_labels, preds)
        # Keras may pass logs=None; dict.update(None) would raise TypeError.
        score.update(logs or {})
        log_metrics(self.logger, self.args.fold, score, epoch)

        # Pick the slot holding the worst kept score: the smallest one when
        # maximizing, the largest one when minimizing. Always using argmin would,
        # in 'min' mode, overwrite the best checkpoint instead of the worst.
        if self.mode:
            idx = int(np.argmin(self.best_scores))
        else:
            idx = int(np.argmax(self.best_scores))
        best_score = self.best_scores[idx]

        if self.mode:
            judgement = score[self.monitor] > best_score
        else:
            judgement = score[self.monitor] < best_score

        if judgement:
            model_path = f"{self.args.model_dir}/{os.path.basename(self.args.model_name)}_fold{self.args.fold}_{idx}.h5"
            msg = f"{self.monitor} improved from {best_score:.5f} to {score[self.monitor]:.5f}, "
            msg += f"saving model to {model_path}"
            self.logger.info(msg)

            self.save_model(model_path)

            self.best_scores[idx] = score[self.monitor]
            self.best_thresholds[idx] = score['threshold']
        else:
            msg = f"{self.monitor} did not improve from {best_score:.5f}"
            self.logger.info(msg)

    def on_train_end(self, logs=None):
        """Log the kept scores and thresholds together with their averages."""
        self.logger.info(f"best scores: {self.best_scores}")
        self.logger.info(f"best thresholds: {self.best_thresholds}")
        self.logger.info(f"average score: {np.mean(self.best_scores):.5f}")
        self.logger.info(f"average threshold: {np.mean(self.best_thresholds):.5f}")

    def save_model(self, model_path, tries=3, delay=5):
        """Save the model weights, retrying on failure.

        Args:
            model_path: Destination file for ``save_weights``.
            tries: Maximum number of attempts.
            delay: Seconds to wait between attempts.

        Raises:
            RuntimeError: if every attempt failed (chained to the last error).
        """
        last_error = None
        for i in range(1, tries + 1):
            try:
                self.model.save_weights(model_path)
            except Exception as e:
                last_error = e
                self.logger.warning(f"{e.__class__.__name__}: {e} retry:{i}/{tries}")
                # No point in waiting after the final attempt.
                if i < tries:
                    sleep(delay)
            else:
                return

        raise RuntimeError("Could not save model") from last_error

In [None]:
def train_cv(data_provider, logger, args):
    """Train one model per fold listed in ``args.folds``.

    For every fold the current fold number is stored in ``args.fold``, the Keras
    session is cleared, all generators are seeded with ``args.seed + fold`` and a
    fresh model is built. Checkpointing and metric logging are delegated to
    ``CalcMetricAndSaveModel``; the training history is plotted afterwards.
    """
    logger.info(args)

    for fold in args.folds:
        args.update({'fold': fold})
        K.clear_session()
        seed_everything(args.seed + fold)

        model = initialize_tpu_and_get_model(args)
        train_dataset = data_provider.create_train_dataset(fold)
        valid_dataset, valid_labels = data_provider.create_valid_dataset(fold)

        checkpointing = CalcMetricAndSaveModel(valid_dataset, valid_labels, logger, args)
        fit_options = dict(
            epochs=args.epochs,
            verbose=1,
            callbacks=[checkpointing],
            validation_data=valid_dataset,
            steps_per_epoch=args.steps_per_epoch,
        )
        history = model.fit(train_dataset, **fit_options)

        create_graph(history, args)

In [None]:
def evaluate(data_provider, logger, args):
    """Score every saved checkpoint on its validation fold and store the results.

    For each fold in ``args.folds`` and each of the ``args.checkpoint.n_best``
    checkpoints, the weights are loaded, the validation fold is predicted and
    the metrics are logged. Raw predictions go to
    ``<save_dir>/oof_train/fold<fold>.csv`` (one column per checkpoint) and the
    optimized thresholds of all folds to ``<save_dir>/thresholds.yml``.
    """
    model = initialize_tpu_and_get_model(args)

    oof_dir = f"{args.save_dir}/oof_train"
    os.makedirs(oof_dir, exist_ok=True)

    model_stem = os.path.basename(args.model_name)
    fold_thresholds = {}

    for fold in args.folds:
        fold_predictions = pd.DataFrame()

        for idx in range(args.checkpoint.n_best):
            model.load_weights(f"{args.model_dir}/{model_stem}_fold{fold}_{idx}.h5")

            # The dataset is rebuilt for every checkpoint, after the weights are loaded.
            valid_dataset, y_true = data_provider.create_valid_dataset(fold)
            y_pred = model.predict(valid_dataset, verbose=1).reshape(-1)

            score = calc_score(y_true, y_pred)
            log_metrics(logger, fold, score)

            fold_predictions[f"fold{fold}_{idx}_raw"] = y_pred
            fold_thresholds.setdefault(f"fold{fold}", []).append(float(score['threshold']))

        fold_predictions.to_csv(f"{oof_dir}/fold{fold}.csv", index=None)

    OmegaConf.save(OmegaConf.create(fold_thresholds), f"{args.save_dir}/thresholds.yml")

In [None]:
def predict(data_provider, logger, args):
    """Predict the test set with every saved checkpoint.

    One column of raw predictions is produced per (fold, checkpoint) pair. The
    table is written to ``<save_dir>/oof_test.csv`` and also returned.
    """
    model = initialize_tpu_and_get_model(args)
    model_stem = os.path.basename(args.model_name)

    test_predictions = pd.DataFrame()
    for fold in args.folds:
        for idx in range(args.checkpoint.n_best):
            model.load_weights(f"{args.model_dir}/{model_stem}_fold{fold}_{idx}.h5")

            # Original author's note: the dataset has to be created here, inside
            # the loop (the reason is not documented).
            test_dataset = data_provider.create_test_dataset()
            column = f"fold{fold}_{idx}_raw"
            test_predictions[column] = model.predict(test_dataset, verbose=1).reshape(-1)

    test_predictions.to_csv(f"{args.save_dir}/oof_test.csv", index=None)
    return test_predictions

In [None]:
def create_submission(preds, args):
    """Write the submission file by filling the sample submission with ``preds``.

    ``<data_dir>/sample_submit.csv`` (no header row) supplies the ids; its second
    column is replaced by ``preds`` and the result is saved, again without header
    or index, as ``<save_dir>/submission.csv``.
    """
    template = pd.read_csv(f"{args.data_dir}/sample_submit.csv", header=None)
    template.columns = ['id', 'judgement']

    filled = template.assign(judgement=preds)
    filled.to_csv(f"{args.save_dir}/submission.csv", index=False, header=False)


def ensemble(args):
    """Average the test predictions of several experiments and write a submission.

    ``<drive_dir>/experiments/<date>/oof_test.csv`` is read for every entry of
    ``args.dates``; all prediction columns are placed side by side and averaged
    per row. A row is labelled 1 when its mean is strictly greater than the mean
    of ``args.thresholds``.

    Raises:
        ValueError: if ``args.dates`` is empty, since there is nothing to average.
    """
    if not args.dates:
        raise ValueError("args.dates is empty: no experiment predictions to ensemble")

    save_dir = f"{args.drive_dir}/experiments"

    # Read every file first and concatenate once, instead of re-copying a
    # growing DataFrame on each loop iteration.
    frames = [pd.read_csv(f"{save_dir}/{date}/oof_test.csv") for date in args.dates]
    df = pd.concat(frames, axis=1)

    threshold = np.mean(args.thresholds)
    preds = (df.mean(axis=1) > threshold).astype(int).values

    create_submission(preds, args)

## Main

In [None]:
def main(args):
    """Run the pipeline stage selected by ``args.mode``.

    ``'ensemble'`` only combines existing predictions and returns early. The
    other modes (``'train'``, ``'evaluate'``, ``'predict'``) first set up the
    logger and the data provider and then dispatch to the matching function.
    """
    if args.mode == 'ensemble':
        ensemble(args)
        return

    logger = create_logger(args.save_dir)
    data_provider = DataProvider(*read_data(args), args)

    stages = {
        'train': train_cv,
        'evaluate': evaluate,
        'predict': predict,
    }
    run_stage = stages[args.mode]
    run_stage(data_provider, logger, args)

# Entry point of the notebook: run the stage configured in `args.mode`.
main(args)