In [1]:
# Experiment identifier; embedded in the Colab checkpoint path and in the submission file name.
NAME = "ex11_ex15_ex16_pseudo3_votingclassifier-fit_v2"

In [2]:
import os, sys

# Select the checkpoint / output / input directories from the runtime environment.
if "google.colab" in sys.modules:
    # Google Colab: checkpoints live on Google Drive and the project code directory
    # is appended to the import path.
    CP_DIR = f"/content/drive/MyDrive/Work/probspace_religious_art/notebook/{NAME}_colab/output"
    OUTPUT_DIR = "output"
    INPUT_DIR = "./eda_output/output"
    sys.path.append("/content/drive/MyDrive/Work/probspace_religious_art/code")
elif "kaggle_web_client" in sys.modules:
    # NOTE(review): nothing is assigned in this branch, so CP_DIR / OUTPUT_DIR / INPUT_DIR
    # stay undefined and the first use of INPUT_DIR raises NameError — confirm whether
    # this environment is meant to be supported.
    pass
elif "/kqi/output" in os.getcwd():
    # NOTE(review): as in the Kaggle branch, no directory variables are assigned here.
    pass
else:
    # local
    CP_DIR = "output"
    OUTPUT_DIR = "output"
    INPUT_DIR = "../../eda/output"

In [3]:
# hyperparameters
n_seeds = 5  # number of CV repetitions; the repetition index is also used as the split seed
n_splits = 5  # number of stratified folds per repetition
shuffle = True  # default `shuffle` argument of cv_split
label_smoothing = 1e-3  # smoothing applied to the targets inside the weight-optimisation objective
patience = 5000  # objective evaluations without validation improvement before early stopping

# Load the training set

In [4]:
import pandas as pd

# ====================================================
# Data Load
# ====================================================
# Training metadata; the cells below use its "label" column.
train = pd.read_csv(INPUT_DIR + "/train.csv")

train_size = train.shape[0]  # number of training rows
n_classes = 13  # hard-coded number of label classes
columns = list(range(n_classes))  # integer class ids, used as column names for targets/predictions

In [5]:
train

Unnamed: 0,image_id,label
0,0,5
1,1,11
2,2,8
3,3,2
4,4,6
...,...,...
649,649,2
650,650,3
651,651,2
652,652,0


In [6]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, n_classes, nan_as_category=True):
    """One-hot encode every column of ``df`` and name the result columns 0..n_classes-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all treated as categorical.
    n_classes : int
        Expected number of indicator columns after encoding.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``; when True, NaN gets
        its own indicator column (which counts towards ``n_classes``).

    Returns
    -------
    pandas.DataFrame
        Indicator frame with integer column names ``0 .. n_classes - 1`` in
        the order produced by ``pd.get_dummies``.

    Raises
    ------
    ValueError
        If the encoding does not produce exactly ``n_classes`` columns
        (for example when a class never occurs in ``df``).
    """
    encoded = pd.get_dummies(df, columns=df.columns, dummy_na=nan_as_category)

    # Renaming is positional, so a width mismatch would silently mislabel classes.
    if encoded.shape[1] != n_classes:
        raise ValueError(
            f"one-hot encoding produced {encoded.shape[1]} columns, expected {n_classes}"
        )

    # Derive the names from the argument instead of relying on a module-level global.
    encoded.columns = list(range(n_classes))
    return encoded
# One-hot target matrix built from the training labels (no separate NaN column).
Y = one_hot_encoder(train[["label"]], n_classes, nan_as_category=False)
Y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,0,0,1,0,0,0,0,0,0,0,0,0,0
650,0,0,0,1,0,0,0,0,0,0,0,0,0
651,0,0,1,0,0,0,0,0,0,0,0,0,0
652,1,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
def score(y, y_pred, eps=1e-15, label_smoothing=0.0):
    """Mean binary cross-entropy over all flattened (target, prediction) pairs.

    Parameters
    ----------
    y : array-like
        Targets; flattened before use.
    y_pred : array-like
        Predicted probabilities; flattened and clipped to ``[eps, 1 - eps]``.
    eps : float, default 1e-15
        Clipping margin that keeps the logarithms finite.
    label_smoothing : float, default 0.0
        When positive, targets become ``y * (1 - s) + 0.5 * s``.

    Returns
    -------
    float
        The negative mean log-likelihood (log loss).
    """
    targets = np.asarray(y).ravel()
    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    probs = np.clip(np.asarray(y_pred).ravel(), eps, 1.0 - eps)

    positive_term = targets * np.log(probs)
    negative_term = (1.0 - targets) * np.log(1.0 - probs)
    return -np.mean(positive_term + negative_term)  # log_loss

# 確信度のデータの読み込み

In [8]:
 # Directory that holds the out-of-fold (OOF) prediction CSVs
CSV_DIR = "inf_cv"

In [9]:
import glob
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [10]:
# Paths of the OOF confidence CSVs.
# glob gives no ordering guarantee (it depends on the file system). The per-model
# weights learned from these files are later applied to the test predictions by
# position, so the list is sorted to make the model order deterministic.
paths = [Path(p) for p in sorted(glob.glob(CSV_DIR + "/*.csv"))]
n_models = len(paths)
print(n_models, paths)

9 [WindowsPath('inf_cv/ex11_pseudo3_inf_cv_seed0.csv'), WindowsPath('inf_cv/ex11_pseudo3_inf_cv_seed1.csv'), WindowsPath('inf_cv/ex11_pseudo3_inf_cv_seed2.csv'), WindowsPath('inf_cv/ex15_pseudo3_inf_cv_seed0.csv'), WindowsPath('inf_cv/ex15_pseudo3_inf_cv_seed1.csv'), WindowsPath('inf_cv/ex15_pseudo3_inf_cv_seed2.csv'), WindowsPath('inf_cv/ex16_pseudo3_inf_cv_seed0.csv'), WindowsPath('inf_cv/ex16_pseudo3_inf_cv_seed1.csv'), WindowsPath('inf_cv/ex16_pseudo3_inf_cv_seed2.csv')]


In [11]:
# Keep only the OOF confidence columns and score every model on its own
result = pd.DataFrame(index=paths)
Y_preds = []
for i, path in enumerate(paths):
    # NOTE(review): rows are ordered by image_id and then compared with `Y` positionally;
    # this assumes `train` is in the same image_id order — confirm.
    Y_pred = pd.read_csv(path).sort_values(by="image_id")
    # Drop the non-prediction columns; what remains is presumably one confidence column per class.
    Y_pred = Y_pred.drop(["image_id", "file_path", "label"], axis=1)
    
    Y_preds.append(Y_pred.values)
    
    result.loc[path, "oof_logloss"] = score(Y, Y_pred)
    result.loc[path, "oof_accuracy_score"] = accuracy_score(Y.values.argmax(1), Y_pred.values.argmax(1))
    
# Stack the per-model arrays so that axis 0 indexes the model
Y_preds = np.asarray(Y_preds)

In [12]:
result

Unnamed: 0,oof_logloss,oof_accuracy_score
inf_cv\ex11_pseudo3_inf_cv_seed0.csv,0.180886,0.671254
inf_cv\ex11_pseudo3_inf_cv_seed1.csv,0.19539,0.671254
inf_cv\ex11_pseudo3_inf_cv_seed2.csv,0.193809,0.666667
inf_cv\ex15_pseudo3_inf_cv_seed0.csv,0.215463,0.674312
inf_cv\ex15_pseudo3_inf_cv_seed1.csv,0.203645,0.657492
inf_cv\ex15_pseudo3_inf_cv_seed2.csv,0.19501,0.66055
inf_cv\ex16_pseudo3_inf_cv_seed0.csv,0.14244,0.674312
inf_cv\ex16_pseudo3_inf_cv_seed1.csv,0.140975,0.675841
inf_cv\ex16_pseudo3_inf_cv_seed2.csv,0.136883,0.668196


In [13]:
# Check how correlated the models' OOF predictions are with each other
corr = pd.DataFrame(np.zeros((n_models, n_models)), columns=paths, index=paths)

# Only the strictly lower triangle is filled; the diagonal and upper triangle stay 0.
for i in range(n_models):
    for j in range(i):
        df = pd.DataFrame(Y_preds[i])
        other = pd.DataFrame(Y_preds[j])

        # Column-wise correlation between the two models, averaged over the columns
        corr.iloc[i, j] = df.corrwith(other).mean()

corr.style.background_gradient(cmap="Blues", subset=paths, vmax=1.0, vmin=0.0)

Unnamed: 0,inf_cv\ex11_pseudo3_inf_cv_seed0.csv,inf_cv\ex11_pseudo3_inf_cv_seed1.csv,inf_cv\ex11_pseudo3_inf_cv_seed2.csv,inf_cv\ex15_pseudo3_inf_cv_seed0.csv,inf_cv\ex15_pseudo3_inf_cv_seed1.csv,inf_cv\ex15_pseudo3_inf_cv_seed2.csv,inf_cv\ex16_pseudo3_inf_cv_seed0.csv,inf_cv\ex16_pseudo3_inf_cv_seed1.csv,inf_cv\ex16_pseudo3_inf_cv_seed2.csv
inf_cv\ex11_pseudo3_inf_cv_seed0.csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
inf_cv\ex11_pseudo3_inf_cv_seed1.csv,0.768165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
inf_cv\ex11_pseudo3_inf_cv_seed2.csv,0.783421,0.77446,0.0,0.0,0.0,0.0,0.0,0.0,0.0
inf_cv\ex15_pseudo3_inf_cv_seed0.csv,0.803819,0.770995,0.773028,0.0,0.0,0.0,0.0,0.0,0.0
inf_cv\ex15_pseudo3_inf_cv_seed1.csv,0.760739,0.771679,0.758969,0.778076,0.0,0.0,0.0,0.0,0.0
inf_cv\ex15_pseudo3_inf_cv_seed2.csv,0.77249,0.756284,0.79282,0.780914,0.767117,0.0,0.0,0.0,0.0
inf_cv\ex16_pseudo3_inf_cv_seed0.csv,0.770447,0.747563,0.7499,0.783334,0.735036,0.74475,0.0,0.0,0.0
inf_cv\ex16_pseudo3_inf_cv_seed1.csv,0.767813,0.782111,0.772831,0.781982,0.778507,0.747064,0.818012,0.0,0.0
inf_cv\ex16_pseudo3_inf_cv_seed2.csv,0.776747,0.768856,0.78794,0.777334,0.767283,0.781744,0.820541,0.831126,0.0


# Optimize the weights of the weighted-average ensemble

In [14]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.optimize import minimize_scalar
from sklearn.metrics import accuracy_score

In [15]:
class ObjectiveWithEarlyStopping(object):
    """Callable objective that tracks a validation score and stops early.

    Each call evaluates ``self._objective`` on the training data (the value
    returned to the optimizer) and on the validation data (used only for
    bookkeeping). The parameter vector with the lowest validation score seen
    so far is kept in ``_best_params``. After ``patience`` consecutive calls
    without a validation improvement a ``RuntimeError`` is raised.

    Subclasses must provide ``_objective``, a callable invoked as
    ``_objective(params, y_true, y_preds, label_smoothing=...)``.

    Parameters
    ----------
    y_true, y_preds
        Training targets and predictions, converted with ``np.asarray``.
    y_true_valid, y_preds_valid
        Validation targets and predictions, converted with ``np.asarray``.
    label_smoothing : float, default 0.0
        Forwarded to ``_objective`` on every evaluation.
    patience : int, default 30
        Number of consecutive non-improving calls tolerated before stopping.
    """

    def __init__(
        self, y_true, y_preds, y_true_valid, y_preds_valid, label_smoothing=0.0, patience=30
    ):
        self.y_true = np.asarray(y_true)
        self.y_preds = np.asarray(y_preds)
        self.y_true_valid = np.asarray(y_true_valid)
        self.y_preds_valid = np.asarray(y_preds_valid)
        self.label_smoothing = label_smoothing
        self.patience = patience

        self._nit = 0  # number of objective evaluations so far
        self._wait = 0  # consecutive evaluations without validation improvement
        self._best_score = np.inf
        # Initialised here so that reading it before any improving evaluation
        # gives None instead of raising AttributeError.
        self._best_params = None
        # Never updated by this class; kept only so existing readers of the
        # attribute keep working.
        self._best_weights = None

    def __call__(self, params):
        """Evaluate the objective at ``params`` and update the early-stopping state.

        Returns
        -------
        The training-set objective value.

        Raises
        ------
        RuntimeError
            When ``patience`` consecutive calls brought no validation improvement.
        """
        train_score = self._objective(
            params,
            self.y_true,
            self.y_preds,
            label_smoothing=self.label_smoothing,
        )
        valid_score = self._objective(
            params,
            self.y_true_valid,
            self.y_preds_valid,
            label_smoothing=self.label_smoothing,
        )

        self._nit += 1

        if valid_score < self._best_score:
            self._wait = 0
            self._best_score = valid_score
            # Store a copy: the caller may reuse or mutate the array it passed in,
            # which would otherwise silently change the recorded best parameters.
            self._best_params = np.copy(params)
        else:
            self._wait += 1

        if self._wait >= self.patience:
            raise RuntimeError(f"Epoch {self._nit}: early stopping")

        return train_score


#class ObjectiveForThresholdWithEarlyStopping(ObjectiveWithEarlyStopping):
#    @property
#    def _objective(self):
#         return objective_for_threshold


class ObjectiveForWeightsWithEarlyStopping(ObjectiveWithEarlyStopping):
    """Early-stopping objective whose score function is ``objective_for_weights``."""

    @property
    def _objective(self):
        # Looked up on access, so the module-level function only has to exist
        # by the time the objective is first evaluated.
        scorer = objective_for_weights
        return scorer


#def objective_for_threshold(exponent, y_true, y_pred, label_smoothing=0.0):
#    threshold = 10.0 ** exponent
#    y_pred = y_pred.copy()
#    y_pred[y_pred < threshold] = 0.0
#    y_pred[y_pred > 1.0 - threshold] = 1.0
#
#    return score(y_true, y_pred, label_smoothing=label_smoothing)


def objective_for_weights(weights, y_true, y_preds, label_smoothing=0.0):
    """Score the prediction obtained by weighting ``y_preds`` along its first axis.

    ``weights`` is contracted with axis 0 of ``y_preds``, and the blended
    prediction is passed to ``score`` together with ``y_true`` and
    ``label_smoothing``. Returns whatever ``score`` returns.
    """
    blended = np.tensordot(weights, y_preds, axes=(0, 0))
    loss = score(y_true, blended, label_smoothing=label_smoothing)
    return loss


In [16]:
from sklearn.model_selection import StratifiedKFold

def cv_split(df, seed, n_splits=n_splits, shuffle=shuffle):
    """Assign a stratified fold id to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "label" column, which is used for stratification.
    seed : int
        Random state of the splitter; only applied when ``shuffle`` is True.
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether rows are shuffled before splitting.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer "fold" column. The fold sizes are
        printed as a side effect.
    """
    folds = df.copy()
    # scikit-learn rejects a non-None random_state when shuffle is False.
    cv = StratifiedKFold(
        n_splits=n_splits, shuffle=shuffle, random_state=seed if shuffle else None
    )

    folds["fold"] = -1
    fold_col = folds.columns.get_loc("fold")
    for j, (_, valid_idx) in enumerate(cv.split(df, df["label"])):
        # cv.split yields positional indices, so assign by position; label-based
        # .loc would hit the wrong rows when df does not have a default RangeIndex.
        folds.iloc[valid_idx, fold_col] = j

    folds["fold"] = folds["fold"].astype(int)
    print(folds.groupby(["fold"]).size())
    return folds

In [17]:
#%%time
# Accumulator for the blended out-of-fold predictions, averaged over the CV repetitions.
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, dtype="float", index=Y.index)

# Per-class model weights and evaluation counts, averaged over repetitions and folds.
weights = np.zeros((n_classes, n_models))
n_iters = np.zeros(n_classes)

# Start from a uniform blend; each weight is bounded to [0, 1] and the weights must sum to 1.
x0 = np.ones(n_models) / n_models
bounds = [(0.0, 1.0) for _ in range(n_models)]
constraints = {
    "type": "eq",
    "fun": lambda x: np.sum(x) - 1.0,
    "jac": lambda x: np.ones_like(x),
}
# NOTE(review): ftol=0.0 with a very large maxiter presumably leaves termination to the
# objective's early stopping rather than to SLSQP's own convergence test — confirm.
options = {"ftol": 0.0, "maxiter": 1_000_000}

for i in range(n_seeds):
    # The repetition index doubles as the seed of the stratified split.
    fold = cv_split(train, i, n_splits=n_splits)
    
    for j in range(n_splits):
        # Index labels are used as positions below, which relies on `train` having a default index.
        train_idx = fold[fold["fold"] != j].index
        valid_idx = fold[fold["fold"] == j].index
        
        # A separate weight vector is fitted for every class column.
        for k in range(n_classes):
            objective = ObjectiveForWeightsWithEarlyStopping(
                Y.iloc[train_idx, [k]],
                Y_preds[:, train_idx, [k]],
                Y.iloc[valid_idx, [k]],
                Y_preds[:, valid_idx, [k]],
                label_smoothing=label_smoothing,
                patience=patience,
            )

            try:
                # `res` is not used afterwards; the result is read from `objective` instead.
                res = minimize(
                    objective,
                    x0,
                    bounds=bounds,
                    constraints=constraints,
                    method="SLSQP",
                    options=options,
                )
            except RuntimeError:
                # The objective raises RuntimeError to signal early stopping.
                # NOTE(review): any other RuntimeError from the optimizer is swallowed here too.
                pass

            # Use the weights with the best validation score, not the optimizer's final iterate.
            weights[k] += objective._best_params / n_seeds / n_splits
            n_iters[k] += objective._nit / n_seeds / n_splits

            # Only this fold's validation rows are updated, so each repetition contributes
            # one prediction per row; dividing by n_seeds averages over the repetitions.
            Y_pred.iloc[valid_idx, k] += np.tensordot(
                objective._best_params, Y_preds[:, valid_idx, [k]], axes=(0, 0)
            ) / n_seeds

#pickle.dump(weights, open("weights.pkl", 'wb'))

fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64
fold
0    131
1    131
2    131
3    131
4    130
dtype: int64


In [18]:
# Average each model's weight over the classes and add it to the per-model metrics table.
result["weights_mean"] = np.mean(weights, axis=0)

result.style.background_gradient(cmap="Blues")

Unnamed: 0,oof_logloss,oof_accuracy_score,weights_mean
inf_cv\ex11_pseudo3_inf_cv_seed0.csv,0.180886,0.671254,0.111055
inf_cv\ex11_pseudo3_inf_cv_seed1.csv,0.19539,0.671254,0.118242
inf_cv\ex11_pseudo3_inf_cv_seed2.csv,0.193809,0.666667,0.135804
inf_cv\ex15_pseudo3_inf_cv_seed0.csv,0.215463,0.674312,0.114106
inf_cv\ex15_pseudo3_inf_cv_seed1.csv,0.203645,0.657492,0.088051
inf_cv\ex15_pseudo3_inf_cv_seed2.csv,0.19501,0.66055,0.07556
inf_cv\ex16_pseudo3_inf_cv_seed0.csv,0.14244,0.674312,0.114028
inf_cv\ex16_pseudo3_inf_cv_seed1.csv,0.140975,0.675841,0.110804
inf_cv\ex16_pseudo3_inf_cv_seed2.csv,0.136883,0.668196,0.132349


In [19]:
# Per-class weight table: one row per class, one column per model.
# Note that this rebinds `result`, which until now held the per-model OOF metrics.
result = pd.DataFrame(weights, columns=paths, index=Y.columns)
result["n_pos"] = Y.sum()  # number of positive training samples of each class
result["n_iter"] = n_iters  # mean number of objective evaluations per fit

result.style.background_gradient(cmap="Blues", subset=paths, vmax=1.0, vmin=0.0)

Unnamed: 0,inf_cv\ex11_pseudo3_inf_cv_seed0.csv,inf_cv\ex11_pseudo3_inf_cv_seed1.csv,inf_cv\ex11_pseudo3_inf_cv_seed2.csv,inf_cv\ex15_pseudo3_inf_cv_seed0.csv,inf_cv\ex15_pseudo3_inf_cv_seed1.csv,inf_cv\ex15_pseudo3_inf_cv_seed2.csv,inf_cv\ex16_pseudo3_inf_cv_seed0.csv,inf_cv\ex16_pseudo3_inf_cv_seed1.csv,inf_cv\ex16_pseudo3_inf_cv_seed2.csv,n_pos,n_iter
0,0.121497,0.155134,0.066115,0.136007,0.055385,0.126127,0.143853,0.057052,0.138829,60,1664.76
1,0.134478,0.163761,0.113588,0.056707,0.069989,0.091013,0.119016,0.112862,0.138586,42,2417.84
2,0.117129,0.015286,0.012798,0.058015,0.134313,0.062337,0.220881,0.124416,0.254824,132,1123.88
3,0.070955,0.096027,0.110179,0.085234,0.101725,0.136823,0.120325,0.155638,0.123095,42,1753.56
4,0.131099,0.14644,0.064563,0.071895,0.070976,0.091567,0.089342,0.14073,0.193387,42,2637.48
5,0.047434,0.074851,0.073154,0.082142,0.107732,0.121495,0.238357,0.12462,0.130215,60,2353.96
6,0.059447,0.19252,0.281989,0.050019,0.057961,0.048639,0.122418,0.129458,0.05755,48,1975.24
7,0.212161,0.006568,0.281668,0.438853,0.013217,0.018261,0.015872,0.008283,0.005118,30,3915.92
8,0.04754,0.152484,0.227543,0.050636,0.158469,0.041463,0.081522,0.19721,0.043133,30,2523.04
9,0.051457,0.282973,0.24659,0.021964,0.030406,0.018541,0.024089,0.151842,0.172138,66,1318.64


In [20]:
# Log loss of the blended out-of-fold predictions
score(Y[columns], Y_pred[columns])

0.11096312022075137

In [21]:
# Accuracy of the blended out-of-fold predictions (arg-max class vs. true class)
accuracy_score(Y.values.argmax(1), Y_pred[columns].values.argmax(1))

0.7385321100917431

# test set ensemble

In [22]:
test_df = pd.read_csv(INPUT_DIR + "/test.csv")

# Load every model's test predictions.
# os.walk yields directories and files in arbitrary order, while the ensemble weights
# are applied by position, so both are sorted to get a deterministic model order.
test_preds_list = []
for dirname, dirnames, filenames in os.walk('inf_test'):
    dirnames.sort()  # in-place sort also fixes the order in which os.walk descends
    for filename in sorted(filenames):
        if not filename.endswith(".csv"):
            # Skip stray non-CSV files; each one would otherwise be read as a model.
            continue
        print(filename)
        path_name = os.path.join(dirname, filename)
        df = pd.read_csv(path_name).sort_values(by="image_id")
        # Drop the non-prediction columns, as done for the OOF files
        df = df.drop(["image_id", "label", "file_path"], axis=1)
        test_preds_list.append(df.values)

ex11_pseudo3_inf_test_seed0.csv
ex11_pseudo3_inf_test_seed1.csv
ex11_pseudo3_inf_test_seed2.csv
ex15_pseudo3_inf_test_seed0.csv
ex15_pseudo3_inf_test_seed1.csv
ex15_pseudo3_inf_test_seed2.csv
ex16_pseudo3_inf_test_seed0.csv
ex16_pseudo3_inf_test_seed1.csv
ex16_pseudo3_inf_test_seed2.csv


In [23]:
# Optimised weights: one row per model, one column per class.
# Taken from `weights` directly. `result.T.values` holds the same numbers, but it also
# carries the appended "n_pos" / "n_iter" rows and depends on which table `result`
# currently refers to.
ens_weights = weights.T

# The weights are matched to the prediction files by position, so the counts must agree.
if len(test_preds_list) != n_models:
    raise ValueError(
        f"expected {n_models} test prediction files, found {len(test_preds_list)}"
    )

test_preds_ensemble = np.zeros((test_df.shape[0], n_classes))
for model_weights, pre in zip(ens_weights, test_preds_list):
    # Per-class weights broadcast across the rows of this model's predictions
    test_preds_ensemble += model_weights * pre
display(pd.DataFrame(test_preds_ensemble))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.017190,0.017666,0.448580,0.003807,0.002742,0.420865,0.025651,0.000433,0.001133,0.003544,0.007972,0.004829,0.000581
1,0.068430,0.072769,0.060694,0.074476,0.036962,0.010709,0.026166,0.001349,0.003505,0.014184,0.014574,0.618219,0.042146
2,0.001615,0.004640,0.029480,0.037478,0.013587,0.019290,0.006239,0.000247,0.008932,0.007998,0.861609,0.011441,0.003161
3,0.068842,0.015028,0.139522,0.407718,0.232576,0.017557,0.005274,0.001178,0.067313,0.015084,0.032214,0.007875,0.008257
4,0.114389,0.313993,0.053507,0.212881,0.035438,0.107267,0.022851,0.001142,0.002321,0.032111,0.008127,0.121943,0.011402
...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,0.025968,0.004363,0.523694,0.009305,0.061068,0.073599,0.001101,0.003981,0.047368,0.007706,0.043589,0.019282,0.038107
493,0.114327,0.008103,0.424579,0.001671,0.061769,0.148278,0.016071,0.000854,0.006304,0.117460,0.001015,0.010707,0.019582
494,0.000785,0.001508,0.004631,0.001142,0.001002,0.003154,0.003043,0.000231,0.982583,0.002540,0.001649,0.001611,0.001273
495,0.001101,0.000940,0.002756,0.000911,0.001093,0.001990,0.986178,0.000241,0.000932,0.002417,0.001100,0.000940,0.000517


In [24]:
# Predicted class = column index with the highest blended confidence
test_df['label'] = test_preds_ensemble.argmax(1)

In [25]:
# Rename to the submission column names and write the CSV (relative path, no index column).
test_df = test_df.rename(columns={'image_id':"id", 'label':"y"})
test_df.to_csv(f'{NAME}_ens_weights_submission.csv', index=False)
display(test_df)

Unnamed: 0,id,y
0,0,2
1,1,11
2,2,10
3,3,3
4,4,1
...,...,...
492,492,2
493,493,2
494,494,8
495,495,6
