In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
from functools import wraps
from contextlib import contextmanager
from tqdm import tqdm


@contextmanager
def timer(msg):
    """Context manager that reports how long the wrapped block took.

    Args:
        msg: Label printed in front of the timing line.

    The elapsed wall-clock time (a ``timedelta``) is printed once the block
    finishes normally.
    """
    started_at = datetime.now()
    yield
    elapsed = datetime.now() - started_at
    print(f'{msg} Done. It cost {elapsed}')


def clock(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function keeps its name and docstring (via ``functools.wraps``)
    and its return value is passed through unchanged.
    """
    @wraps(func)
    def clocked(*args, **kwargs):
        began = datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.now() - began
        print(f'{func.__name__} cost {elapsed}')
        return result

    return clocked


import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# reference: https://www.kaggle.com/fusioncenter/residual-network-for-tabular-data

# Load data

In [None]:
# Read the competition's train / test / sample-submission CSVs.
# The timer label names the step being timed (loading, not de-duplication).
with timer('load data'):
    train_df = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/train.csv')
    test_df = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/test.csv')
    submit_df = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv')

## Drop duplicate rows

In [None]:
# Rows are considered duplicates when every feature column AND the target match
# (the `id` column is deliberately left out of the comparison).
need_columns = [col for col in train_df.columns if 'feat' in col]
need_columns.append('target')
with timer('drop duplicate'):
    print('Before drop duplicate train_df.shape:', train_df.shape)
    train_df = train_df.drop_duplicates(subset=need_columns)
    train_df = train_df.reset_index(drop=True)
    print('After drop duplicate train_df.shape:', train_df.shape)

## Aggregate low-frequency categories

In [None]:
@clock
def low_freqence_detector(df, vars_to_agg, agg_threshold=0.999):
    """Find, per column, the low-frequency values to merge into a single value.

    For each column the distinct values are ordered by descending frequency
    and their cumulative share of rows is computed. Every value whose
    cumulative share is >= ``agg_threshold`` (i.e. the rare tail) is marked
    for replacement by the smallest value of that tail.

    Args:
        df: DataFrame containing the columns listed in ``vars_to_agg``.
        vars_to_agg: Iterable of column names to analyse.
        agg_threshold: Cumulative-frequency cut-off at which the tail starts.

    Returns:
        dict mapping column name -> ``(values_to_replace, replacement_value)``.
        Columns with no value at or beyond the threshold (e.g. an empty
        column) are omitted, so no replacement is applied to them.
    """
    replace_dict = {}
    for col in tqdm(vars_to_agg):
        # Count once; the normalised cumulative share is derived from the same
        # counts so both series share index and ordering.
        value_count_series = df[col].value_counts()
        total = value_count_series.sum()
        if total == 0:
            continue
        cumulative_share = (value_count_series / total).cumsum()
        will_be_replaced_values = value_count_series[
            cumulative_share >= agg_threshold
        ].index.tolist()
        if not will_be_replaced_values:
            # Nothing falls in the tail: min() on an empty list would raise.
            continue
        replace_dict[col] = (will_be_replaced_values, min(will_be_replaced_values))
    return replace_dict

def aggregate_low_freq_values(df, replace_dict):
    """Return a copy of ``df`` with rare values collapsed per ``replace_dict``.

    Args:
        df: Input DataFrame; it is not modified.
        replace_dict: Mapping column -> ``(values_to_replace, replacement)``.

    Returns:
        A deep copy of ``df`` in which, for every column in ``replace_dict``,
        each listed value has been overwritten with the replacement value.
    """
    merged = df.copy(deep=True)
    for feat in tqdm(replace_dict):
        spec = replace_dict[feat]
        rare_values = spec[0]
        target_value = spec[1]
        # The mask is built from the untouched input frame.
        is_rare = df[feat].isin(rare_values)
        merged.loc[is_rare, feat] = target_value
    return merged

def quick_agg_low_freq_values(tr_df, te_df, vars_to_agg, agg_threshold=0.999):
    """Collapse rare values consistently in the train and test frames.

    The value frequencies are measured on train and test stacked together, so
    both frames receive exactly the same replacement mapping.

    Returns:
        ``(train_out, test_out)`` — new DataFrames; the inputs are untouched.
    """
    stacked = pd.concat(
        [tr_df[vars_to_agg], te_df[vars_to_agg]],
        ignore_index=True,
    )
    mapping = low_freqence_detector(stacked, vars_to_agg, agg_threshold)
    train_out = aggregate_low_freq_values(tr_df, mapping)
    test_out = aggregate_low_freq_values(te_df, mapping)
    return train_out, test_out

In [None]:
# Collapse the rare tail of every feature column (default threshold) in both frames.
feature_columns = [name for name in train_df.columns if 'feat' in name]
train_df, test_df = quick_agg_low_freq_values(train_df, test_df, feature_columns)

In [None]:
# # Add frequency-encoded features
# frequnce_add_col = ['feature_60', 'feature_15', 'feature_28', 'feature_61', 'feature_62']
# for col_ in tqdm(frequnce_add_col):
#     dict_ = train_df[col_].value_counts(normalize=True).to_dict()
#     train_df[f'{col_}_freq'] = train_df[col_].map(dict_)
#     test_df[f'{col_}_freq'] = test_df[col_].map(dict_)

# residual net model
## residual blocks

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Input, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import concatenate, Dropout, Flatten, BatchNormalization
from tensorflow.keras import Model, backend, layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, Adamax
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
def tf2_logloss(obs, pre):
    """Categorical cross-entropy computed on clipped predictions.

    Predictions are clipped to ``[0.0005, 1 - 0.0005]`` before the loss is
    evaluated, which bounds the penalty for any single prediction.

    Args:
        obs: Target tensor passed as the first argument of the cross-entropy.
        pre: Predicted tensor; clipped before use.
    """
    eps = 0.0005
    clipped = tf.clip_by_value(pre, eps, 1-eps)
    return tf.keras.metrics.categorical_crossentropy(obs, clipped)

def plot_heatmap(y_true, y_pred_prob):
    """Plot column- and row-normalised confusion matrices side by side.

    Args:
        y_true: Integer class labels.
        y_pred_prob: 2-D array of per-class scores; the predicted class is the
            arg-max along axis 1.

    The left panel divides each confusion-matrix column by its total (titled
    "Precision"); the right panel divides each row by its total ("Recall").
    Columns/rows whose total is zero are shown as 0 instead of NaN.
    """
    y_pred = np.argmax(y_pred_prob, axis=1)
    conf = confusion_matrix(y_true, y_pred).astype(float)

    col_totals = conf.sum(axis=0, keepdims=True)
    row_totals = conf.sum(axis=1, keepdims=True)
    # Guarded division: a class that is never predicted (or never present)
    # has a zero total and would otherwise produce NaN plus a runtime warning.
    conf_p = np.divide(conf, col_totals, out=np.zeros_like(conf), where=col_totals != 0)
    conf_r = np.divide(conf, row_totals, out=np.zeros_like(conf), where=row_totals != 0)

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    sns.heatmap(conf_p, annot=True, fmt='.2f', ax=axes[0])
    axes[0].set_title('Precision')
    sns.heatmap(conf_r, annot=True, fmt='.2f', ax=axes[1])
    axes[1].set_title('Recall')
    plt.show()


def embedding_residul_block(
    max_cnt, 
    embed_size=3, 
    feature_nums=75,
    max_len=1, 
    number_of_blocks=4,
    output_shape=9
):
    """Build and compile the dense network with embedding skip connections.

    Architecture, as constructed below:
      * the input is embedded (``Embedding(max_cnt, embed_size)``) and flattened;
      * block 0 = BatchNorm -> Dropout(0.2) -> Dense(64, relu) on the raw input,
        concatenated with the flattened embedding;
      * every later block = BatchNorm -> Dropout(0.2) -> Dense(128, relu) on the
        previous block, concatenated with block 0 (the skip connection);
      * head = BatchNorm -> Dense(output_shape, softmax).

    Args:
        max_cnt: Vocabulary size (``input_dim``) of the embedding layer.
        embed_size: Embedding dimension per input position.
        feature_nums: Number of input features (width of the input layer).
        max_len: Forwarded to ``Embedding(input_length=...)``.
        number_of_blocks: Total number of blocks, including block 0.
        output_shape: Number of output classes.

    Returns:
        A compiled ``tf.keras.Model`` (categorical cross-entropy, Adam with
        learning rate 0.01, accuracy metric).
    """
    # NOTE(review): the same tensor feeds both the embedding lookup and the
    # dense path; presumably every feature is a non-negative integer count
    # below `max_cnt` -- confirm against the data.
    _input = Input(shape=(feature_nums,), dtype='float32')
    _embed = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=False)(_input)
    _embed = Flatten()(_embed)
    
    block_list = []
    for i in range(number_of_blocks):
        if i == 0:
            blocki = BatchNormalization()(_input)
            blocki = Dropout(0.2)(blocki)
            blocki = Dense(64, activation='relu')(blocki)
            blocki = concatenate([blocki, _embed])
            block_list.append(blocki)
            continue

        blocki_bf = block_list[i-1]
        blocki_next = BatchNormalization()(blocki_bf)
        blocki_next = Dropout(0.2)(blocki_next)
        blocki_next = Dense(128, activation='relu')(blocki_next)
        # Skip connection: every block is re-joined with block 0.
        blocki_next = concatenate([blocki_next, block_list[0]])
        block_list.append(blocki_next)

    _output = block_list[-1]
    _output = BatchNormalization()(_output)
    _output = Dense(output_shape, activation='softmax')(_output)

    model = Model(inputs=_input, outputs = _output)
    
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    model.compile(loss = 'categorical_crossentropy',
                  optimizer=Adam(learning_rate = 0.01),
                  metrics=['accuracy']) #[tf2_logloss]) #
    return model

## model train

In [None]:
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import f1_score, log_loss
from copy import deepcopy


# Training callbacks: stop early on a stalled validation loss, keep only the
# best weights on disk, and shrink the learning rate on a plateau.
model_save_path = 'residual_nn.hdf5'
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.00001,
    patience=10,
)
model_checkpoint = ModelCheckpoint(model_save_path, save_weights_only=True, save_best_only=True)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=5,
    factor=0.01,
)

def pred_57_f1(y_true_in, y_pred_proba_in):
    """Score how well classes 5 and 7 are separated after damping weak scores.

    Works on deep copies of both inputs. For columns 5 and 7, every probability
    at or below that column's 65th percentile is overwritten with 0.0001. The
    arg-max prediction and the true labels are then reduced to {0, 5, 7}
    (everything that is neither 5 nor 7 becomes 0).

    Returns:
        ``(macro_f1, adjusted_probabilities, log_loss)`` where the log loss is
        computed from the original labels and the adjusted probabilities.
    """
    y_true = deepcopy(y_true_in)
    y_pred_proba = deepcopy(y_pred_proba_in)

    # Each cut-off depends only on its own column, so the columns can be
    # processed one after the other.
    for cls in (5, 7):
        cutoff = np.percentile(y_pred_proba[:, cls], 65)
        y_pred_proba[y_pred_proba[:, cls] <= cutoff, cls] = 0.0001

    y_pred = np.argmax(y_pred_proba, axis=1)
    pred_is_other = (y_pred != 5) & (y_pred != 7)
    y_pred[pred_is_other] = 0

    true_is_other = (y_true != 5) & (y_true != 7)
    y_true[true_is_other] = 0

    macro_f1 = f1_score(y_true, y_pred, average='macro')
    loss = log_loss(y_true_in, y_pred_proba)
    return macro_f1, y_pred_proba, loss


In [None]:
# List the devices TensorFlow can see and pin CUDA to device 0 when a GPU exists.
physical_devices = tf.config.list_physical_devices()
print(physical_devices)
for device in physical_devices:
    if 'GPU' in device.device_type:
        print (device.device_type,'可用,GPU名称: ',device.name)
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        print ('Turn on GPU')

In [None]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.isotonic import IsotonicRegression
# Multi-seed, stratified K-fold training of the residual network.
# Every (seed, fold) model contributes one test-set prediction to `pred_arr_list`.
lb = LabelEncoder()
X = train_df.drop(columns = ['id', 'target']).values
y = lb.fit_transform(train_df['target'].values)
test_array = test_df.drop(columns = ['id']).values
nfold = 5
epochs = 50
output_shape = 9
pred_arr_list = []
for seed_ in [2021, 42, 1921]:
    tf.random.set_seed(seed_)
    kf = StratifiedKFold(nfold, shuffle=True, random_state=seed_)

    for foldi, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Fold: {foldi} | seed{seed_}")
        tr_x, tr_y = X[tr_idx], y[tr_idx]
        val_x, val_y = X[val_idx], y[val_idx]
        # One-hot validation targets are reused by fit() and by the calibration loop.
        val_y_onehot = to_categorical(val_y, output_shape)
        nn_model = embedding_residul_block(
            max_cnt=500,
            embed_size=3,
            feature_nums=tr_x.shape[1],
            max_len=1,
            number_of_blocks=4,
            output_shape=output_shape
        )
        # Fresh callbacks for every fold: a ModelCheckpoint with
        # save_best_only=True remembers its best score for the lifetime of the
        # object, so a shared instance may never overwrite the file in a later
        # fold and load_weights() would then restore another fold's weights.
        fold_callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, min_delta=0.00001),
            ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=5),
        ]
        nn_model.fit(tr_x, to_categorical(tr_y, output_shape),
                     validation_data = (val_x, val_y_onehot),
                     epochs = epochs,
                     batch_size = 128,
                     shuffle=True,
                     callbacks = fold_callbacks
                    )
        # Restore the best weights seen during this fold.
        nn_model.load_weights(model_save_path)
        pred = nn_model.predict(val_x)

        # Per-class isotonic calibration, fitted and evaluated on the same
        # validation fold (diagnostic only; it is not applied to the test set).
        pred_cp = deepcopy(pred)
        for i in range(pred.shape[1]):
            ir = IsotonicRegression()
            ir.fit(pred[:, i], val_y_onehot[:, i])
            pred_cp[:, i] = ir.predict(pred[:, i])

        ir_loss = log_loss(val_y, pred_cp )
        loss_o = log_loss(val_y, pred )
        print(f'original loss: {loss_o:.3f}, ir_loss:{ir_loss:.3f}')
        plot_heatmap(val_y, pred_cp)

        pred_out = nn_model.predict(test_array)
        if foldi == 0:
            # Copy: the running sum must not share memory with the array that
            # is stored in pred_arr_list, otherwise `+=` below corrupts it.
            pred_array = pred_out.copy()
        else:
            pred_array += pred_out
        pred_arr_list.append(pred_out)
        print("-"*50)

# Model Merge

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import minmax_scaling

def make_mi_scores(X, y, discrete_features):
    """Estimate the mutual information between each feature and a discrete target.

    Source: https://www.kaggle.com/mehrankazeminia/1-tps-jun-21-histgradient-catboost-nn/output

    Args:
        X: Feature DataFrame (its column names become the result index).
        y: Target labels.
        discrete_features: Forwarded to ``mutual_info_classif``.

    Returns:
        ``pd.Series`` named "MI Scores", sorted from highest to lowest score.
    """
    raw_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=123
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    Args:
        scores: ``pd.Series`` of scores indexed by feature name. Bars are drawn
            in ascending score order, so the largest score ends up at the top.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

## Model0 - tabular-residual-nn

In [None]:
# Average the test-set predictions of all (seed, fold) models.
# np.mean builds a new array, so the entries of pred_arr_list are left intact,
# and the divisor is exactly the number of arrays (the previous hand-rolled
# counter started at 1 and therefore divided by len(pred_arr_list) + 1).
pred_f = np.mean(np.stack(pred_arr_list), axis=0)

class_columns = [f'Class_{k}' for k in range(1, 10)]
submit_df0 = submit_df.copy(deep=True)
# Clip away exact 0/1 so the log-loss of the submission stays finite.
submit_df0.loc[:, class_columns] = np.clip(pred_f, 10**-15, 1-10**-15)

## Model1- LGBMClassifier

In [None]:
from lightgbm import LGBMClassifier
# LightGBM baseline trained on the full training set.
# NOTE(review): `bagging_fraction` and `subsample` are aliases of the same
# LightGBM parameter; only one of the two values is expected to take effect --
# confirm which one is intended.
lgb_params = {
        'num_leaves': 10,
        'min_data_in_leaf': 63,
        'learning_rate': 0.05,
        'min_sum_hessian_in_leaf': 8.140308692805194,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'boost_from_average':'false',
        'subsample': 0.749948437333368,
        'colsample_bytree': 0.6168504947710284,
         'reg_alpha': 0.227796749807186,
         'reg_lambda': 70.2792417704872,
        'min_gain_to_split': 0.4758826409257615,
        'max_depth': 14, 
        'n_jobs': -1,
        'boosting_type': 'gbdt',
        'metric':'multi_logloss',
#         'early_stopping_round' : 100,
        'n_estimators': 500,
        'tree_learner': 'serial',
    }
model1 = LGBMClassifier(**lgb_params)

# No `verbose=` here: that fit() keyword only controlled evaluation logging
# (there is no eval_set, so it printed nothing) and LightGBM >= 4.0 rejects it.
model1.fit(X, y)
pred1 = model1.predict_proba(test_array)
submit_df1 = submit_df.copy(deep=True)
# Clip away exact 0/1 so the log-loss of the submission stays finite.
submit_df1.loc[:, ['Class_1','Class_2', 'Class_3', 'Class_4','Class_5','Class_6', 'Class_7', 'Class_8', 'Class_9']] =\
    np.clip(pred1, 10**-15, 1-10**-15)

## Model2 - CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
# CatBoost baseline trained on the full training set (GPU task type).
model2 = CatBoostClassifier(
    depth=8,
    iterations=1000,
    learning_rate=0.02,
    eval_metric='MultiClass',
    loss_function='MultiClass',
    bootstrap_type='Bernoulli',
    leaf_estimation_method='Gradient',
    random_state=123,
    task_type='GPU',
)

model2.fit(X, y, verbose=100)
pred2 = model2.predict_proba(test_array)

# Write the clipped class probabilities into a copy of the sample submission.
submit_df2 = submit_df.copy(deep=True)
pred2_clipped = np.clip(pred2, 10**-15, 1-10**-15)
submit_df2.loc[
    :,
    ['Class_1','Class_2', 'Class_3', 'Class_4','Class_5','Class_6', 'Class_7', 'Class_8', 'Class_9'],
] = pred2_clipped

In [None]:
def generate(main, support, coeff):
    """Blend two submission frames column by column.

    Every column except the first one is replaced by
    ``main * coeff + support * (1 - coeff)``. Rows are paired by position
    (not by index label), exactly like an element-by-element loop over the
    two columns would do.

    Args:
        main: DataFrame whose first column is kept as-is (e.g. the id column).
        support: DataFrame providing the same columns as ``main``; only its
            first ``len(main)`` rows are used.
        coeff: Weight of ``main``; ``support`` receives ``1 - coeff``.

    Returns:
        A new DataFrame shaped like ``main``; neither input is modified.

    Raises:
        IndexError: if ``support`` has fewer rows than ``main``.
    """
    blended = main.copy()
    n_rows = len(main)
    for col in main.columns[1:]:
        # float64 on purpose: matches plain Python float arithmetic regardless
        # of the columns' storage dtype.
        main_vals = main[col].to_numpy(dtype=float)
        support_vals = support[col].to_numpy(dtype=float)[:n_rows]
        if len(support_vals) != n_rows:
            raise IndexError("support has fewer rows than main")
        blended[col] = (main_vals * coeff) + (support_vals * (1. - coeff))
    return blended


# Two-stage blend: 85% LightGBM + 15% CatBoost first, then 85% neural net +
# 15% of that tree blend.
tree_blend = generate(submit_df1, submit_df2, 0.85)
sub = generate(submit_df0, tree_blend, 0.85)

# Stacking

In [None]:
class SklearnHelper:
    """Uniform wrapper around a scikit-learn-style classifier class.

    Args:
        clf: Estimator class (not an instance); it is called as ``clf(**params)``.
        seed: Value injected as the ``random_state`` constructor argument,
            overriding any ``random_state`` present in ``params``.
        params: Optional dict of constructor keyword arguments. It is copied,
            so the caller's dict is never modified.
    """

    def __init__(self, clf, seed, params=None):
        # Copy (and tolerate None) so that the seed is not written back into a
        # dict the caller may reuse for another model.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_tr, y_tr):
        """Fit the wrapped estimator on the given data."""
        self.clf.fit(x_tr, y_tr)
    
    def predict(self, x):
        """Return class probabilities when available, otherwise hard predictions."""
        # Explicit capability check: an AttributeError raised *inside* a real
        # predict_proba implementation is no longer mistaken for "unsupported".
        if hasattr(self.clf, 'predict_proba'):
            return self.clf.predict_proba(x)
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        """Return the wrapped estimator's class probabilities."""
        return self.clf.predict_proba(x)
    
    def feature_importances(self,x,y):
        """Refit the estimator on ``(x, y)`` and print its feature importances."""
        print(self.clf.fit(x,y).feature_importances_)
    
    def __repr__(self):
        return str(self.clf)

        
from sklearn.model_selection import KFold
# Stacking configuration: fold count, model seed and the shared K-fold splitter.
NFOLDS = 5
SEED = 2021
ntrain, ntest = train_df.shape[0], test_df.shape[0]
kf = KFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=42,
)

def get_oof(clf, x_train, y_train, x_test, class_nums):
    """Compute out-of-fold predictions for stacking.

    The model is refitted on every training split of the module-level ``kf``
    splitter. Predictions for each held-out split fill the matching rows of
    the out-of-fold matrix; test-set predictions are averaged over the folds.

    Args:
        clf: Wrapper exposing ``train(x, y)`` and ``predict(x)``; ``predict``
            must return an array of shape ``(n_rows, class_nums)``.
        x_train: Training feature array (indexable by integer index arrays).
        y_train: Training labels, aligned with ``x_train``.
        x_test: Test feature array.
        class_nums: Number of prediction columns.

    Returns:
        ``(oof_train, oof_test)`` with shapes ``(len(x_train), class_nums)``
        and ``(len(x_test), class_nums)``.
    """
    # Sizes come from the arguments and the splitter itself rather than from
    # unrelated module globals, so the function works for any input.
    n_splits = kf.get_n_splits()
    oof_train = np.zeros((x_train.shape[0], class_nums))
    oof_test_sum = np.zeros((x_test.shape[0], class_nums))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]

        clf.train(x_tr, y_tr)
        
        pred = clf.predict(x_te)
        oof_train[test_index] = pred
        oof_test_sum += clf.predict(x_test)
        
        loss = log_loss(y_te, pred)
        print(f"{str(clf)} | fold {i} | Log loss: {loss}")
        print("-"*50)

    oof_test = oof_test_sum / n_splits
    return oof_train, oof_test

## Stacking models

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
# lgb
# LightGBM hyper-parameters for the stacking stage (silent logging).
lgb_params = dict(
    num_leaves=10,
    min_data_in_leaf=63,
    learning_rate=0.05,
    min_sum_hessian_in_leaf=8.140308692805194,
    bagging_fraction=1.0,
    bagging_freq=5,
    boost_from_average='false',
    subsample=0.749948437333368,
    colsample_bytree=0.6168504947710284,
    reg_alpha=0.227796749807186,
    reg_lambda=70.2792417704872,
    min_gain_to_split=0.4758826409257615,
    max_depth=14,
    n_jobs=-1,
    boosting_type='gbdt',
    metric='multi_logloss',
    # early_stopping_round=100,
    n_estimators=500,
    tree_learner='serial',
    verbose=0,
)

# Extra Trees Parameters
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    # max_features=0.5,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)


# Random Forest parameters
# `warm_start` is intentionally NOT enabled: the same estimator object is
# refitted on every CV fold, and with warm_start=True and an unchanged
# n_estimators scikit-learn keeps the trees from the first fit instead of
# training new ones, which would leak training rows into the out-of-fold
# predictions.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}


# AdaBoost parameters
ada_params = dict(n_estimators=500, learning_rate=0.75)


# catboost
catboost_param = {
    'depth': 8,
    'iterations': 1000,
    'learning_rate': 0.02,
    'eval_metric': 'MultiClass',
    'loss_function': 'MultiClass',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Gradient',
    'random_state': 123,
    'task_type': 'GPU',
}


In [None]:
# One wrapper per base learner; all of them share the same seed.
rf = SklearnHelper(
    clf=RandomForestClassifier, params=rf_params, seed=SEED,
)
et = SklearnHelper(
    clf=ExtraTreesClassifier, params=et_params, seed=SEED,
)
ada = SklearnHelper(
    clf=AdaBoostClassifier, params=ada_params, seed=SEED,
)
gbm = SklearnHelper(
    clf=LGBMClassifier, params=lgb_params, seed=SEED,
)
ctb = SklearnHelper(
    clf=CatBoostClassifier, params=catboost_param, seed=SEED,
)

In [None]:

# Out-of-fold / test predictions of every base learner (9 classes each).
# Random Forest
rf_oof_train, rf_oof_test = get_oof(rf, X, y, test_array, 9)
# AdaBoost (disabled)
# ada_oof_train, ada_oof_test = get_oof(ada, X, y, test_array, class_nums=9)
# LightGBM
lgb_oof_train, lgb_oof_test = get_oof(gbm, X, y, test_array, 9)
# Extra Trees
et_oof_train, et_oof_test = get_oof(et, X, y, test_array, 9)
# CatBoost
ctb_oof_train, ctb_oof_test = get_oof(ctb, X, y, test_array, 9)

# Focus
Class_6 (label 5) vs. Class_8 (label 7)


In [None]:
# #### view
# import scipy.stats as sts
# view_foucs5_bool = (train_df.target == 'Class_6') 
# view_foucs7_bool =(train_df.target == 'Class_8')

# skew_5_7_diff_beyond_05 = []
# for i in [i for i in train_df.columns if 'feat' in i]:
#     print(i)
#     train_df.loc[view_foucs5_bool & (~train_df[i].isin([0, 1, 2, 3])), i].hist()
#     skew_5 = sts.skew(train_df.loc[view_foucs5_bool & (~train_df[i].isin([0, 1, 2, 3])), i])
#     plt.title(f'{i}&class_5 total_mean: {train_df.loc[view_foucs5_bool, i].mean():.3f} \
#     | limit> 3 mean: {train_df.loc[view_foucs5_bool & (~train_df[i].isin([0, 1, 2, 3])), i].mean():.3f} | \
#     skew : {skew_5:.3f}')
#     plt.show()
#     train_df.loc[view_foucs7_bool & (~train_df[i].isin([0, 1, 2, 3])), i].hist()
#     skew_7 = sts.skew(train_df.loc[view_foucs7_bool & (~train_df[i].isin([0, 1, 2, 3])), i])
#     plt.title(f'{i}&class_7 total_mean: {train_df.loc[view_foucs7_bool, i].mean():.3f} \
#     | limit> 3 mean: {train_df.loc[view_foucs7_bool & (~train_df[i].isin([0, 1, 2, 3])), i].mean():.3f}| \
#     skew : {skew_7:.3f}')

#     plt.show()
#     if abs(skew_7-skew_5) > 0.5 :
#         print(f'abs(skew_7-skew_5) > 0.5: {abs(skew_7-skew_5):.2f}')
#         skew_5_7_diff_beyond_05.append(i)
#     print('--'*25)

In [None]:
# skew_5_7_diff_beyond_05
# # nd_cols = [i for i in train_df.columns if 'feat' in i]

In [None]:
# from sklearn.preprocessing import LabelEncoder
# from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
# import matplotlib.pyplot as plt

# lb = LabelEncoder()
# X = train_df.loc[view_foucs5_bool | view_foucs7_bool, nd_cols].values
# y = lb.fit_transform(train_df.loc[view_foucs5_bool | view_foucs7_bool, 'target'].values)
# test_array = train_df.loc[view_foucs5_bool | view_foucs7_bool, nd_cols].values
# nfold = 5
# epochs = 50
# output_shape = 2
# kf = StratifiedKFold(nfold)
# for foldi, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
#     print(f"Fold: {foldi}")
#     tr_x, tr_y = X[tr_idx], y[tr_idx]
#     val_x, val_y = X[val_idx], y[val_idx]
#     lr = LGBMClassifier(is_unbalance=True)
#     lr.fit(tr_x, tr_y)
#     pred = lr.predict_proba(val_x)
#     plot_heatmap(val_y, pred)
#     if foldi == 0:
#         pred_array = lr.predict_proba(test_array)
#     else:
#         pred_array += lr.predict_proba(test_array)
#     print("-"*50)

# Submit

In [None]:
from datetime import datetime
# Timestamp (minute resolution) used to give the submission file a unique name.
now_ = f'{datetime.now():%Y%m%d_%H_%M}'
# submit_df.loc[:, ['Class_1','Class_2', 'Class_3', 'Class_4','Class_5','Class_6', 'Class_7', 'Class_8', 'Class_9']] =\
#     np.clip(pred_f, 10**-15, 1-10**-15)


# submit_df = submit_df.fillna(0.0001)
submission_path = f'residul_nn_model_{now_}.csv'
display(submit_df0.head())
submit_df0.to_csv(submission_path, index=False)

In [None]:
os.environ['KAGGLE_USERNAME'] = "scchuy" # username from the json file 
os.environ['KAGGLE_KEY'] = "59c271f7739fbc0d21b8d2c5f8789670"
!kaggle competitions submit -c tabular-playground-series-jun-2021 -f ./residul_nn_model_{now_}.csv -m "Message"