# Download Data And Basic Setting 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Show the working directory, then every file under the Kaggle input mount.
print(os.getcwd())

for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !kaggle competitions download -c tabular-playground-series-jun-2021
# !unzip ./tabular-playground-series-jun-2021.zip -d /kaggle/working/input/tabular_202106data

In [None]:
# Directory holding the competition CSV files; later cells join file names onto it.
file_path = '/kaggle/input/tabular-playground-series-jun-2021'
# Display the directory contents as the cell output.
os.listdir(file_path)

# Loading Data

In [None]:
import missingno as missno
import matplotlib.pyplot as plt
import seaborn as sns
from  contextlib import contextmanager
import time
from datetime import datetime
from functools import wraps
pd.set_option('display.max_rows', 200)

@contextmanager
def timer(msg):
    """Context manager that prints how long the enclosed block took.

    Args:
        msg: Label placed at the start of the printed line.

    Nothing is printed if the enclosed block raises, because the exception
    propagates out of the ``yield`` before the report line is reached.
    """
    started_at = datetime.now()
    yield
    elapsed = datetime.now() - started_at
    print(f'{msg} Done. It cost {elapsed}')

    
def clock(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and keeps its metadata via ``wraps``.
    """
    @wraps(func)
    def clocked(*args, **kwargs):
        began = datetime.now()
        result = func(*args, **kwargs)
        print(f'{func.__name__} cost {datetime.now() - began}')
        return result
    return clocked

In [None]:
# Read the train, test and sample-submission CSVs from the input directory.
train_df, test_df, sub_df = (
    pd.read_csv(os.path.join(file_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Every column whose name contains 'feature', followed by the label column.
need_columns = [*(col for col in train_df.columns if 'feature' in col), 'target']

# Data Exploration (simple)

In [None]:
## Missing values: draw missingno's matrix for the feature and target columns.
missno.matrix(train_df[need_columns], figsize=(12, 6))
plt.show()

In [None]:
## Target distribution: donut-style pie chart of the share of each target value.
plt.figure(figsize=(16,8))
# `width=0.5` in wedgeprops leaves the centre of the pie empty; `autopct` prints each share as a percentage.
train_df['target'].value_counts().plot.pie(autopct='%3.1f%%', pctdistance=0.7, wedgeprops=dict(linewidth=2,width=0.5,edgecolor='w'))
# help(train_df[need_columns].describe().T.style)

In [None]:
## data overview
def zero_rate(series_):
    """Return the fraction of entries in ``series_`` that are equal to zero."""
    is_zero = (series_ == 0)
    return np.mean(is_zero)

with timer('data_over_view'):
    # Keep only the feature columns (drops 'target' from need_columns).
    overview_columns = [i for i in need_columns if 'feat' in i]
    feature_df = train_df[overview_columns]

    # Build the per-feature summary from column-wise reductions. This avoids
    # adding and dropping a constant group-key column on the shared train_df,
    # which would leave the frame polluted if the cell failed half-way.
    exp_df = feature_df.describe().T.assign(
        nunique=feature_df.nunique(),
        max_min_range=feature_df.max() - feature_df.min(),
        zero_rate=feature_df.apply(zero_rate),
    )
    del feature_df

    # Styled table: bars for std and zero_rate, highlighted extremes for
    # nunique and zero_rate, and a green gradient for the value range.
    display(exp_df.style.format('{:.2f}',subset=['mean', 'std', 'zero_rate']).bar('std',vmin=0)\
                    .highlight_max('nunique').highlight_min('nunique')\
                    .background_gradient('Greens',subset='max_min_range')\
                    .bar('zero_rate', vmin=0, vmax=1).highlight_max('zero_rate')
           )

In [None]:
# Correlation between the features and the integer-encoded target.
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
train_df['target_encode'] = lb.fit_transform(train_df['target'])

need_columns = overview_columns + ['target_encode']

with timer('Get corr df'):
    # Correlate numeric columns only. The raw string `target` column must not
    # be passed to corr(): pandas >= 2.0 raises on non-numeric data, and older
    # versions silently dropped it. `target_encode` stays the last column.
    corr_df = train_df[need_columns].corr(method='spearman')

with timer('Get mask matrix'):
    # True marks a cell that the heatmap hides. The upper triangle (including
    # the diagonal) is always hidden because the matrix is symmetric.
    mask = np.triu(np.ones_like(corr_df.values, dtype=bool))
    abs_corr = np.abs(corr_df.values)
    # mask_small leaves only |corr| <= 0.01 visible; mask_big only |corr| >= 0.2.
    mask_small = mask | (abs_corr > 0.01)
    mask_big = mask | (abs_corr < 0.2)

In [None]:
# Two views of the same correlation matrix: first with mask_small applied,
# then with mask_big applied.
for corr_mask in (mask_small, mask_big):
    plt.figure(figsize=(16, 12))
    sns.heatmap(corr_df, mask=corr_mask)
    plt.show()

# Correlation of each feature with the encoded target, sorted ascending.
# The column is selected by name so the plot does not depend on its position.
plt.figure(figsize=(16, 8))
corr_df['target_encode'].drop('target_encode').sort_values().plot.bar()
plt.title('target & featues')
plt.show()

Feature 17 has low correlation with the target, the fewest unique values, and the highest zero rate.

We may consider dropping such features.

## test vs train

In [None]:
def class_unique_plot(df, columns, traget, axe):
    """Plot the number of unique values per column, optionally per group.

    Args:
        df: DataFrame containing ``columns`` (and ``traget`` when it is given).
        columns: List of column names whose unique-value counts are plotted.
        traget: Name of the grouping column, or an empty string to draw a
            single line for the whole frame.
        axe: Matplotlib axes to draw on.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    if len(traget) == 0:
        df[columns].nunique().plot(ax=axe)
        plt.show()
        return None

    for name, tmp_df in df[[traget] + columns].groupby(traget):
        # Count the feature columns only. Within a group the grouping column
        # has exactly one unique value and would add a spurious point.
        tmp_df[columns].nunique().plot(ax=axe, label=name)

    # Attach the legend to the axes that was drawn on, not the "current" one.
    axe.legend()
    plt.show()
    
    
# One figure each: train grouped by target, train overall, test overall.
for frame, group_col in ((train_df, 'target'), (train_df, ''), (test_df, '')):
    fig, axes = plt.subplots(figsize=(18, 4))
    class_unique_plot(frame, overview_columns, group_col, axe=axes)

In [None]:
# Same per-target unique-count plot, restricted to rows whose encoded target is 5 or 7.
fig, axes = plt.subplots(figsize=(18, 4))
class_unique_plot(train_df.loc[train_df.target_encode.isin([5, 7]), :], overview_columns, 'target', axe=axes)


In [None]:
# Unique-value counts per column for encoded target 5 ('0_x') and 7 ('0_y'),
# and their difference.
nunique_class5 = train_df.loc[train_df.target_encode.isin([5]), :].nunique()
nunique_class7 = train_df.loc[train_df.target_encode.isin([7]), :].nunique()
a = pd.DataFrame({'0_x': nunique_class5, '0_y': nunique_class7})
a['diff'] = a['0_y'] - a['0_x']
display(a.style.bar('diff',vmin=0))

# Candidate columns for frequency-encoded features.
frequnce_add_col = ['feature_60', 'feature_15', 'feature_28', 'feature_61', 'feature_62']

# Feature Generation

In [None]:
from tqdm import tqdm
@clock
def low_freqence_detector(df, vars_to_agg, agg_threshold=0.999):
    """
    Find, for each column, the low-frequency values to merge into one class.

    Values are ranked by descending frequency. Every value at which the
    cumulative share of rows has reached ``agg_threshold`` is selected; this
    includes the value that crosses the threshold.

    Args:
        df: DataFrame containing the columns in ``vars_to_agg``.
        vars_to_agg: Iterable of column names to inspect.
        agg_threshold: Cumulative-share cut-off in [0, 1].

    Returns:
        dict mapping column name -> (values_to_replace, replace_value), where
        ``replace_value`` is the minimum of ``values_to_replace``. Columns
        with no selected value (e.g. an empty column) are left out.
    """
    replace_dict = {}
    for col in tqdm(vars_to_agg):
        # One value_counts pass is enough: only the index labels are needed.
        cum_share = df[col].value_counts(normalize=True).cumsum()
        will_be_replaced_values = cum_share[cum_share >= agg_threshold].index.tolist()
        if not will_be_replaced_values:
            # Nothing to merge; avoids min() failing on an empty list.
            continue
        replace_dict[col] = (will_be_replaced_values, min(will_be_replaced_values))
    return replace_dict

def aggregate_low_freq_values(df, replace_dict):
    """Return a deep copy of ``df`` with the listed values collapsed per column.

    Args:
        df: Input DataFrame; it is not modified.
        replace_dict: Mapping column -> (values_to_replace, replace_value).

    Returns:
        A new DataFrame where, in each listed column, every entry found in
        ``values_to_replace`` is set to ``replace_value``.
    """
    df_out = df.copy(deep=True)
    for column, spec in tqdm(replace_dict.items()):
        rare_values = spec[0]
        fill_value = spec[1]
        is_rare = df[column].isin(rare_values)
        df_out.loc[is_rare, column] = fill_value
    return df_out

def quick_agg_low_freq_values(tr_df, te_df, vars_to_agg, agg_threshold=0.999):
    """Collapse low-frequency values consistently in the train and test frames.

    The replacement rules are detected on the row-wise concatenation of both
    frames (restricted to ``vars_to_agg``) and then applied to each frame.

    Returns:
        Tuple ``(train_out, test_out)`` of new DataFrames.
    """
    combined = pd.concat([tr_df[vars_to_agg], te_df[vars_to_agg]], ignore_index=True)
    replace_dict = low_freqence_detector(combined, vars_to_agg, agg_threshold)
    tr_out = aggregate_low_freq_values(tr_df, replace_dict)
    te_out = aggregate_low_freq_values(te_df, replace_dict)
    return tr_out, te_out

In [None]:
# Recompute the feature list from the current train_df columns (names containing 'feat').
overview_columns = [i for i in train_df.columns if 'feat' in i]
# Replace train_df / test_df with copies whose low-frequency values are merged
# (default agg_threshold of quick_agg_low_freq_values is used).
train_df, test_df = quick_agg_low_freq_values(
    train_df, test_df,
    overview_columns
)

# for col_ in tqdm(frequnce_add_col):
#     dict_ = train_df[col_].value_counts(normalize=True).to_dict()
#     train_df[f'{col_}_freq'] = train_df[col_].map(dict_)
#     test_df[f'{col_}_freq'] = test_df[col_].map(dict_)




# 5fold - LGB

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix
from lightgbm import LGBMClassifier


def plot_heatmap(y_true, y_pred_prob):
    """Draw a row-normalised confusion matrix as an annotated heatmap.

    Args:
        y_true: True class labels.
        y_pred_prob: 2-D array of predicted class probabilities; the predicted
            label of each row is its argmax along axis 1.

    Each row of the plotted matrix sums to 1 (rows that sum to zero are left
    as zeros).
    """
    y_pred = np.argmax(y_pred_prob, axis=1)
    conf = confusion_matrix(y_true, y_pred)
    # keepdims=True keeps the totals as a column vector so every ROW is divided
    # by its own total. A flat vector would broadcast along columns instead.
    row_totals = conf.sum(axis=1, keepdims=True)
    # Guard rows without any true samples against 0/0.
    conf = conf / np.where(row_totals == 0, 1, row_totals)
    sns.heatmap(conf, annot=True, fmt='.2f')
    plt.show()

@clock
def cross_val(X, y, model, params, folds=5):
    """Train one model per stratified fold and report its validation log loss.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Label Series aligned with ``X``.
        model: Estimator class; instantiated as ``model(**params)`` per fold.
        params: Keyword arguments for the estimator constructor.
        folds: Number of stratified splits.

    Returns:
        List of ``(fitted_model, validation_log_loss)`` tuples, one per fold.

    The held-out fold is used both as the early-stopping eval set and for the
    reported log loss. A confusion heatmap is drawn for every fold.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    fitted = []
    for fold, (fit_idx, val_idx) in enumerate(splitter.split(X, y)):
        print(f"Fold: {fold}")
        x_fit, x_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        estimator = model(**params)
        estimator.fit(
            x_fit, y_fit,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=100,
            verbose=50,
        )

        val_proba = estimator.predict_proba(x_val)
        fold_loss = log_loss(y_val, val_proba)
        fitted.append((estimator, fold_loss))
        plot_heatmap(y_val, val_proba)
        print(f"Log loss: {fold_loss}")
        print("-"*50)
    return fitted

In [None]:
# lgb_params = {
#     'n_estimators': 1500, 
#      'max_depth': 8, 
#      'num_leaves': 25, 
#      'learning_rate': 0.05, 
#      'reg_lambda': 28,
#      'subsample': 0.85, 
#      'colsample_bytree': 0.75, 
#       'n_jobs':-1
# }
# def f1_metric(preds, train_data):
#     labels = train_data.get_label()
#     return 'f1', f1_score(labels, preds, average='marco'), True

# lgb_params = { # 1.75060
#         'num_leaves': 14,
#         'min_data_in_leaf': 52,
#         'learning_rate': 0.04,
#         'min_sum_hessian_in_leaf': 10.090025079493055,
#         'bagging_fraction': 1.0,
#         'bagging_freq': 5,
#         'boost_from_average':'false',
# #         'subsample': 0.6798695128633439,
#         'colsample_bytree': 0.7727780074568463,
#         'reg_alpha': 0.45606998421703593,
#         'reg_lambda': 78.51759613930136,
#         'min_gain_to_split': 0.13949386065204183,
#         'max_depth': 6, 
#         'n_jobs': -1,
#         'boosting_type': 'gbdt',
#         'metric':'multi_logloss',
#         'early_stopping_round' : 100,
#         'n_estimators': 500,
#         'tree_learner': 'serial',
#     }

# Hyperparameters passed to LGBMClassifier(**lgb_params) inside cross_val.
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction`;
# both are set below with different values (1.0 vs ~0.75) — confirm which one
# is intended and drop the other.
# NOTE(review): 'early_stopping_round' here duplicates the
# `early_stopping_rounds=100` that cross_val already passes to fit().
lgb_params = {
        'num_leaves': 10,
        'min_data_in_leaf': 63,
        'learning_rate': 0.05,
        'min_sum_hessian_in_leaf': 8.140308692805194,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'boost_from_average':'false',
        'subsample': 0.749948437333368,
        'colsample_bytree': 0.6168504947710284,
         'reg_alpha': 0.227796749807186,
         'reg_lambda': 70.2792417704872,
        'min_gain_to_split': 0.4758826409257615,
        'max_depth': 14, 
        'n_jobs': -1,
        'boosting_type': 'gbdt',
        'metric':'multi_logloss',
        'early_stopping_round' : 100,
        'n_estimators': 500,
        'tree_learner': 'serial',
    }


# Features are the (low-frequency-aggregated) feature columns; the label is
# the integer-encoded target.
X = train_df[overview_columns]
y = train_df['target_encode']

# List of (fitted model, validation log loss), one entry per fold.
lgb_models = cross_val(X, y, LGBMClassifier, lgb_params)

# Submit

In [None]:
# os.makedirs('./submit_data')
# Only the last expression (the working directory) is displayed; the listdir result is discarded.
os.listdir(); os.getcwd()

In [None]:
def models_merge(models_loss, pred_df):
    """Blend model probabilities, weighting each model by 1 / its loss.

    Args:
        models_loss: Iterable of ``(model, loss)`` pairs; each model must
            expose ``predict_proba``.
        pred_df: Data passed to every model's ``predict_proba``.

    Returns:
        The weighted average of the predicted probabilities, with weights
        ``1 / loss`` normalised to sum to one.

    Raises:
        ValueError: If ``models_loss`` yields no models.
    """
    weighted_sum = None
    weight_total = 0.0
    for model, loss in tqdm(models_loss, desc='MODEL PREDICT：'):
        weight = 1 / loss
        contribution = model.predict_proba(pred_df) * weight
        weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
        weight_total += weight

    if weighted_sum is None:
        # Previously this surfaced as an UnboundLocalError on the return line.
        raise ValueError('models_merge needs at least one (model, loss) pair')
    return weighted_sum / weight_total

In [None]:
# Blend the fold models' test predictions and write a timestamped submission CSV.
# NOTE(review): `lgb_models[1:]` leaves the first fold's model out of the
# blend — confirm this is intentional.
with timer('submit data'):
    pred = models_merge(lgb_models[1:], test_df[overview_columns])
    # Timestamp (to the minute) used in the output file name.
    now_ = datetime.now().strftime('%Y%m%d_%H_%M')
    # Clip probabilities away from exactly 0 and 1 before writing them into the class columns.
    sub_df.loc[:, ['Class_1','Class_2', 'Class_3', 'Class_4','Class_5','Class_6', 'Class_7', 'Class_8', 'Class_9']] = np.clip(pred, 10**-15, 1-10**-15)
    display(sub_df.head())
    sub_df.to_csv(f'lgb_model_{now_}.csv',index=False)

In [None]:
# Display the working-directory contents, its path, and the timestamp used for the submission file.
os.listdir(), os.getcwd(), now_

In [None]:
# !rm ./submit_data/lgb_model_20210607_20_03.csv

In [None]:
# os.environ['KAGGLE_USERNAME'] = "scchuy" # username from the json file 
# os.environ['KAGGLE_KEY'] = ".." #
# !kaggle competitions submit -c tabular-playground-series-jun-2021 -f ./lgb_model_20210609_18_18.csv -m "Message"