# LGB Parameter_SimpleVersion <section id="section_top" />

- [Loading Libraries](#section_LL)
- [Defining Loss](#section_DL)
- [Extracting Useful Features](#section_EUF)
- [Parameter Tuning](#section_pt)
- [Examination](#section_ex)
- [Convergence Plot](#section_CPlot)
- [Training LGB Classifier with tuned Parameters](#section_train)
- [Making Predictions](#section_pred)

---------------------


This kernel was created with reference to the following.  

ref.    
https://www.kaggle.com/meaninglesslives/lgb-parameter-tuning  
https://www.kaggle.com/ogrellier/plasticc-in-a-kernel-meta-and-data  
https://www.kaggle.com/ashishpatel26/can-this-make-sense-of-the-universe-tuned  

- In this kernel, passband and object_id are used as the grouping key.  
- aggs: all aggregations except those on flux and flux_err were removed.  
- etc...


# Loading Libraries  <section id="section_LL" />

[return](#section_top)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.utils import use_named_args
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import time
# Wall-clock reference used to report the total notebook runtime at the end.
notebookstart = time.time()
# Allow pandas to display up to 101 rows before truncating a frame.
pd.options.display.max_rows = 101
# Data-inspection flag; nothing in the visible notebook reads it.
isDataCheck = False

# Defining Loss <section id="section_DL" />

[return](#section_top)

In [None]:
def lgb_multi_weighted_logloss(y_true, y_preds):#use by eval_metric
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Custom evaluation metric in the form expected by LightGBM's
    ``eval_metric`` argument.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. Every expected class must occur at least once: the
        one-hot columns are matched to the class weights by sorted position,
        not by label value.
    y_preds : numpy.ndarray
        Predicted probabilities holding ``n_samples * n_classes`` values,
        reshaped here in column-major (class-major) order.

    Returns
    -------
    tuple
        ``('wloss', loss, False)``: metric name, weighted log loss, and the
        "higher is better" flag (False, the loss is minimised).

    Raises
    ------
    ValueError
        If the number of distinct labels in ``y_true`` differs from the number
        of expected classes. Without this check a single-class ``y_true``
        would silently broadcast and yield a meaningless value.
    """
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    # A 15th distinct label is treated as the extra class 99 (weight 2).
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')

    # Transform y_true into a one-hot matrix (columns in sorted label order).
    y_ohe = pd.get_dummies(y_true)
    if y_ohe.shape[1] != len(classes):
        raise ValueError(
            'y_true contains %d distinct classes but %d are expected; '
            'the weighted log loss needs every class to be present'
            % (y_ohe.shape[1], len(classes))
        )
    # Clip probabilities to [1e-15, 1 - 1e-15] so the log stays finite
    # (rows are not re-normalised).
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1-1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Per-class sum of log-probabilities assigned to the true class;
    # .values is used to drop the index of DataFrames.
    y_log_ones = np.sum(y_ohe.values.astype(float) * y_p_log, axis=0)
    # Number of positives for each class (non-zero thanks to the check above).
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weighted average of the per-class mean log-likelihoods.
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return 'wloss', loss, False

def multi_weighted_logloss(y_true, y_preds):
    """
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. Every expected class must occur at least once: the
        one-hot columns are matched to the class weights and to the columns
        of ``y_preds`` by sorted position.
    y_preds : array-like of shape (n_samples, n_classes)
        Predicted class probabilities, one column per class.

    Returns
    -------
    float
        The class-weighted multi-class log loss (lower is better).

    Raises
    ------
    ValueError
        If the number of distinct labels in ``y_true`` differs from the number
        of expected classes. Without this check a single-class ``y_true``
        would silently broadcast and yield a meaningless value.
    """
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weight = {6: 1, 15: 2, 16: 1, 42: 1, 52: 1, 53: 1, 62: 1, 64: 2, 65: 1, 67: 1, 88: 1, 90: 1, 92: 1, 95: 1}
    # A 15th distinct label is treated as the extra class 99 (weight 2).
    if len(np.unique(y_true)) > 14:
        classes.append(99)
        class_weight[99] = 2
    y_p = y_preds
    # Transform y_true into a one-hot matrix (columns in sorted label order).
    y_ohe = pd.get_dummies(y_true)
    if y_ohe.shape[1] != len(classes):
        raise ValueError(
            'y_true contains %d distinct classes but %d are expected; '
            'the weighted log loss needs every class to be present'
            % (y_ohe.shape[1], len(classes))
        )
    # Clip probabilities to [1e-15, 1 - 1e-15] so the log stays finite
    # (rows are not re-normalised).
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1-1e-15)
    # Transform to log
    y_p_log = np.log(y_p)
    # Per-class sum of log-probabilities assigned to the true class;
    # .values is used to drop the index of DataFrames.
    y_log_ones = np.sum(y_ohe.values.astype(float) * y_p_log, axis=0)
    # Number of positives for each class (non-zero thanks to the check above).
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    # Weighted average of the per-class mean log-likelihoods.
    class_arr = np.array([class_weight[k] for k in sorted(class_weight.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = - np.sum(y_w) / np.sum(class_arr)
    return loss

# Extracting Useful Features <section id="section_EUF" />

[return](#section_top)

In [None]:
# Light curves are summarised per (object_id, passband) pair.
grp_col = ['object_id', 'passband']

# Statistics computed for each group; only flux and flux_err are aggregated.
aggs = dict(
    flux=['min', 'max', 'mean', 'median', 'std'],
    flux_err=['median', 'std'],
)

In [None]:
%%time
gc.enable()

# Load the raw training light curves with explicit, narrow dtypes.
train = pd.read_csv(
    '../input/training_set.csv',
    dtype=dict(
        object_id=np.int32,
        mjd=np.float64,
        passband=np.int8,
        flux=np.float32,
        flux_err=np.float32,
        detected=np.int32,
    ),
)

# Flat "<column>_<statistic>" names, following the (column, statistic) order of `aggs`.
new_columns = [f'{col}_{stat}' for col, stats in aggs.items() for stat in stats]

# One row of statistics per grouping key.
agg_train = train.groupby(grp_col).agg(aggs)
agg_train.columns = new_columns
# Spread the passband level into columns so that each object_id ends up on one row.
agg_train = agg_train.pivot_table(index='object_id', columns='passband')
del train

display(agg_train.head(10))
print("gc.collect:", gc.collect())

In [None]:
%%time

# Per-object metadata, read with explicit narrow dtypes.
meta_train = pd.read_csv(
    '../input/training_set_metadata.csv',
    dtype={
        'object_id': np.int32,
        'ddf': np.int8,  # bool-like flag
        'target': np.int8,
        **{
            name: np.float32
            for name in ('ra', 'decl', 'gal_l', 'gal_b', 'hostgal_specz',
                         'hostgal_photoz', 'hostgal_photoz_err', 'distmod', 'mwebv')
        },
    },
)

display(meta_train.head())

# Join the aggregated light-curve features with the metadata on object_id.
full_train = pd.merge(
    agg_train.reset_index(),
    meta_train,
    how='outer',
    on='object_id',
)
# Remove the tuple-named ('object_id', '') column
# (presumably a leftover of resetting the pivoted index - TODO confirm).
full_train = full_train.drop(columns=[('object_id', '')])

# Split the label column off from the feature frame.
if 'target' in full_train:
    y = full_train.pop('target')
classes = sorted(y.unique())

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
# Every class weighs 1, except 64 and 15 which weigh 2.
class_weight = dict.fromkeys(classes, 1)
for c in [64, 15]:
    class_weight[c] = 2

print('Unique classes : ', classes)

In [None]:
# The aggregated frame has already been merged into full_train; drop the name
# and print the number of objects the garbage collector reports as collected.
del agg_train
print("gc.collect:",gc.collect())

In [None]:
%%time

# Toggle for mean-imputing missing feature values (currently off).
isfillNaN = False

if 'object_id' in full_train:
    # Keep the ids aside, then remove the columns that are not used as features.
    oof_df = full_train[['object_id']]
    del full_train['object_id']
    del full_train['hostgal_specz']
    del full_train['ddf']

if isfillNaN:
    # Column means of the training features, used as the fill value.
    train_mean = full_train.mean(axis=0)
    full_train.fillna(train_mean, inplace=True)

# Cross-validation splitter plus containers for fitted models and importances.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clfs = list()
importances = pd.DataFrame()

# Parameter Tuning <section id="section_pt" />

[return](#section_top)

In [None]:
%%time
# Search space handed to scikit-optimize: one dimension per tuned hyper-parameter.
dim_learning_rate = Real(1e-6, 1e-1, prior='log-uniform', name='learning_rate')
dim_estimators = Integer(800, 2000, name='n_estimators')
dim_max_depth = Integer(3, 6, name='max_depth')

dimensions = [dim_learning_rate, dim_estimators, dim_max_depth]

# Starting point of the search, in the same order as `dimensions`.
default_parameters = [0.03, 1000, 3]

# LightGBM settings that stay fixed while the parameters above are tuned.
# num_leaves is deliberately left unset (see the 2^(max_depth) > num_leaves warning).
lgb_params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=14,
    metric='multi_logloss',
    subsample=.9,
    colsample_bytree=.7,
    reg_alpha=.01,   # L1
    reg_lambda=.02,  # L2
    min_split_gain=0.01,
    min_child_weight=10,
    silent=True,
    verbosity=-1,
)

In [None]:
%%time
def createModel(learning_rate,n_estimators,max_depth):
    """Cross-validate an LGBM classifier and return its out-of-fold weighted log loss.

    Reads the module-level ``full_train``, ``y``, ``classes``, ``folds`` and
    ``lgb_params``; every fitted classifier is appended to the module-level
    ``clfs`` list.

    Parameters
    ----------
    learning_rate : float
        Passed to ``LGBMClassifier`` as ``learning_rate``.
    n_estimators : int
        Passed to ``LGBMClassifier`` as ``n_estimators``.
    max_depth : int
        Passed to ``LGBMClassifier`` as ``max_depth``.

    Returns
    -------
    float
        ``multi_weighted_logloss`` of the out-of-fold predictions over all rows.
    """
    oof_preds = np.zeros((len(full_train), len(classes)))
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        clf = lgb.LGBMClassifier(**lgb_params,learning_rate=learning_rate,
                                n_estimators=n_estimators,max_depth=max_depth)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgb_multi_weighted_logloss,
            verbose=False,#True,
            early_stopping_rounds=50
        )
        # Predict the validation fold once and reuse the result for both the
        # out-of-fold matrix and the per-fold score.
        val_proba = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        oof_preds[val_, :] = val_proba
        print('fold',fold_+1,multi_weighted_logloss(val_y, val_proba))

        # NOTE(review): this grows the global list on every call, so a
        # hyper-parameter search keeps the models of every trial alive.
        clfs.append(clf)

    loss = multi_weighted_logloss(y_true=y, y_preds=oof_preds)
    print('MULTI WEIGHTED LOG LOSS : %.5f ' % loss)

    return loss

In [None]:
%%time
@use_named_args(dimensions=dimensions)
def fitness(learning_rate,n_estimators,max_depth):
    """
    Objective function minimised by the hyper-parameter search.

    Hyper-parameters:
    learning_rate:     Learning-rate for the optimizer.
    n_estimators:      Number of estimators.
    max_depth:         Maximum Depth of tree.

    Returns the value produced by createModel for this candidate.
    """
    # Report the candidate that is about to be evaluated.
    print('learning rate: {0:.2e}'.format(learning_rate))
    print('estimators:', n_estimators)
    print('max depth:', max_depth)

    candidate = dict(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    return createModel(**candidate)

## Examination
<section id="section_ex" />

[return](#section_top)
    

In [None]:
%%time
# Re-create the 5-fold stratified splitter with a fixed seed; createModel reads
# this module-level `folds` when it is called by the search.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

----------------------------------------

In [None]:
%%time

# Switch for the (slow) Gaussian-process search; when off, the defaults are used later.
isSearchForHyperparameters = False

if isSearchForHyperparameters:
    # 20 evaluations of `fitness`, starting from `default_parameters`.
    search_result = gp_minimize(
        fitness,
        dimensions,
        acq_func='EI',
        n_calls=20,
        x0=default_parameters,
        n_jobs=-1,
    )

# Convergence Plot
<section id="section_CPlot" />

[return](#section_top)

In [None]:
# Use the optimum found by scikit-optimize when the search ran (after plotting
# its convergence); otherwise fall back to the default starting point.
if isSearchForHyperparameters:
    plot_convergence(search_result)
    plt.show()

    print(search_result.x)
    learning_rate, n_estimators, max_depth = search_result.x[:3]
else:
    learning_rate, n_estimators, max_depth = default_parameters[:3]

for label, value in (("learning_rate:", learning_rate),
                     ("n_estimators:", n_estimators),
                     ("max_depth:", max_depth)):
    print(label, value)

In [None]:
# Release the optimisation result once the chosen parameters have been copied out.
# NOTE(review): this also unbinds the imported `plot_convergence`, so re-running
# the convergence-plot cell afterwards would raise a NameError.
if isSearchForHyperparameters:
    del search_result,plot_convergence

# Training LGB Classifier with tuned Parameters <section id="section_train" />

[return](#section_top)

In [None]:
%%time

# Final 5-fold training with the selected hyper-parameters. The fitted models
# are kept in `clfs`; per-fold feature importances are gathered in `importances`.
folds = StratifiedKFold(n_splits=5,
                        shuffle=True, random_state=1)
clfs = []
# Per-fold frames are collected here and concatenated once after the loop,
# instead of growing a DataFrame that starts out empty.
fold_importances = []

oof_preds = np.zeros((len(full_train), len(classes)))
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
    val_x, val_y = full_train.iloc[val_], y.iloc[val_]

    clf = lgb.LGBMClassifier(
        **lgb_params,
        learning_rate=learning_rate,
        n_estimators=n_estimators,max_depth=max_depth)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=lgb_multi_weighted_logloss,
        verbose=100,
        early_stopping_rounds=50)
    # Predict the validation fold once and reuse it for the out-of-fold matrix
    # and for the per-fold score.
    val_proba = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    oof_preds[val_, :] = val_proba
    print(multi_weighted_logloss(val_y, val_proba))

    imp_df = pd.DataFrame()
    imp_df['feature'] = full_train.columns
    # NOTE(review): feature_importances_ follows the classifier's
    # importance_type, which lgb_params does not set; presumably these are
    # split counts rather than gains despite the column name - confirm.
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    fold_importances.append(imp_df)

    clfs.append(clf)

importances = pd.concat(fold_importances, axis=0, sort=False)

print('MULTI WEIGHTED LOG LOSS : %.5f ' % multi_weighted_logloss(y_true=y, y_preds=oof_preds))


# Attach to every row the importance of its feature averaged over the folds.
mean_gain = importances[['gain', 'feature']].groupby('feature').mean()
importances['mean_gain'] = importances['feature'].map(mean_gain['gain'])
# plt.figure(figsize=(8, 12))
# sns.barplot(x='gain', y='feature', data=importances.sort_values('mean_gain', ascending=False))
# plt.tight_layout()
# plt.savefig('importances.png')

In [None]:
# Features ranked by their fold-averaged importance, highest first.
importances.loc[:,['feature','mean_gain']].groupby(
    'feature').mean().sort_values('mean_gain',ascending=False)

In [None]:
# lgb.plot_tree(clf,figsize=(18,10))

In [None]:
# Importances recorded for fold 1, highest first.
importances.loc[importances.fold==1].sort_values('gain',ascending=False)

In [None]:
# Importances recorded for fold 2, highest first.
importances.loc[importances.fold==2].sort_values('gain',ascending=False)

In [None]:
# Importances recorded for fold 3, highest first.
importances.loc[importances.fold==3].sort_values('gain',ascending=False)

In [None]:
# Importances recorded for fold 4, highest first.
importances.loc[importances.fold==4].sort_values('gain',ascending=False)

In [None]:
# Importances recorded for fold 5, highest first.
importances.loc[importances.fold==5].sort_values('gain',ascending=False)

In [None]:
# Features ranked by their fold-averaged importance, highest first.
importances.loc[:,['feature','mean_gain']].groupby('feature').mean().sort_values('mean_gain',ascending=False)

In [None]:
# Drop the out-of-fold predictions and importance tables before the test set
# is processed, then print what the garbage collector reports as collected.
del oof_preds,importances,mean_gain
print("gc.collect:",gc.collect())

# Making Predictions <section id="section_pred" />

[return](#section_top)

In [None]:
%%time
print("read test_set_metadata.csv")
# Test-set metadata, read with explicit narrow dtypes; every listed column
# other than object_id and ddf is stored as float32.
meta_test = pd.read_csv(
    '../input/test_set_metadata.csv',
    dtype={
        'object_id': np.int32,
        'ddf': np.int8,  # bool-like flag
        **{
            name: np.float32
            for name in ('ra', 'decl', 'gal_l', 'gal_b', 'hostgal_specz',
                         'hostgal_photoz', 'hostgal_photoz_err', 'distmod', 'mwebv')
        },
    },
)

In [None]:
%%time

# isDebug=False

import time

from tqdm import tqdm

start = time.time()
# Number of test-set rows read per chunk.
chunks = 5_000_000

# Number of leading probability columns folded into the class_99 estimate.
preds_1 = 14
print("read test_set.csv")
# Remember the training feature order, then free the training frame.
columnslist=full_train.columns
del full_train
print("gc.collect",gc.collect())
# NOTE: a chunk boundary can split one object's rows across two chunks, in
# which case that object_id is written twice with partial aggregates; the
# de-duplication cell that reads predictions.csv averages such duplicates.
for i_c, df in enumerate(tqdm(pd.read_csv('../input/test_set.csv',
                                     chunksize=chunks, iterator=True,
                                     dtype = {'object_id':np.int32,
                                              'mjd':np.float64,
                                              'passband':np.int8,
                                              'flux':np.float32,
                                              'flux_err':np.float32,
                                              'detected':np.int32}))):
    # Same feature pipeline as for the training set.
    agg_test = df.groupby(grp_col).agg(aggs)
    agg_test.columns = new_columns
    agg_test=pd.pivot_table(agg_test, index='object_id', columns='passband')
    full_test = agg_test.reset_index().merge(right=meta_test, how='left', on='object_id')
    if isfillNaN:
        full_test = full_test.fillna(train_mean)

    # Average the class probabilities over the fitted fold models. The divisor
    # is the number of models actually iterated, so the result stays a mean
    # even if `clfs` and `folds` ever get out of step.
    preds = None
    for clf in clfs:
        fold_preds = clf.predict_proba(full_test[columnslist]) / len(clfs)
        if preds is None:
            preds = fold_preds
        else:
            preds += fold_preds

    # preds_99 = 0.1 gives 1.769
    # Product of (1 - p) over the first preds_1 class probabilities.
    preds_99 = np.ones(preds.shape[0])
    for i in range(preds_1):
        preds_99 *= (1 - preds[:, i])

    # Store predictions
    preds_df = pd.DataFrame(preds, columns=['class_' + str(s) for s in clfs[0].classes_])
    preds_df['object_id'] = full_test['object_id']
    #     https://www.kaggle.com/ogrellier/plasticc-in-a-kernel-meta-and-data/code
    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/68943
    # Rescale so that class_99 averages 0.14 within the chunk.
    preds_df['class_99'] = 0.14 * preds_99 / np.mean(preds_99)

    # The first chunk creates (or overwrites) the file and writes the header;
    # later chunks append. Appending the first chunk as well would pile new
    # rows and a second header onto a file left over from a previous run.
    first_chunk = i_c == 0
    preds_df.to_csv('predictions.csv', header=first_chunk,
                    mode='w' if first_chunk else 'a', index=False)

    del agg_test, full_test, preds_df, preds
    gc.collect()

    if (i_c + 1) % 10 == 0:
        print('%15d done in %5.1f' % (chunks * (i_c + 1), (time.time() - start) / 60))

In [None]:
%%time
# Load the per-chunk predictions written by the prediction loop.
z = pd.read_csv('predictions.csv')

# Largest number of rows sharing one object_id, then how many ids occur more than once.
print(z['object_id'].value_counts().max())
print(z['object_id'].value_counts().gt(1).sum())

# Collapse duplicated ids by averaging their rows, then write one row per object.
z = z.groupby('object_id').mean()
z.to_csv('single_predictions.csv', index=True)

In [None]:
# Minutes elapsed since `notebookstart` was recorded in the first code cell.
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))