<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Enviroment" data-toc-modified-id="Enviroment-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Enviroment</a></span><ul class="toc-item"><li><span><a href="#OS" data-toc-modified-id="OS-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>OS</a></span></li><li><span><a href="#Library" data-toc-modified-id="Library-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Library</a></span></li></ul></li><li><span><a href="#Import-Module" data-toc-modified-id="Import-Module-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import Module</a></span></li><li><span><a href="#Read-Data" data-toc-modified-id="Read-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read Data</a></span></li><li><span><a href="#Feature-Engineering" data-toc-modified-id="Feature-Engineering-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Feature Engineering</a></span></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Modeling</a></span><ul class="toc-item"><li><span><a href="#LIGHTGBM" data-toc-modified-id="LIGHTGBM-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>LIGHTGBM</a></span></li><li><span><a href="#CATBOOST" data-toc-modified-id="CATBOOST-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>CATBOOST</a></span></li><li><span><a href="#XGBOOST" data-toc-modified-id="XGBOOST-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>XGBOOST</a></span></li></ul></li><li><span><a href="#Weight-Ensemble" data-toc-modified-id="Weight-Ensemble-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Weight Ensemble</a></span></li><li><span><a href="#Submit" data-toc-modified-id="Submit-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Submit</a></span></li></ul></div>

# Enviroment

## OS

**`OS`** : Window10 Pro

**`CPU`** : Ryzen 3600

**`RAM`** : 32GB

**`GPU`** : RTX 2070 Super

## Library

**`Python`** : Version 3.74

- **`pandas`** : 0.25.1

- **`numpy`**: 1.16.5

- **`scikit-learn`**: 0.21.3

- **`xgboost`**: 1.0.1

- **`catboost`**: 0.21

- **`lightgbm`**: 2.3.1

# Import Module

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import xgboost as xgb

# Metric, Kfold
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

# Weight Ensemble
from scipy.optimize import minimize

import time
import warnings 
warnings.filterwarnings('ignore')

# Read Data

In [2]:
train = pd.read_csv('./data/train.csv', index_col='id')
test = pd.read_csv('./data/test.csv', index_col='id')

submission = pd.read_csv('./data/sample_submission.csv')

# Feature Engineering

In [3]:
def SpectralClass(x):
    if x < -0.3:
        return "O"
    
    elif (x >= -0.3) & (x < 0):
        return "B"
    
    elif (x >= 0) & (x < 0.33):
        return "A"
    
    elif (x >= 0.33) & (x < 0.6):
        return "F"
    
    elif (x >= 0.6) & (x < 0.81):
        return "G"
    
    elif (x >= 0.81) & (x < 1.4):
        return "K"
    
    else:
        return "M"

In [5]:
def ugriz(df):
    mag = ['psfMag', 'fiberMag', 'PetroMag', 'model']
    colors = list('ugriz')
    
    u = list(df.columns[df.columns.str.endswith('_u')])
    g = list(df.columns[df.columns.str.endswith('_g')])
    r = list(df.columns[df.columns.str.endswith('_r')])
    i = list(df.columns[df.columns.str.endswith('_i')])
    z = list(df.columns[df.columns.str.endswith('_z')])
    
    # u_g
    for idx, cols in enumerate(mag):
        df[cols+'_u_g'] = df[u[idx]] - df[g[idx]]
     
    # g_r
    for idx, cols in enumerate(mag):
        df[cols+'_g_r'] = df[g[idx]] - df[r[idx]]
        
    # r_i
    for idx, cols in enumerate(mag):
        df[cols+'_r_i'] = df[r[idx]] - df[i[idx]]
        
    # g_i
    for idx, cols in enumerate(mag):
        df[cols+'_g_i'] = df[g[idx]] - df[i[idx]] 
        
    # i_z
    for idx, cols in enumerate(mag):
        df[cols+'_i_z'] = df[i[idx]] - df[z[idx]]
    
    # model-[psf,fiber,petro]
    # fiber-[psf]
    for color in colors:
        df['model_psf_'+ color] = df[locals()[color][3]] - df[locals()[color][0]]
        df['model_fiber_'+ color] = df[locals()[color][3]] - df[locals()[color][1]]
        df['model_petro_'+ color] = df[locals()[color][3]] - df[locals()[color][2]]
        df['fiber_psf_' + color] = df[locals()[color][1]] - df[locals()[color][0]]
        
    # B_V
    for idx, cols in enumerate(mag):
        df[cols+'_b_v'] = 0.98 * (df[g[idx]] - df[r[idx]]) + 0.22
        df['star_spectrum_'+cols] = df[cols+'_b_v'].apply(lambda x: SpectralClass(x))
        
    return df

In [7]:
train = ugriz(train)
test = ugriz(test)

In [6]:
bins= [0, 100, 200, 300, 400, 500, 600, 640, 1000]
labels = list('01234567')

train['fiberID2'] = pd.cut(train['fiberID'], bins= bins, labels = labels)
test['fiberID2'] = pd.cut(test['fiberID'], bins= bins, labels = labels)

In [8]:
# One-Hot Encoding
star = list(train.columns[train.columns.str.startswith('star_')])
star.append('fiberID2')

train= pd.get_dummies(data=train, columns=star)
test = pd.get_dummies(data=test, columns=star)

In [9]:
type_list  = list(submission.columns)[1:]
type_dict = {w: i for i, w in enumerate(type_list)}

train['type'] = train['type'].apply(lambda x: type_dict[x])

# Modeling

In [10]:
features = [c for c in train.columns if c not in ['type']]
target = train['type']

## LIGHTGBM

In [13]:
param = {
        'num_leaves': 10,
        'num_class': 19,
        'learning_rate': 0.03,
        'bagging_fraction': 0.7, 
        'feature_fraction': 0.7,
        'max_depth': 8,
        'seed': 1337,
        'lambda_l1': 4.972,
        'lambda_l2': 2.276,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'multi_logloss',
        'is_unbalance': True,
        'boost_from_average': False,
    }

In [14]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_lgb = np.zeros((len(train),19))
lgb_pred = np.zeros((len(test),19))

start = time.time()

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train, train['type'])):
    print("fold num_: {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])
    
    
    num_round = 5000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)    
    lgb_pred += clf.predict(test[features], num_iteration=clf.best_iteration) / 5
    
print('\nCross Validation Is Complete')                           
print("CV score: {:<8.5f}".format(log_loss(target, oof_lgb)))

fold num_: 0
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.526307	valid_1's multi_logloss: 0.533403
[200]	training's multi_logloss: 0.390119	valid_1's multi_logloss: 0.403291
[300]	training's multi_logloss: 0.356096	valid_1's multi_logloss: 0.375412
[400]	training's multi_logloss: 0.338911	valid_1's multi_logloss: 0.364146
[500]	training's multi_logloss: 0.326834	valid_1's multi_logloss: 0.357606
[600]	training's multi_logloss: 0.311941	valid_1's multi_logloss: 0.348292
[700]	training's multi_logloss: 0.298786	valid_1's multi_logloss: 0.340521
[800]	training's multi_logloss: 0.290075	valid_1's multi_logloss: 0.33688
[900]	training's multi_logloss: 0.283773	valid_1's multi_logloss: 0.33546
[1000]	training's multi_logloss: 0.278325	valid_1's multi_logloss: 0.334713
[1100]	training's multi_logloss: 0.273359	valid_1's multi_logloss: 0.334239
[1200]	training's multi_logloss: 0.268739	valid_1's multi_logloss: 0.333829
[1300]	training's multi_

Early stopping, best iteration is:
[1979]	training's multi_logloss: 0.237514	valid_1's multi_logloss: 0.338948

Cross Validation Is Complete
CV score: 0.33566 
Wall time: 21min 5s


## CATBOOST

In [15]:
model = CatBoostClassifier(loss_function='MultiClass', 
                           early_stopping_rounds=50,
                           random_state=42,
                           task_type="GPU",
                           learning_rate=0.03,
                           iterations=5000)

skf = StratifiedKFold(n_splits=5, random_state=74, shuffle=True)


oof_cat = np.zeros((len(train),19))
cat_pred = np.zeros((len(test),19))


for idx, (train_index, valid_index) in enumerate(skf.split(train, train['type'])):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train, X_valid = train[features].iloc[train_index,:], train[features].iloc[valid_index,:]
    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=500
                         )
    pred = fit_model.predict_proba(X_valid)
    print( "  Log loss = ", log_loss(y_valid, pred) )
    oof_cat[valid_index] = pred
    cat_pred += fit_model.predict_proba(test[features])
cat_pred /= 5


Fold  0
0:	learn: 2.6317236	test: 2.6329529	best: 2.6329529 (0)	total: 27.6ms	remaining: 6m 54s
500:	learn: 0.3561276	test: 0.3736210	best: 0.3736210 (500)	total: 12s	remaining: 5m 46s
1000:	learn: 0.3211845	test: 0.3529838	best: 0.3529838 (1000)	total: 23.7s	remaining: 5m 31s
1500:	learn: 0.3001953	test: 0.3457730	best: 0.3457730 (1500)	total: 35.9s	remaining: 5m 22s
2000:	learn: 0.2835189	test: 0.3422781	best: 0.3422781 (2000)	total: 47.6s	remaining: 5m 9s
2500:	learn: 0.2688179	test: 0.3405325	best: 0.3405287 (2499)	total: 59.3s	remaining: 4m 56s
3000:	learn: 0.2560304	test: 0.3394969	best: 0.3394969 (3000)	total: 1m 11s	remaining: 4m 45s
3500:	learn: 0.2439809	test: 0.3389169	best: 0.3389114 (3497)	total: 1m 23s	remaining: 4m 33s
bestTest = 0.3387334118
bestIteration = 3742
Shrink model to first 3743 iterations.
  Log loss =  0.33873330188604067

Fold  1
0:	learn: 2.6321060	test: 2.6317583	best: 2.6317583 (0)	total: 26ms	remaining: 6m 29s
500:	learn: 0.3587448	test: 0.3670907	best

## XGBOOST

In [11]:
xgb_params={'eta':0.03,
            'max_depth':6,
            'objective':'multi:softprob',
            'alpha' : 4.972,
            'lambda' : 2.276,
            'num_class':19,
            'subsample':0.7,
            'colsample_bytree':0.7,
            'random_state':42,
            'eval_metric': 'mlogloss',
            'tree_method':'gpu_hist',
            'predictor':'gpu_predictor'}

In [12]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_xgb = np.zeros((len(train),19))
xgb_pred = np.zeros((len(test),19))

start = time.time()

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train, train['type'])):
    print("fold num_: {}".format(fold_))
    trn_data = xgb.DMatrix(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx][features], label=target.iloc[val_idx])
    
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    num_round = 5000
    clf = xgb.train(params = xgb_params,
                    dtrain = trn_data,
                    num_boost_round  = num_round,
                    evals = watchlist,
                    verbose_eval=100,
                    early_stopping_rounds = 100
                )
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(train.iloc[val_idx][features]), ntree_limit=clf.best_iteration)

    
    xgb_pred += clf.predict(xgb.DMatrix(test[features]), ntree_limit=clf.best_iteration) / 5
    
print('\nCross Validation Is Complete')                           
print("CV score: {:<8.5f}".format(log_loss(target, oof_xgb)))

fold num_: 0
[0]	train-mlogloss:2.73067	valid-mlogloss:2.73121
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.45262	valid-mlogloss:0.47024
[200]	train-mlogloss:0.33252	valid-mlogloss:0.36065
[300]	train-mlogloss:0.30203	valid-mlogloss:0.34170
[400]	train-mlogloss:0.28354	valid-mlogloss:0.33544
[500]	train-mlogloss:0.26718	valid-mlogloss:0.33248
[600]	train-mlogloss:0.25220	valid-mlogloss:0.33083
[700]	train-mlogloss:0.23857	valid-mlogloss:0.32984
[800]	train-mlogloss:0.22607	valid-mlogloss:0.32919
[900]	train-mlogloss:0.21434	valid-mlogloss:0.32900
[1000]	train-mlogloss:0.20365	valid-mlogloss:0.32886
[1100]	train-mlogloss:0.19377	valid-mlogloss:0.32903
Stopping. Best iteration:
[1020]	train-mlogloss:0.20166	valid-mlogloss:0.32885

fold num_: 1
[0]	train-mlogloss:2.73041	valid-mlogloss:2.73102
Multiple eval metrics have been passed: 'valid-mlogloss' will be u

# Weight Ensemble

In [16]:
predictions = [oof_xgb, oof_lgb, oof_cat]

def log_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return log_loss(train['type'], final_prediction)
    
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)

#adding constraints  and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)

res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensamble Score: 0.32772569398255863
Best Weights: [0.57025908 0.03675345 0.39298747]


In [18]:
weight_result = [0.57025908, 0.03675345, 0.39298747]

final = xgb_pred*weight_result[0] + lgb_pred*weight_result[1] + cat_pred*weight_result[2]

# Submit

In [19]:
final = pd.DataFrame(data=final, columns=submission.columns[1:], index=submission.id).reset_index()
final.to_csv('final_victory.csv', index=False)