## The purpose of this kernel is to bring together features
- The first 69 are from the 1.080 kernel, which came via Olivier, Iprapas, and Chia-Ta Tsai
- Our SMOTE oversampling brought this to 1.052; implementing Scirpus's class-99 method brought it to 1.039
- Adding seven (7) features from our PB25 model brought it to 1.030
- Changing K from 5 to 12 brought it to 1.029 (not recommended during development)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what is available in the input directory and in the two
# feature-table datasets that later cells read from.
for input_dir in ('../input',
                  "../input/writefeaturetablefromsmotedartset",
                  '../input/fork-of-aggregatecustomfeaturestest'):
    print(os.listdir(input_dir))
# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
#from tsfresh.feature_extraction import extract_features
# Silence all warnings for the rest of the session. The standard-library
# module is used directly: `np.warnings` was only an incidental alias of it
# and is not available in recent NumPy releases.
import warnings
warnings.filterwarnings('ignore')


## The next cells will be a bunch of functions brought in mostly from Chia-Ta Tsai's kernel
- And he references Iprapas, Olivier, Kyle Boone, Giba
- The Kaggle community shares a lot with one another
- It has been challenging integrating their code with our functions
- And it has forced us to learn / understand their code (to some degree)

In [None]:
"""

This script is forked from chia-ta tsai's kernel of which he said:

This script is forked from iprapas's notebook 
https://www.kaggle.com/iprapas/ideas-from-kernels-and-discussion-lb-1-135

#    https://www.kaggle.com/ogrellier/plasticc-in-a-kernel-meta-and-data
#    https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70908
#    https://www.kaggle.com/meaninglesslives/simple-neural-net-for-time-series-classification
#
"""

import sys, os
import argparse
import time
from datetime import datetime as dt
import gc; gc.enable()
from functools import partial, wraps

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
# Silence all warnings for the rest of the session. The standard-library
# module is used directly: `np.warnings` was only an incidental alias of it
# and is not available in recent NumPy releases.
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from tsfresh.feature_extraction import extract_features
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:

def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
    """
    Class-weighted multi-class log loss for the PLAsTiCC challenge.

    Refactored from @author olivier https://www.kaggle.com/ogrellier

    Parameters
    ----------
    y_true : array-like with a ``shape`` attribute
        True labels, one per sample.
    y_preds : numpy.ndarray
        Predicted probabilities. Reshaped in column-major order to
        ``(n_samples, len(classes))``, so both a flat per-class vector and
        an already two-dimensional array are accepted.
    classes : sequence
        Only its length is used (number of probability columns).
    class_weights : dict
        Weight per class; weights are applied in sorted-key order.

    Returns
    -------
    float
        Weighted average of the per-class mean negative log-likelihood.

    Notes
    -----
    The one-hot columns come from ``pd.get_dummies(y_true)``, so the code
    relies on every class being present in ``y_true``. Class 99 is not
    handled here; it gets a separate treatment downstream.
    """
    n_rows = y_true.shape[0]

    # Probabilities are clipped away from 0 and 1 so the log stays finite.
    proba = y_preds.reshape(n_rows, len(classes), order='F')
    proba = np.clip(a=proba, a_min=1e-15, a_max=1 - 1e-15)
    log_proba = np.log(proba)

    # Per class: sum of log-probabilities over the rows that truly belong
    # to it, and the number of such rows.
    one_hot = pd.get_dummies(y_true)
    per_class_log_sum = np.sum(one_hot.values * log_proba, axis=0)
    per_class_count = one_hot.sum(axis=0).values.astype(float)

    weight_vec = np.array([class_weights[key] for key in sorted(class_weights.keys())])
    weighted = per_class_log_sum * weight_vec / per_class_count

    return - np.sum(weighted) / np.sum(weight_vec)

In [None]:
def lgbm_multi_weighted_logloss(y_true, y_preds):
    """
    Custom evaluation metric for LightGBM: PLAsTiCC weighted log loss.

    Refactored from @author olivier https://www.kaggle.com/ogrellier

    The class list and weights are fixed here, taken from
    Giba's topic (https://www.kaggle.com/titericz,
    https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194)
    with Kyle Boone's post (https://www.kaggle.com/kyleboone):
    every class weighs 1 except classes 15 and 64, which weigh 2.

    Returns
    -------
    tuple
        ``('wloss', loss, False)``: metric name, value, and a flag saying
        that a higher value is not better.
    """
    classes = [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
    class_weights = dict.fromkeys(classes, 1)
    class_weights[15] = 2
    class_weights[64] = 2

    return 'wloss', multi_weighted_logloss(y_true, y_preds, classes, class_weights), False


def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):
    """
    Custom evaluation metric for XGBoost: PLAsTiCC weighted log loss.

    ``y_true`` must expose ``get_label()``; the labels it returns are
    scored against ``y_predicted`` with ``multi_weighted_logloss``.

    Returns
    -------
    tuple
        ``('wloss', loss)``.
    """
    labels = y_true.get_label()
    return 'wloss', multi_weighted_logloss(labels, y_predicted, classes, class_weights)

In [None]:

def save_importances_archive(importances_):
    """
    Add a ``mean_gain`` column holding each feature's average ``gain``.

    The frame is modified in place and also returned. No plot is drawn
    and nothing is written to disk.
    """
    gain_by_feature = importances_.groupby('feature')['gain'].mean()
    importances_['mean_gain'] = importances_['feature'].map(gain_by_feature)
    return importances_

def save_importances(importances_):
    """
    Plot per-feature gain as a bar chart and save it to 'importances.png'.

    A ``mean_gain`` column (average ``gain`` per feature) is added to the
    frame in place and used to order the bars, highest first. The same
    frame is returned.
    """
    gain_by_feature = importances_.groupby('feature')['gain'].mean()
    importances_['mean_gain'] = importances_['feature'].map(gain_by_feature)

    ranked = importances_.sort_values('mean_gain', ascending=False)
    plt.figure(figsize=(8, 12))
    sns.barplot(x='gain', y='feature', data=ranked)
    plt.tight_layout()
    plt.savefig('importances.png')
    return importances_

In [None]:
def xgb_modeling_cross_validation(params,
                                  full_train,
                                  y,
                                  classes,
                                  class_weights,
                                  nr_fold=7,
                                  random_state=1):
    """
    Train one XGBClassifier per stratified fold and score the
    out-of-fold predictions with the weighted log loss.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBClassifier``.
    full_train : pandas.DataFrame
        Feature table, row-aligned with ``y``.
    y : pandas.Series
        Class label of every row.
    classes, class_weights
        Forwarded to the loss functions.
    nr_fold : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    tuple
        ``(clfs, score)``: the fitted classifiers (one per fold) and the
        overall out-of-fold weighted log loss.

    Side effects: prints the loss of every fold and the overall loss,
    calls ``save_importances`` and writes 'xgb_importances.csv'.
    """
    # Inverse-frequency sample weights: total row count / class row count.
    label_counts = y.value_counts()
    weights = {label: np.sum(label_counts) / label_counts[label]
               for label in label_counts.index}

    # XGBoost only passes predictions and the data matrix to the metric,
    # so the class information is bound up front.
    func_loss = partial(xgb_multi_weighted_logloss,
                        classes=classes,
                        class_weights=class_weights)

    splitter = StratifiedKFold(n_splits=nr_fold,
                               shuffle=True,
                               random_state=random_state)
    feature_names = full_train.columns
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))

    clfs = []
    importances = pd.DataFrame()
    # y is passed as both arguments of split(); the stratification is
    # driven by the second one.
    for fold_no, (trn_idx, val_idx) in enumerate(splitter.split(y, y), start=1):
        trn_x, val_x = full_train.iloc[trn_idx], full_train.iloc[val_idx]
        trn_y, val_y = y.iloc[trn_idx], y.iloc[val_idx]

        clf = XGBClassifier(**params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=func_loss,
            verbose=100,
            early_stopping_rounds=77,
            sample_weight=trn_y.map(weights)
        )
        clfs.append(clf)

        # Predict the held-out rows with the best iteration found by
        # early stopping and store them in their out-of-fold slots.
        oof_preds[val_idx, :] = clf.predict_proba(val_x, ntree_limit=clf.best_ntree_limit)
        fold_loss = multi_weighted_logloss(val_y, oof_preds[val_idx, :],
                                           classes, class_weights)
        print('no {}-fold loss: {}'.format(fold_no, fold_loss))

        fold_importance = pd.DataFrame({
            'feature': feature_names,
            'gain': clf.feature_importances_,
            'fold': [fold_no] * len(feature_names),
        })
        importances = pd.concat([importances, fold_importance], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y, y_preds=oof_preds,
                                   classes=classes, class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))

    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('xgb_importances.csv', index=False)

    return clfs, score

## This method is my main contribution

In [None]:

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd

#modify to work with kfold
#def smoteAdataset(Xig, yig, test_size=0.2, random_state=0):
def smoteAdataset(Xig_train, yig_train, Xig_test, yig_test):
    """
    Oversample the training fold with SMOTE; pass the held-out fold through.

    Parameters
    ----------
    Xig_train : array-like
        Training features of the current fold.
    yig_train : array-like
        Training labels of the current fold (flattened before resampling).
    Xig_test, yig_test : array-like
        Held-out features and labels. They are NOT resampled, so the
        validation score is computed on real rows only.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled, Xig_test, yig_test)``
        with both label vectors wrapped in ``pandas.Series``.
    """
    sm = SMOTE(random_state=2)

    # imbalanced-learn renamed `fit_sample` to `fit_resample` and later
    # dropped the old name; prefer the new one and fall back for old
    # installations.
    resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
    Xig_train_res, yig_train_res = resample(Xig_train, np.ravel(yig_train))

    return Xig_train_res, pd.Series(yig_train_res), Xig_test, pd.Series(yig_test)

## This is Olivier's and Iprapas's method, but I integrated my SMOTE method into it

In [None]:

def lgbm_modeling_cross_validation(params,
                                   full_train,
                                   y,
                                   classes,
                                   class_weights,
                                   nr_fold=7,
                                   random_state=1):
    """
    Train one LGBMClassifier per stratified fold, with the training part
    of every fold oversampled by SMOTE, and score the out-of-fold
    predictions with the weighted log loss.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LGBMClassifier``.
    full_train : pandas.DataFrame
        Feature table, row-aligned with ``y``.
    y : pandas.Series
        Class label of every row.
    classes, class_weights
        Forwarded to ``multi_weighted_logloss``.
    nr_fold : int
        Number of stratified folds.
    random_state : int
        Seed for the fold shuffling.

    Returns
    -------
    tuple
        ``(clfs, score, oof_preds, importances)``: fitted classifiers,
        overall out-of-fold loss, the out-of-fold probability matrix and
        the per-fold feature-importance table.

    Side effects: prints the loss of every fold and the overall loss,
    calls ``save_importances`` and writes 'lgbm_importances.csv'.
    """
    # Inverse-frequency sample weights: total row count / class row count.
    # NOTE(review): these come from the label distribution BEFORE SMOTE but
    # are applied to the resampled labels below - confirm this is intended.
    label_counts = y.value_counts()
    weights = {label: np.sum(label_counts) / label_counts[label]
               for label in label_counts.index}

    splitter = StratifiedKFold(n_splits=nr_fold,
                               shuffle=True,
                               random_state=random_state)
    feature_names = full_train.columns
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))

    clfs = []
    importances = pd.DataFrame()
    # y is passed as both arguments of split(); the stratification is
    # driven by the second one.
    for fold_no, (trn_idx, val_idx) in enumerate(splitter.split(y, y), start=1):
        fold_trn_x, fold_trn_y = full_train.iloc[trn_idx], y.iloc[trn_idx]
        fold_val_x, fold_val_y = full_train.iloc[val_idx], y.iloc[val_idx]

        # SMOTE works on plain arrays; only the training part is
        # resampled, the validation part comes back unchanged.
        trn_arr, trn_y, val_arr, val_y = smoteAdataset(fold_trn_x.values,
                                                       fold_trn_y.values,
                                                       fold_val_x.values,
                                                       fold_val_y.values)
        # Put the column names back so the model sees named features.
        trn_x = pd.DataFrame(data=trn_arr, columns=feature_names)
        val_x = pd.DataFrame(data=val_arr, columns=feature_names)

        clf = LGBMClassifier(**params)
        clf.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric=lgbm_multi_weighted_logloss,
            verbose=100,
            early_stopping_rounds=77,
            sample_weight=trn_y.map(weights)
        )
        clfs.append(clf)

        # Predict the held-out rows with the best iteration found by
        # early stopping and store them in their out-of-fold slots.
        oof_preds[val_idx, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        fold_loss = multi_weighted_logloss(val_y, oof_preds[val_idx, :],
                                           classes, class_weights)
        print('no {}-fold loss: {}'.format(fold_no, fold_loss))

        fold_importance = pd.DataFrame({
            'feature': feature_names,
            'gain': clf.feature_importances_,
            'fold': [fold_no] * len(feature_names),
        })
        importances = pd.concat([importances, fold_importance], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y, y_preds=oof_preds,
                                   classes=classes, class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))

    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('lgbm_importances.csv', index=False)

    return clfs, score, oof_preds, importances

## I made modifications to these methods (predict_chunk, process_test)
- Separating feature extraction from modeling potentially allows us to do more within the 6 hour kernel window
- It allows us to generate features on parallel paths
- The chunk processing still seemed necessary as the kernel timed out without it

In [None]:

def predict_chunk(df_, clfs_, features, train_mean):
    """
    Predict class probabilities for one chunk of pre-computed test features.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Feature rows; must contain 'object_id' and every column named in
        ``features``.
    clfs_ : list
        Fitted classifiers exposing ``predict_proba`` and ``classes_``;
        their probabilities are averaged with equal weight.
    features : list-like
        Names of the feature columns fed to the classifiers.
    train_mean
        Not used by this function; kept in the signature for callers.

    Returns
    -------
    pandas.DataFrame
        One 'class_<label>' column per class of the first classifier,
        followed by 'object_id' and 'class_99'.
    """
    full_test = df_.reset_index()
    print(full_test.head())

    # Missing feature values are replaced by 0 before prediction.
    full_test = full_test.fillna(0)
    test_x = full_test[features]

    # Equal-weight average of the per-fold probabilities.
    n_models = len(clfs_)
    preds_ = None
    for model in clfs_:
        share = model.predict_proba(test_x) / n_models
        preds_ = share if preds_ is None else preds_ + share

    # Provisional class-99 score: product over classes of (1 - p_class),
    # i.e. the chance of belonging to none of the known classes. The
    # notebook recomputes class_99 later from the written predictions.
    preds_99 = np.ones(preds_.shape[0])
    for class_proba in preds_.T:
        preds_99 *= (1 - class_proba)

    preds_df_ = pd.DataFrame(preds_, columns=['class_' + str(label) for label in clfs_[0].classes_])
    preds_df_['object_id'] = full_test['object_id']
    # Rescaled so that class_99 averages 0.14 over the chunk.
    preds_df_['class_99'] = 0.14 * preds_99 / np.mean(preds_99)

    return preds_df_

## The filenaming here is counter-intuitive
- I ran into memory issues and the approach I took became necessary (or at least practical)
- I write testdf to a file that will later be overwritten by the predictions
- During the loop the features are read back in chunks from `filename` (named like `subm_<score>_<timestamp>.csv`)
- During the loop the predictions are being written to predictions.csv
- At the end the prediction dataFrame overwrites the featureSet file

In [None]:
def process_test(clfs,
                 testdf,
                 full_train,
                 train_mean,
                 filename='submission.csv',
                 chunks=40615):
    """
    Predict the whole test set chunk by chunk and write 'predictions.csv'.

    The rounded feature table is first dumped to ``filename`` and then read
    back ``chunks`` rows at a time, so only one chunk of features and
    predictions is handled per iteration.

    Parameters
    ----------
    clfs : list
        Fitted classifiers, forwarded to ``predict_chunk``.
    testdf : pandas.DataFrame
        Test feature table.
    full_train : pandas.DataFrame
        Training feature table; only its column names are used, to select
        the model features.
    train_mean
        Forwarded unchanged to ``predict_chunk``.
    filename : str
        Scratch file that receives the rounded test features.
    chunks : int
        Number of rows per chunk. Choosing a divisor of the test-set size
        keeps all chunks the same length.

    Returns
    -------
    None
        Results are written to 'predictions.csv'.
    """
    import time

    start = time.time()

    testdf = testdf.round(4)
    testdf.to_csv(filename, index=False)

    rows_done = 0
    for i_c, df in enumerate(pd.read_csv(filename, chunksize=chunks, iterator=True)):

        print(df.shape)
        preds_df = predict_chunk(df_=df,
                                 clfs_=clfs,
                                 features=full_train.columns,
                                 train_mean=train_mean)

        # The first chunk starts a fresh file with a header, so a re-run
        # does not stack a second header and duplicate rows onto an
        # earlier 'predictions.csv'; later chunks are appended.
        first_chunk = i_c == 0
        preds_df.to_csv('predictions.csv',
                        header=first_chunk,
                        mode='w' if first_chunk else 'a',
                        index=False)

        rows_done += len(df)
        del preds_df
        gc.collect()

        print('%15d done in %5.1f minutes' % (rows_done, (time.time() - start) / 60), flush=True)

    return

In [None]:
#from Scirpus discussion:

def GenUnknown(data):
    """
    Score for the unknown class (class 99), from the Scirpus discussion.

    ``data`` must provide 'mymean', 'mymedian' and 'mymax'. The result is
    the average of two terms:
    ``(mymedian + mymean / 2) / 2`` and ``(1 - mymax ** 3) / 2``.
    """
    peak = data["mymax"]
    center_term = (data["mymedian"] + data["mymean"] / 2.0) / 2.0
    peak_term = (1.0 - peak * (peak * peak)) / 2.0
    return (center_term + peak_term) / 2.0



In [None]:
# LightGBM hyper-parameters used for the cross-validated model.
# The "was ..." notes record the value each setting had before tuning.
best_params = dict(
    device='cpu',
    objective='multiclass',
    num_class=14,
    boosting_type='gbdt',
    n_jobs=-1,
    max_depth=7,
    n_estimators=1000,  # was 500
    subsample_freq=2,
    subsample_for_bin=5000,
    min_data_per_group=200,  # was 100
    max_cat_to_onehot=4,
    cat_l2=1.0,
    cat_smooth=59.5,
    max_cat_threshold=32,
    metric_freq=10,
    verbosity=-1,
    metric='multi_logloss',
    xgboost_dart_mode=False,
    uniform_drop=False,
    colsample_bytree=0.5,
    drop_rate=0.173,
    learning_rate=0.0133,  # was .267
    max_drop=5,
    min_child_samples=20,  # was 10
    min_child_weight=100.0,
    min_split_gain=0.1,
    num_leaves=7,
    reg_alpha=0.1,
    reg_lambda=0.00023,
    skip_drop=0.44,
    subsample=0.75,
)


## The cells below are adapted from Chia-Ta Tsai's 'main' method
- I made changes to enable usage of my features
- If you add features, make sure you add the same features to both testdf and full_train
- Be aware of memory issues.  It may be necessary to delete some sets once they've been used.  

In [None]:
#Here is a change from the script
#training features
# Load the two training feature tables: the base feature set and the
# custom aggregate features that are merged into it further down.
trainingDartDf = pd.read_csv('../input/writefeaturetablefromsmotedartset/trainingFeatures1039.csv')
trainingJimsDf = pd.read_csv('../input/aggregatecustomfeatures/trainingAggregate.csv')

# Remove the 'Unnamed: 0' column when the file carries one
# (presumably a saved row index - verify against the writer kernel).
trainingDartDf = trainingDartDf.drop(columns=['Unnamed: 0'], errors='ignore')
print(trainingDartDf.shape)


In [None]:

#trainingJimsDf=aggForTwoPb(trainingJimsDf, 2, 5)
# Sanity check: report the size of the custom aggregate table and
# preview its first rows (the cell-final expression is displayed).
print(trainingJimsDf.shape)
trainingJimsDf.head()

In [None]:
# Display the column names of the custom aggregate table; the names
# copied into the model's feature set are chosen from these.
trainingJimsDf.columns

In [None]:
#trainingDartDf.head()
# Custom aggregate features copied into the base feature table.
# The order is significant: it fixes the column order of the training
# (and, in the next cell, test) feature matrix.
columnsToAdd = [
    'pb0maxstd', 'pb0medmed', 'pb0maxavg', 'pb0maxdsl',
    'pb3maxstd', 'pb3medmed', 'pb3maxavg', 'pb3maxdsl', 'pb3transitory',
    'pb0transitory',
    'pb1maxstd', 'pb1medmed', 'pb1maxavg', 'pb1maxdsl',
    'pb4maxstd', 'pb4medmed', 'pb4maxavg', 'pb4maxdsl', 'pb4transitory',
    'pb1transitory',
    'pb2maxstd', 'pb2medmed', 'pb2maxavg', 'pb2maxdsl',
    'pb5maxstd', 'pb5medmed', 'pb5maxavg', 'pb5maxdsl', 'pb5transitory',
    'pb2transitory',
    'irmaxstd', 'vismaxstd', 'uvmaxstd',
    'irmedmed', 'vismedmed', 'uvmedmed',
    'irmaxavg', 'vismaxavg', 'uvmaxavg',
    'irmaxdsl', 'vismaxdsl', 'uvmaxdsl',
    'irtransitory', 'vistransitory', 'uvtransitory',
    'irSpread', 'uvSpread', 'visSpread',
    'irUvSpreadRatio', 'irMinusUvTransitory',
    'irVisSpreadRatio', 'irMinusVisTransitory',
]

# Columns are matched on the row index of the two tables, not on object_id.
# NOTE(review): assumes both files list the objects in the same order - confirm.
for column in columnsToAdd:
    trainingDartDf.loc[:, column] = trainingJimsDf[column]

traindf = trainingDartDf

# Columns removed from the feature set (from the 1.052 kernel).
for dropped in ('hostgal_specz', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf'):
    del traindf[dropped]

# The flux_err_* and flux_mean / flux_std / flux_max columns were also
# candidates for removal (they appear to be either properties of the
# measurement system or collinear with the newly added features) but
# are kept for now.

print(traindf.shape)
traindf.head()

In [None]:
    # Test features: same two sources and same processing as for training.
    testDartDf = pd.read_csv('../input/writefeaturetablefromsmotedartset/feat_0.648970_2018-11-23-09-00.csv')
    testJimsDf = pd.read_csv('../input/fork-of-aggregatecustomfeaturestest/testAggregate.csv')

    # Remove the 'Unnamed: 0' column when the file carries one.
    testDartDf = testDartDf.drop(columns=['Unnamed: 0'], errors='ignore')
    print(testDartDf.shape)

    # Columns are matched on the row index of the two tables.
    # NOTE(review): assumes both files list the objects in the same order - confirm.
    for column in columnsToAdd:
        testDartDf[column] = testJimsDf[column]

    testdf = testDartDf

    # Columns removed from the feature set (from the 1.052 kernel).
    for dropped in ('hostgal_specz', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf'):
        del testdf[dropped]

    testdf.shape

In [None]:
full_train = traindf

# Separate the label column from the features.
if 'target' in full_train:
    y = full_train.pop('target')

classes = sorted(y.unique())

# Every class weighs 1 except 64 and 15, which weigh 2.
# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weights = dict.fromkeys(classes, 1)
class_weights.update(dict.fromkeys([64, 15], 2))

print('Unique classes : {}, {}'.format(len(classes), classes))
print(class_weights)


In [None]:

pd.set_option('display.max_rows', 500)

# Keep the object ids aside; they are identifiers, not features.
if 'object_id' in full_train:
    oof_df = full_train.pop('object_id').to_frame()

# Column means must be taken BEFORE the missing values are replaced by 0.
train_mean = full_train.mean(axis=0)
full_train.fillna(0, inplace=True)

print(full_train.shape)
full_train.head()


In [None]:
# Final check of the modeling inputs: weights, class list and shapes.
print(class_weights, classes, y.shape, full_train.shape, sep='\n')

## The first two lines (or lack thereof) have caused me more headaches than I can count
- it has to do with numpy data types when native data types are expected

## CV score (multi-weighted log loss at the end of the output) is a good predictor of LB score
- Changing the number of folds improves CV a lot without much impact on LB, so leave it at 5
- Things that are likely to induce overfitting will increase the delta between CV and LB
- Right now the mapping we've seen with the surrounding code is 0.7 --> 1.110, 0.649 --> 1.039, 0.638 --> 1.030
- Our latest estimate is LB = CV + 0.392
- baseline with first seven features is .638

In [None]:
# Force every feature column to float64. Each column is replaced by its
# converted copy: assigning through `.loc[:, col]` sets values in place in
# recent pandas, which can leave an integer column's dtype unchanged.
for cindex in full_train.columns:
    full_train[cindex] = full_train[cindex].astype(np.float64)

# Bind everything except the hyper-parameters, so the CV can be re-run
# with different parameter sets.
eval_func = partial(lgbm_modeling_cross_validation,
                    full_train=full_train,
                    y=y,
                    classes=classes,
                    class_weights=class_weights,
                    nr_fold=5,
                    random_state=1)

best_params.update({'n_estimators': 2000})  # was 1000

# modeling from CV
clfs, score, oof_preds, importances = eval_func(best_params)

In [None]:
from matplotlib import pyplot as plt
# Draw the feature-importance bar chart again from the CV results
# (this also rewrites 'importances.png').
save_importances(importances_=importances)
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Confusion matrix with true labels on the rows.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When True every row is divided by its sum, so each cell shows
        the share of that true class; cells are then printed with two
        decimals instead of as integers.
    title : str
        Plot title.
    cmap
        Matplotlib colormap for the cells.

    Prints which variant is shown. Returns None.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write every cell's value on top of it; white text on cells above
    # half of the maximum, black text otherwise.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color="white" if value > half_max else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Map every distinct label to its position in sorted order, which is the
# column it occupies in the out-of-fold probability matrix.
unique_y = np.unique(y)
class_map = {val: i for i, val in enumerate(unique_y)}
y_map = np.array([class_map[val] for val in y])

# Compute confusion matrix: true position vs. most probable class.
from sklearn.metrics import confusion_matrix
predicted_position = np.argmax(oof_preds, axis=-1)
cnf_matrix = confusion_matrix(y_map, predicted_position)
np.set_printoptions(precision=2)

# Tick labels come from the sample submission header, leaving out its
# first and last columns.
sample_sub = pd.read_csv('../input/PLAsTiCC-2018/sample_submission.csv')
class_names = sample_sub.columns[1:-1].tolist()
del sample_sub
gc.collect()

# Plot normalized confusion matrix
plt.figure(figsize=(12, 12))
foo = plot_confusion_matrix(cnf_matrix,
                            classes=class_names,
                            normalize=True,
                            title='Confusion matrix')

In [None]:

# The submission file is named after the CV score and the current time.
run_stamp = dt.now().strftime('%Y-%m-%d-%H-%M')
filename = 'subm_{:.6f}_{}.csv'.format(score, run_stamp)
print('save to {}'.format(filename))

# TEST
# The test features are staged in `filename` and the predictions are
# accumulated in 'predictions.csv'.
process_test(clfs=clfs,
             testdf=testdf,
             full_train=full_train,
             train_mean=train_mean,
             filename=filename,
             chunks=40615)

pdf = pd.read_csv('predictions.csv')

In [None]:



# get a list of columns
# Put object_id first, keeping the other columns in their current order.
cols = list(pdf)
cols.remove('object_id')
cols.insert(0, 'object_id')
pdf = pdf.loc[:, cols]

feats = [
    'class_6', 'class_15', 'class_16', 'class_42', 'class_52',
    'class_53', 'class_62', 'class_64', 'class_65', 'class_67',
    'class_88', 'class_90', 'class_92', 'class_95',
]

# Row-wise summary of the known-class probabilities, the input of GenUnknown.
# NOTE(review): this rebinds `y`, which held the training labels until here;
# re-running earlier cells after this one will pick up this frame instead.
class_probs = pdf[feats]
y = pd.DataFrame({
    'mymean': class_probs.mean(axis=1),
    'mymedian': class_probs.median(axis=1),
    'mymax': class_probs.max(axis=1),
})

# Replace the provisional class_99 written per chunk.
pdf['class_99'] = GenUnknown(y)

# Overwrites the staged feature file with the final submission.
pdf.to_csv(filename, index=False)

