In [1]:

# Import libraries
from __future__ import unicode_literals
from __future__ import division
import sys

import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import StandardScaler

def logloss_mc(y_true, y_prob, epsilon=1e-15): #    Multiclass logloss
    
    # normalize
    y_prob = y_prob / y_prob.sum(axis=1).reshape(-1, 1)
    y_prob = np.maximum(epsilon, y_prob)
    y_prob = np.minimum(1 - epsilon, y_prob)
    # get probabilities
    y = [y_prob[i, j] for (i, j) in enumerate(y_true)]
    ll = - np.mean(np.log(y))
    return ll


if __name__ == '__main__':

    print(" - Start.")
    
    testFile="task\\task\\testData.csv"
    trainFile="task\\task\\trainData.csv"
    outFile="solution_swc.csv"

    #read train data
    print(" - Read Train Data.")
    df_train = pd.read_csv(trainFile)
    
    #df_train.reset_index(level=0, inplace=True)
    
    X_TRN = df_train.values.copy()
    np.savetxt(r"c1.csv",X_TRN,delimiter=',')
    np.random.shuffle(X_TRN)
    np.savetxt(r"c2.csv",X_TRN,delimiter=',')
    np.savetxt(r"c3.csv",X_TRN[:, :-1],delimiter=',')
    np.savetxt(r"c4.csv",X_TRN[:, -1],delimiter=',')
    
    #standardize
    X_std = StandardScaler().fit_transform(X_TRN[:, :-1])
    
    
    #make validation set
    print(" - Make Validation Set.")    
    train_size=0.8
    X_train, X_valid, y_train, y_valid = train_test_split(X_std, X_TRN[:, -1], train_size=train_size,)

    #read test data
    print(" - Read Test Data.")     
    df_test = pd.read_csv(testFile)
    #df_test.reset_index(level=0, inplace=True)
    X_t = df_test.values.copy()
    X_TST = StandardScaler().fit_transform(X_t)



 - Start.
 - Read Train Data.
 - Make Validation Set.
 - Read Test Data.


In [2]:
###########################modeling###########################

#xgboost

print(" - Running xgboost.")
param = {}
param['eval_metric'] = 'mlogloss'
param['objective'] = 'reg:linear'
param['subsample'] = 0.8
param['colsample_bytree'] = 0.8
param['silent'] = 1
param['max_depth'] = 7
param['n_estimators']=100
    
dtrain = xgb.DMatrix(X_train, label = y_train)  
my_model = xgb.XGBClassifier(**param)

clf = my_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mlogloss') 


 - Running xgboost.
[0]	validation_0-mlogloss:1.95339	validation_1-mlogloss:1.95731
[1]	validation_0-mlogloss:1.78367	validation_1-mlogloss:1.79242
[2]	validation_0-mlogloss:1.64394	validation_1-mlogloss:1.65693
[3]	validation_0-mlogloss:1.53493	validation_1-mlogloss:1.55074
[4]	validation_0-mlogloss:1.44108	validation_1-mlogloss:1.46042
[5]	validation_0-mlogloss:1.35712	validation_1-mlogloss:1.37909
[6]	validation_0-mlogloss:1.28639	validation_1-mlogloss:1.31055
[7]	validation_0-mlogloss:1.22572	validation_1-mlogloss:1.25228
[8]	validation_0-mlogloss:1.16792	validation_1-mlogloss:1.19689
[9]	validation_0-mlogloss:1.11727	validation_1-mlogloss:1.14837
[10]	validation_0-mlogloss:1.06902	validation_1-mlogloss:1.10262
[11]	validation_0-mlogloss:1.02601	validation_1-mlogloss:1.06191
[12]	validation_0-mlogloss:0.986953	validation_1-mlogloss:1.02495
[13]	validation_0-mlogloss:0.952257	validation_1-mlogloss:0.992293
[14]	validation_0-mlogloss:0.920408	validation_1-mlogloss:0.962217
[15]	valid

In [3]:
evals_result = clf.evals_result()
print (np.argmax(evals_result['validation_1']['mlogloss']))

my_model = my_model.set_params(n_estimators = np.argmax(evals_result['validation_1']['mlogloss']))

y_pred = my_model.predict_proba(X_valid)

encoder = LabelEncoder()
y_true = encoder.fit_transform(y_valid)
score = logloss_mc(y_true, y_pred)
print(" -- Multiclass logloss on xgboost validation set: {:.4f}.".format(score))


0
 -- Multiclass logloss on xgboost validation set: 0.5187.


In [4]:
###########################result calculation###########################

test_predictions = clf.predict_proba(X_TST)

#output result
output = pd.DataFrame(test_predictions)

output.columns = ['c1', 'c2', 'c3', 'c4','c5','c6','c7','c8','c9']
output.to_csv('SK_submission_xgb.csv', index = False)

print(" - XGB Finished.")

 - XGB Finished.


In [5]:
#lightgbm
#try lightgbm

print (" - Preparing lightgbm")

import lightgbm as lgb
from sklearn.model_selection import KFold
def fit_predict(data, y, test, test_sub):
    dtrain = lgb.Dataset(data=data, label=y, free_raw_data=False)
    dtrain.construct()
    
    #oof_preds = np.zeros((data.shape[0],9))
    test_preds = np.zeros((test.shape[0],9))
    test_sub_preds = np.zeros((test_sub.shape[0],9))
    
    lgb_params = {
        "objective" : "multiclass",
        "num_class" : 9,
        "metric" : "multi_logloss",
        "num_leaves": 5,
        "min_data_in_leaf": 5,
        "learning_rate": 0.01,

        "feature_fraction": 1,
        "feature_fraction_seed": 2,

   #     "bagging_fraction": 0.8,
   #     "bagging_freq" : 10,
   #     "bagging_seed" : 42, #2018

        "verbosity" : 1,
#         'lambda_l1' : 10,
#         'lambda_l2' : 10,
#        'max_bin' : 50
    }

    folds = KFold(n_splits=2, shuffle=True, random_state=2)

    counter = 1
    for trn_idx, val_idx in folds.split(data):
        print('----------------------------')
        print('Fold: %d' % counter)

        trn_d = dtrain.subset(trn_idx)
        val_d = dtrain.subset(val_idx)

        clf = lgb.train(
            params=lgb_params,
            train_set=trn_d,
            valid_sets=[trn_d, val_d],
            num_boost_round=5000,
            early_stopping_rounds=50,
            verbose_eval=50
        )


        #oof_preds[val_idx] = clf.predict(dtrain.data[val_idx, :])
        test_preds += clf.predict(test) / folds.n_splits
        test_sub_preds += clf.predict(test_sub) / folds.n_splits
        
        counter += 1

    return test_preds , test_sub_preds


 - Preparing lightgbm


In [6]:
y_train_class_0_8 = y_train-np.ones(y_train.shape)
print(" - LightGBM is Running.")
y_predict, test_predict = fit_predict(X_train, y_train_class_0_8, X_valid, X_TST)
print(" - LightGBM Finished.")

 - LightGBM is Running.
----------------------------
Fold: 1
Training until validation scores don't improve for 50 rounds.
[50]	training's multi_logloss: 1.62801	valid_1's multi_logloss: 1.63036
[100]	training's multi_logloss: 1.35386	valid_1's multi_logloss: 1.35907
[150]	training's multi_logloss: 1.18521	valid_1's multi_logloss: 1.19207
[200]	training's multi_logloss: 1.07044	valid_1's multi_logloss: 1.079
[250]	training's multi_logloss: 0.985587	valid_1's multi_logloss: 0.995872
[300]	training's multi_logloss: 0.923244	valid_1's multi_logloss: 0.934945
[350]	training's multi_logloss: 0.875759	valid_1's multi_logloss: 0.888697
[400]	training's multi_logloss: 0.837948	valid_1's multi_logloss: 0.851899
[450]	training's multi_logloss: 0.807021	valid_1's multi_logloss: 0.821928
[500]	training's multi_logloss: 0.781095	valid_1's multi_logloss: 0.796949
[550]	training's multi_logloss: 0.75908	valid_1's multi_logloss: 0.776008
[600]	training's multi_logloss: 0.740001	valid_1's multi_logloss

[250]	training's multi_logloss: 0.981559	valid_1's multi_logloss: 0.991138
[300]	training's multi_logloss: 0.920085	valid_1's multi_logloss: 0.930947
[350]	training's multi_logloss: 0.872609	valid_1's multi_logloss: 0.884998
[400]	training's multi_logloss: 0.834685	valid_1's multi_logloss: 0.848692
[450]	training's multi_logloss: 0.803574	valid_1's multi_logloss: 0.81922
[500]	training's multi_logloss: 0.777681	valid_1's multi_logloss: 0.794821
[550]	training's multi_logloss: 0.75569	valid_1's multi_logloss: 0.774216
[600]	training's multi_logloss: 0.736464	valid_1's multi_logloss: 0.756288
[650]	training's multi_logloss: 0.71982	valid_1's multi_logloss: 0.741009
[700]	training's multi_logloss: 0.70515	valid_1's multi_logloss: 0.727691
[750]	training's multi_logloss: 0.692075	valid_1's multi_logloss: 0.71593
[800]	training's multi_logloss: 0.680363	valid_1's multi_logloss: 0.705649
[850]	training's multi_logloss: 0.669844	valid_1's multi_logloss: 0.696422
[900]	training's multi_logloss

In [7]:
y_prediction_xgb_and_lgb= (y_predict + y_pred) / 2

score = logloss_mc(y_true, y_prediction_xgb_and_lgb)

print(" -- Multiclass logloss on xgboost and lgb validation set: {:.4f}.".format(score))

 -- Multiclass logloss on xgboost and lgb validation set: 0.5237.


In [8]:
test_prediction_xgb_and_lgb= (test_predictions + test_predict) / 2
#output result
output = pd.DataFrame(test_prediction_xgb_and_lgb)
output.columns = ['c1', 'c2', 'c3', 'c4','c5','c6','c7','c8','c9']
output.to_csv('SK_submission_xgb_lgb.csv', index = False)

print(" - XGB and LGB are Finished.")

 - XGB and LGB are Finished.
