### Load Modules and Data

In [1]:
import numpy as np
import pandas as pd

import os
import json

from sklearn.model_selection import train_test_split

In [2]:
DATA_IN_PATH = './txt_sim/'
DATA_OUT_PATH = './txt_sim/data_out/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

# Load Train data
train_q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
train_q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
train_labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))

In [9]:
RANDOM_SEED = 42
NUM_ROUNDS = 1000

In [3]:
train_q1_data.shape

(298526, 31)

In [4]:
train_input = np.stack((train_q1_data, train_q2_data), axis=1)
print(train_input.shape)

(298526, 2, 31)


#### Split Train, Validation Data

In [6]:
train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_labels, test_size=0.2, random_state = RANDOM_SEED)

#### Import XGBoost Module

In [7]:
import xgboost as xgb

#### Model Implementation and Training

In [8]:
train_data = xgb.DMatrix(train_input.sum(axis=1), label=train_label)
eval_data = xgb.DMatrix(eval_input.sum(axis=1), label=eval_label)

data_list = [(train_data,'train'),(eval_data,'valid')]

In [10]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metrics'] = 'rmse' # root mean square error

bst = xgb.train(params, train_data, num_boost_round=NUM_ROUNDS, evals=data_list, early_stopping_rounds=10)

Parameters: { "eval_metrics" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.66114	valid-logloss:0.66181
[1]	train-logloss:0.64072	valid-logloss:0.64212
[2]	train-logloss:0.62679	valid-logloss:0.62873
[3]	train-logloss:0.61696	valid-logloss:0.61933
[4]	train-logloss:0.60958	valid-logloss:0.61255
[5]	train-logloss:0.60386	valid-logloss:0.60725
[6]	train-logloss:0.59929	valid-logloss:0.60305
[7]	train-logloss:0.59434	valid-logloss:0.59818
[8]	train-logloss:0.58868	valid-logloss:0.59256
[9]	train-logloss:0.58596	valid-logloss:0.58994
[10]	train-logloss:0.58385	valid-logloss:0.58806
[11]	train-logloss:0.58161	valid-logloss:0.58627
[12]	train-logloss:0.58001	valid-logloss:0.58498
[13]	train-logloss:0.57612	valid-logloss:0.58145
[14]	

#### Load Test Data and Predict

In [11]:
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'), allow_pickle=True)

In [12]:
# Predict
test_input = np.stack((test_q1_data, test_q2_data), axis=1)
test_data = xgb.DMatrix(test_input.sum(axis=1))
test_predict = bst.predict(test_data)

In [14]:
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

output = pd.DataFrame({'test_id':test_id_data, 'is_duplicated':test_predict})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)

#### Submit Output