In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the labelled training sample and the unlabelled test sample from CSV
# files located in the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="train_sample.csv")
test = pd.read_csv(filepath_or_buffer="test_sample.csv")

In [None]:
# Preview the first five training rows: features X1..X7 plus the `class` label.
train.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,class
0,0.746933,0.161885,0.140955,0.583561,0.144622,0.336816,0.264085,1
1,0.685185,0.543761,0.123296,0.692514,0.291902,0.0054,0.403936,0
2,0.245476,0.051729,0.269197,0.864631,0.055845,0.226046,0.643894,0
3,0.284447,0.052023,0.19453,0.554029,0.204384,0.674742,0.584669,0
4,0.451972,0.858497,0.320467,0.665244,0.427009,0.662678,0.834946,1


In [None]:
# Preview the first five test rows: same X1..X7 features, no `class` column.
test.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7
0,0.703602,0.291032,0.973082,0.11122,0.486934,0.565759,0.386211
1,0.016245,0.050978,0.476517,0.472171,0.686417,0.113717,0.655804
2,0.452641,0.201385,0.352279,0.938298,0.525925,0.171441,0.857071
3,0.628356,0.267784,0.186499,0.208748,0.780065,0.097353,0.475235
4,0.849251,0.957766,0.093711,0.256294,0.870455,0.068927,0.320942


In [None]:
# Separate the target from the predictors: `class` is the label, every other
# column of `train` is kept as a feature.
y_train = train['class']
X_train = train.drop(columns='class')

In [None]:
# Booster configuration shared by the cross-validation run and the final fit:
# binary logistic objective, evaluated with AUC.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

# 5-fold cross-validation over 100 boosting rounds with a fixed seed.
# The result is indexed by round and is used below to pick the round count.
cross_val = xgb.cv(
    param,
    xgb.DMatrix(X_train, label=y_train),
    num_boost_round=100,
    nfold=5,
    seed=0,
)

In [None]:
# `xgb.cv` returns one row per boosting round, so row 0 holds the score after
# ONE round. `np.argmax` yields the 0-based row position of the best mean test
# AUC; add 1 to turn that position into a round COUNT. Without the +1 the
# value passed as `num_boost_round` trains one tree fewer than the optimum
# found by CV (and zero trees when the very first round is the best one).
auc_means = cross_val['test-auc-mean'].values
best_rounds = int(np.argmax(auc_means)) + 1
print(best_rounds, auc_means.max())

24 0.9393060705367728


In [None]:
# Fit the final booster on the full training set, using the same parameters as
# the cross-validation run and the round count selected from it.
model = xgb.train(
    params=param,
    dtrain=xgb.DMatrix(X_train, label=y_train),
    num_boost_round=best_rounds,
)

# Score every test row with the fitted booster; `y_pred` is what gets written
# to the submission file further down.
y_pred = model.predict(xgb.DMatrix(test))

In [None]:
# NOTE: this rebinds `model`. From here on it refers to the random forest, not
# the XGBoost booster trained above (whose predictions live on in `y_pred`).
# The fixed random_state keeps the forest reproducible across runs.
model = RandomForestClassifier(random_state=0)
model.fit(X=X_train, y=y_train)

In [None]:
# Display the fitted random forest's feature importances: one value per
# column of X_train, in column order.
model.feature_importances_

array([0.57836221, 0.06427951, 0.06629757, 0.07772673, 0.06671917,
       0.07818515, 0.06842967])

In [None]:
# Label each importance with its 1-based feature number (position i in
# X_train's column order -> label i), then rank the labels from most to least
# important. The label range is derived from the importance vector itself
# rather than hard-coded, so the cell keeps working if the number of features
# changes.
importances = model.feature_importances_
t = pd.Series(importances, index=range(1, len(importances) + 1))
rf_most_important = t.sort_values(ascending=False).index
rf_most_important

Int64Index([1, 6, 4, 7, 5, 3, 2], dtype='int64')

In [None]:
# Write the submission archive with two entries:
#   rf_most_important - label of the top-ranked random-forest feature
#   prediction        - the XGBoost scores for the test rows
# Keyword order is kept as-is because it determines the order of the archive
# members.
np.savez(
    'submission.npz',
    rf_most_important=rf_most_important[0],
    prediction=y_pred,
)

In [None]:
# Sanity check: reopen the archive and print every stored array by name.
# `np.load` on an .npz returns a lazily-read archive object that keeps the
# file open; the `with` block closes it deterministically so a re-run of the
# save cell is never blocked by a lingering handle.
with np.load('submission.npz') as data:
    for item in data.files:
        print(item)
        print(data[item])

rf_most_important
1
prediction
[0.994103   0.01267427 0.9957781  0.9112735  0.87001777 0.06064529
 0.01179479 0.00875699 0.9909697  0.04080543 0.99591714 0.9782938
 0.00567097 0.9949314  0.1803479  0.01792809 0.9983175  0.02464544
 0.99408567 0.99678934 0.00309285 0.99543744 0.9929877  0.7400381
 0.91603863 0.9882901  0.14201933 0.98458004 0.07420897 0.5437598
 0.89169204 0.99608386 0.01646675 0.99687517 0.17823173 0.98641336
 0.03268986 0.02283901 0.996741   0.61895025 0.9604924  0.98894626
 0.43994385 0.12661028 0.21009548 0.05027382 0.9944013  0.06243235
 0.9938677  0.9813793  0.181149   0.97952497 0.06839682 0.00664827
 0.0342198  0.9975153  0.9108474  0.98865885 0.25719285 0.99223644
 0.97787696 0.03898548 0.9601598  0.98909163 0.19721024 0.0089342
 0.855233   0.9928295  0.8348242  0.9975756  0.9941695  0.95225513
 0.00261485 0.9813571  0.5172491  0.0056642  0.11035867 0.04053485
 0.9950759  0.01500657 0.9928197  0.92088705 0.01873994 0.9943013
 0.9797499  0.05535256 0.9883463  0.

In [None]:
# Final check of the submission contents: the chosen feature label and the
# shape of the prediction vector. The archive is opened in a `with` block so
# its file handle is released as soon as the values have been printed.
with np.load('submission.npz') as npzfile:
    print(npzfile['rf_most_important'], npzfile['prediction'].shape)

1 (500,)
