In [1]:
import os
import numpy as np
import pandas as pd
import time
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

# Prepare training and test data

In [2]:
print("Loading article titles and their labels.")
train_articles_df = pd.read_csv('input/train_v2.csv', index_col='article_id')
test_articles_df = pd.read_csv('input/test_v2.csv', index_col='article_id')

x_train, x_val, y_train, y_val = train_test_split(train_articles_df["title"], train_articles_df["category"],
                                                  test_size = 0.2)
x_train = np.array(x_train)
x_val = np.array(x_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

Loading article titles and their labels.


# Build baseline model

In [4]:
print("Constructing TF-IDF matrix for articles.")
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=3500)
tfidf_vect_ngram.fit(train_articles_df["title"])
train_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
val_tfidf_ngram = tfidf_vect_ngram.transform(x_val)
base_model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, y_train)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(y_train, predicts), 4))
predicts = base_model.predict(val_tfidf_ngram)
print("Accuracy on validation set %s" %round(accuracy_score(y_val, predicts), 4))

Constructing TF-IDF matrix for articles.
Accuracy on training set 0.7451
Accuracy on validation set 0.6625


# Parameter Tuning
### max_depth and min_child_weight

In [4]:
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
# y_train_bi = label_binarize(y, classes=[0, 1, 2, 3])
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test1, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch1.fit(train_tfidf_ngram, y_train)
print(gsearch1.cv_results_['mean_test_score'])
print(gsearch1.cv_results_['params'])
print(gsearch1.best_params_)
print(gsearch1.best_score_)

[0.64570588 0.64383736 0.63740668 0.6486098  0.64798438 0.64342092
 0.65213592 0.64881426 0.64425403 0.64819766 0.64529417 0.64072662]
[{'max_depth': 3, 'min_child_weight': 1}, {'max_depth': 3, 'min_child_weight': 3}, {'max_depth': 3, 'min_child_weight': 5}, {'max_depth': 5, 'min_child_weight': 1}, {'max_depth': 5, 'min_child_weight': 3}, {'max_depth': 5, 'min_child_weight': 5}, {'max_depth': 7, 'min_child_weight': 1}, {'max_depth': 7, 'min_child_weight': 3}, {'max_depth': 7, 'min_child_weight': 5}, {'max_depth': 9, 'min_child_weight': 1}, {'max_depth': 9, 'min_child_weight': 3}, {'max_depth': 9, 'min_child_weight': 5}]
{'max_depth': 7, 'min_child_weight': 1}
0.652135918632812


In [6]:
param_test2 = {
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [0.1, 0.5, 1, 2]
}
start_time = time.time()
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test2, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch2.fit(train_tfidf_ngram, y_train)
print(gsearch2.cv_results_['mean_test_score'])
print(gsearch2.cv_results_['params'])
print(gsearch2.best_params_)
print(gsearch2.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64777928 0.64860915 0.64570738 0.64259211 0.64964672 0.65296773
 0.6486098  0.64757116 0.65047336 0.6506804  0.65068169 0.64860808
 0.64985203 0.65296493 0.65213592 0.65068536 0.6508967  0.6496493
 0.64944614 0.65151523]
[{'max_depth': 4, 'min_child_weight': 0.1}, {'max_depth': 4, 'min_child_weight': 0.5}, {'max_depth': 4, 'min_child_weight': 1}, {'max_depth': 4, 'min_child_weight': 2}, {'max_depth': 5, 'min_child_weight': 0.1}, {'max_depth': 5, 'min_child_weight': 0.5}, {'max_depth': 5, 'min_child_weight': 1}, {'max_depth': 5, 'min_child_weight': 2}, {'max_depth': 6, 'min_child_weight': 0.1}, {'max_depth': 6, 'min_child_weight': 0.5}, {'max_depth': 6, 'min_child_weight': 1}, {'max_depth': 6, 'min_child_weight': 2}, {'max_depth': 7, 'min_child_weight': 0.1}, {'max_depth': 7, 'min_child_weight': 0.5}, {'max_depth': 7, 'min_child_weight': 1}, {'max_depth': 7, 'min_child_weight': 2}, {'max_depth': 8, 'min_child_weight': 0.1}, {'max_depth': 8, 'min_child_weight': 0.5}, {'max_depth': 8, 

### gamma

In [7]:
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
start_time = time.time()
gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test3, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch3.fit(train_tfidf_ngram, y_train)
print(gsearch3.cv_results_['mean_test_score'])
print(gsearch3.cv_results_['params'])
print(gsearch3.best_params_)
print(gsearch3.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.6486098  0.65047616 0.64902647 0.64923092 0.64881684]
[{'gamma': 0.0}, {'gamma': 0.1}, {'gamma': 0.2}, {'gamma': 0.3}, {'gamma': 0.4}]
{'gamma': 0.1}
0.6504761649136339
Elapsed time: %s seconds... 238.445


### colsample_bytree and subsample

In [10]:
param_test4 = {
    'subsample':[i/10.0 for i in range(6, 10)],
    'colsample_bytree':[i/10.0 for i in range(6, 10)]
}
start_time = time.time()
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test4, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch4.fit(train_tfidf_ngram, y_train)
print(gsearch4.cv_results_['mean_test_score'])
print(gsearch4.cv_results_['params'])
print(gsearch4.best_params_)
print(gsearch4.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64944011 0.65026934 0.6521385  0.65359036 0.65005821 0.64861497
 0.65068363 0.65192866 0.64529115 0.64653963 0.6486098  0.6486083
 0.64632764 0.65171883 0.64943925 0.64860744]
[{'colsample_bytree': 0.6, 'subsample': 0.6}, {'colsample_bytree': 0.6, 'subsample': 0.7}, {'colsample_bytree': 0.6, 'subsample': 0.8}, {'colsample_bytree': 0.6, 'subsample': 0.9}, {'colsample_bytree': 0.7, 'subsample': 0.6}, {'colsample_bytree': 0.7, 'subsample': 0.7}, {'colsample_bytree': 0.7, 'subsample': 0.8}, {'colsample_bytree': 0.7, 'subsample': 0.9}, {'colsample_bytree': 0.8, 'subsample': 0.6}, {'colsample_bytree': 0.8, 'subsample': 0.7}, {'colsample_bytree': 0.8, 'subsample': 0.8}, {'colsample_bytree': 0.8, 'subsample': 0.9}, {'colsample_bytree': 0.9, 'subsample': 0.6}, {'colsample_bytree': 0.9, 'subsample': 0.7}, {'colsample_bytree': 0.9, 'subsample': 0.8}, {'colsample_bytree': 0.9, 'subsample': 0.9}]
{'colsample_bytree': 0.6, 'subsample': 0.9}
0.6535903587725659
Elapsed time: %s seconds... 689.6296


In [11]:
param_test5 = {
    'subsample':[i/10.0 for i in range(3, 7)],
    'colsample_bytree':[i/10.0 for i in range(8, 11)]
}
start_time = time.time()
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test5, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch5.fit(train_tfidf_ngram, y_train)
print(gsearch5.cv_results_['mean_test_score'])
print(gsearch5.cv_results_['params'])
print(gsearch5.best_params_)
print(gsearch5.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.63927799 0.64238508 0.6463326  0.64529115 0.64072985 0.64425424
 0.64467133 0.64632764 0.63990126 0.64425833 0.6423868  0.6498516 ]
[{'colsample_bytree': 0.8, 'subsample': 0.3}, {'colsample_bytree': 0.8, 'subsample': 0.4}, {'colsample_bytree': 0.8, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'subsample': 0.6}, {'colsample_bytree': 0.9, 'subsample': 0.3}, {'colsample_bytree': 0.9, 'subsample': 0.4}, {'colsample_bytree': 0.9, 'subsample': 0.5}, {'colsample_bytree': 0.9, 'subsample': 0.6}, {'colsample_bytree': 1.0, 'subsample': 0.3}, {'colsample_bytree': 1.0, 'subsample': 0.4}, {'colsample_bytree': 1.0, 'subsample': 0.5}, {'colsample_bytree': 1.0, 'subsample': 0.6}]
{'colsample_bytree': 1.0, 'subsample': 0.6}
0.6498516002925523
Elapsed time: %s seconds... 520.1734


### regularization parameter

In [14]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
start_time = time.time()
gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test6, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch6.fit(train_tfidf_ngram, y_train)
print(gsearch6.cv_results_['mean_test_score'])
print(gsearch6.cv_results_['params'])
print(gsearch6.best_params_)
print(gsearch6.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.6486098  0.64902388 0.65026805 0.64757332 0.48288454]
[{'reg_alpha': 1e-05}, {'reg_alpha': 0.01}, {'reg_alpha': 0.1}, {'reg_alpha': 1}, {'reg_alpha': 100}]
{'reg_alpha': 0.1}
0.6502680470302425
Elapsed time: %s seconds... 228.2328


In [15]:
param_test7 = {
    'reg_alpha':[0, 0.01, 0.05, 0.1, 0.5]
}
start_time = time.time()
gsearch7 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test7, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch7.fit(train_tfidf_ngram, y_train)
print(gsearch7.cv_results_['mean_test_score'])
print(gsearch7.cv_results_['params'])
print(gsearch7.best_params_)
print(gsearch7.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.6486098  0.64902388 0.6523408  0.65026805 0.6477782 ]
[{'reg_alpha': 0}, {'reg_alpha': 0.01}, {'reg_alpha': 0.05}, {'reg_alpha': 0.1}, {'reg_alpha': 0.5}]
{'reg_alpha': 0.05}
0.6523408022249206
Elapsed time: %s seconds... 237.7314


## Apply classifier with tuned parameters

In [6]:
## Optimized parameters
opt_max_depth = 5
opt_min_child_weight = 0.5
opt_gamma = 0.1
opt_colsample_bytree = 0.6
opt_subsample = 0.9
opt_reg_alpha = 0.05

In [7]:
start_time = time.time()
model2 = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=opt_max_depth, 
                       min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample,
                       colsample_bytree=opt_colsample_bytree, reg_alpha=opt_reg_alpha,
                       nthread=4, scale_pos_weight=1, seed=27).fit(train_tfidf_ngram, y_train)
predicts = model2.predict(train_tfidf_ngram)
print("Training accuracy: %s" %round(accuracy_score(y_train, predicts), 4))
predicts = model2.predict(val_tfidf_ngram)
print("Validation accuracy: %s" %round(accuracy_score(y_val, predicts), 4))
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

XGBoostError: b'[08:44:24] /workspace/src/objective/multiclass_obj.cu:61: Check failed: preds.Size() == (static_cast<size_t>(param_.num_class) * info.labels_.Size()) SoftmaxMultiClassObj: label size and pred size does not match\n\nStack trace returned 10 entries:\n[bt] (0) /home/swang/data_env/venv/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7f9150e415cd]\n[bt] (1) /home/swang/data_env/venv/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7f9150e419c8]\n[bt] (2) /home/swang/data_env/venv/xgboost/libxgboost.so(xgboost::obj::SoftmaxMultiClassObj::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0x152) [0x7f9151027aa2]\n[bt] (3) /home/swang/data_env/venv/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7f9150eb81e2]\n[bt] (4) /home/swang/data_env/venv/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f9150e39ab5]\n[bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f919b332e40]\n[bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x2eb) [0x7f919b3328ab]\n[bt] (7) /home/swang/data_env/venv/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2cf) [0x7f919b546cff]\n[bt] (8) /home/swang/data_env/venv/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x8c39) [0x7f919b53dc39]\n[bt] (9) /home/swang/data_env/venv/bin/python3.6(_PyObject_FastCallDict+0xa2) [0x453412]\n\n'

## Predict on test dataset

In [8]:
train_tfidf_ngram = tfidf_vect_ngram.transform(train_articles_df["title"].str.lower())
test_tfidf_ngram = tfidf_vect_ngram.transform(test_articles_df["title"].str.lower())
start_time = time.time()
model3 = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=opt_max_depth, 
                       min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample,
                       colsample_bytree=opt_colsample_bytree, reg_alpha=opt_reg_alpha,
                       nthread=4, scale_pos_weight=1, seed=27).fit(train_tfidf_ngram, train_articles_df["category"])
predicts = model3.predict(train_tfidf_ngram)
print("Training accuracy: %s" %round(accuracy_score(train_articles_df["category"], predicts), 4))
test_predicts = model3.predict(test_tfidf_ngram)
print(test_predicts)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

Training accuracy: 0.7445
[4 4 4 ... 1 4 0]
Elapsed time: %s seconds... 2.6204


In [9]:
np.bincount(test_predicts)

array([ 655,  207,  765,  259, 1940])

# Merge training set with confident test set prediction

In [26]:
prob_predicts = model3.predict_proba(test_tfidf_ngram)
prob_mask = np.amax(prob_predicts, axis=1) > 0.6
selected_test_sample_ids = [article_id[0] for article_id in np.argwhere(prob_mask)]
selected_test_articles = test_articles_df.loc[selected_test_sample_ids]
selected_test_articles['category'] = np.take(test_predicts, selected_test_sample_ids)
id_mapper = {}
for i in selected_test_articles.index:
    id_mapper[i] = 'x' + str(i)
selected_test_articles = selected_test_articles.rename(id_mapper)
selected_test_articles.head()

Unnamed: 0_level_0,title,url,publisher,hostname,timestamp,category
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
x5,Pilots get scrutiny,http://www.dispatch.com/content/stories/nation...,Columbus Dispatch,www.dispatch.com,1400000000000.0,4
x7,Stock futures down slightly; Coca-Cola cuts ex...,http://www.cleveland.com/business/index.ssf/20...,The Plain Dealer,www.cleveland.com,1390000000000.0,4
x8,Latest information on the investigation into t...,http://www.tribtown.com/view/story/db8d72c0706...,The Tribune,www.tribtown.com,1400000000000.0,4
x9,"Citi upgrades JC Penney, says it's a comeback ...",http://bizbeatblog.dallasnews.com/2014/03/citi...,Dallas Morning News \(blog\),bizbeatblog.dallasnews.com,1390000000000.0,4
x12,Official: Gunmen kill Afghan judge and bodyguard,http://www.postbulletin.com/news/world/officia...,Post-Bulletin,www.postbulletin.com,1400000000000.0,4


In [27]:
merged_articles_df = pd.concat([train_articles_df, selected_test_articles])
merged_articles_df.shape

(8500, 6)

In [29]:
kf = KFold(n_splits=10)
title_pred = np.empty([0, 5])
merged_tfidf_ngram = tfidf_vect_ngram.transform(merged_articles_df["title"].str.lower())
merged_articles_labels = merged_articles_df['category']
for train, test in kf.split(merged_tfidf_ngram):
    model = XGBClassifier(max_depth=5, learning_rate=0.1,
                          n_estimators=140).fit(merged_tfidf_ngram[train], merged_articles_labels[train])
    predicts = model.predict_proba(merged_tfidf_ngram[test])
    title_pred = np.concatenate((title_pred, predicts))

title_pred.shape

(8500, 5)

In [31]:
prob_predicts_df = pd.DataFrame(data=title_pred, index=merged_articles_df.index)
prob_predicts_df['category'] = merged_articles_labels
prob_predicts_df.head()

Unnamed: 0_level_0,0,1,2,3,4,category
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.099625,0.24582,0.215641,0.061784,0.377129,4
2,0.121561,0.053441,0.207784,0.133026,0.484187,2
3,0.132974,0.025772,0.25102,0.086864,0.50337,4
4,0.113787,0.033448,0.106611,0.052538,0.693615,4
5,0.264567,0.017151,0.294841,0.050877,0.372564,4


In [32]:
prob_predicts_df.to_csv('../predictions/title_prediction2.csv')