In [1]:
import os
import numpy as np
import pandas as pd
import time
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

## Test baseline model

In [52]:
print("Loading article titles and their labels.")
articles_info_df = pd.read_csv('input/train_v2.csv', index_col='article_id')
article_titles = articles_info_df['title'].tolist()
article_classes = articles_info_df['category'].tolist()

print("Constructing TF-IDF matrix for articles.")
x = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                    ngram_range=(1,3), max_features=3500).fit_transform(article_titles)
# x = TfidfVectorizer.fit_transform(article_titles)
y = np.array(article_classes)

print("Dimension of TF-IDF matrix: ", x.shape)
print("Start training classifier...")

# Split training and validation datasets
# x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2, random_state=12, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .2, random_state=12)
#model = MultinomialNB().fit(x[train], y[train])
model1 = XGBClassifier(max_depth=5, learning_rate=0.1,
                       n_estimators=140).fit(x_train, y_train)
predicts = model1.predict(x_val)

# print("Precision: %s" %round(precision_score(y_val, predicts, average='macro'), 4))
# print("Recall: %s" %round(recall_score(y_val, predicts, average='macro'), 4))
# print("F1 score: %s" %round(f1_score(y_val, predicts, average='macro'), 4))
print("Accuracy: %s" %round(accuracy_score(y_val, predicts), 4))

Loading article titles and their labels.
Constructing TF-IDF matrix for articles.
Dimension of TF-IDF matrix:  (6027, 3500)
Start training classifier...
Training accuracy:  0.6716417910447762
Accuracy: 0.6716


## Tune max_depth and min_child_weight

In [6]:
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
# y_train_bi = label_binarize(y, classes=[0, 1, 2, 3])
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test1, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch1.fit(x_train, y_train)
print(gsearch1.cv_results_['mean_test_score'])
print(gsearch1.cv_results_['params'])
print(gsearch1.best_params_)
print(gsearch1.best_score_)

[0.63761654 0.63969125 0.63409086 0.64134413 0.6384516  0.63450236
 0.64030958 0.63969212 0.63512843 0.64300432 0.63886396 0.63928019]
[{'max_depth': 3, 'min_child_weight': 1}, {'max_depth': 3, 'min_child_weight': 3}, {'max_depth': 3, 'min_child_weight': 5}, {'max_depth': 5, 'min_child_weight': 1}, {'max_depth': 5, 'min_child_weight': 3}, {'max_depth': 5, 'min_child_weight': 5}, {'max_depth': 7, 'min_child_weight': 1}, {'max_depth': 7, 'min_child_weight': 3}, {'max_depth': 7, 'min_child_weight': 5}, {'max_depth': 9, 'min_child_weight': 1}, {'max_depth': 9, 'min_child_weight': 3}, {'max_depth': 9, 'min_child_weight': 5}]
{'max_depth': 9, 'min_child_weight': 1}
0.6430043153392759


In [7]:
param_test2 = {
    'max_depth': [8, 9, 10, 12],
    'min_child_weight': [0.1, 0.5, 1, 2]
}
start_time = time.time()
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test2, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch2.fit(x_train, y_train)
print(gsearch2.cv_results_['mean_test_score'])
print(gsearch2.cv_results_['params'])
print(gsearch2.best_params_)
print(gsearch2.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64424571 0.64154989 0.64403887 0.63906884 0.6415501  0.64072237
 0.64300432 0.63989743 0.63988969 0.64362866 0.64383376 0.63844858
 0.64217229 0.64030808 0.64611314 0.64384171]
[{'max_depth': 8, 'min_child_weight': 0.1}, {'max_depth': 8, 'min_child_weight': 0.5}, {'max_depth': 8, 'min_child_weight': 1}, {'max_depth': 8, 'min_child_weight': 2}, {'max_depth': 9, 'min_child_weight': 0.1}, {'max_depth': 9, 'min_child_weight': 0.5}, {'max_depth': 9, 'min_child_weight': 1}, {'max_depth': 9, 'min_child_weight': 2}, {'max_depth': 10, 'min_child_weight': 0.1}, {'max_depth': 10, 'min_child_weight': 0.5}, {'max_depth': 10, 'min_child_weight': 1}, {'max_depth': 10, 'min_child_weight': 2}, {'max_depth': 12, 'min_child_weight': 0.1}, {'max_depth': 12, 'min_child_weight': 0.5}, {'max_depth': 12, 'min_child_weight': 1}, {'max_depth': 12, 'min_child_weight': 2}]
{'max_depth': 12, 'min_child_weight': 1}
0.6461131418975062
Elapsed time: %s seconds... 343.8405


## Tune gamma

In [8]:
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
start_time = time.time()
gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test3, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch3.fit(x_train, y_train)
print(gsearch3.cv_results_['mean_test_score'])
print(gsearch3.cv_results_['params'])
print(gsearch3.best_params_)
print(gsearch3.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64134413 0.64134564 0.64196998 0.64176251 0.64155461]
[{'gamma': 0.0}, {'gamma': 0.1}, {'gamma': 0.2}, {'gamma': 0.3}, {'gamma': 0.4}]
{'gamma': 0.2}
0.6419699764009
Elapsed time: %s seconds... 60.0255


## Tune colsample_bytree and subsample

In [9]:
param_test4 = {
    'subsample':[i/10.0 for i in range(6, 10)],
    'colsample_bytree':[i/10.0 for i in range(6, 10)]
}
start_time = time.time()
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test4, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch4.fit(x_train, y_train)
print(gsearch4.cv_results_['mean_test_score'])
print(gsearch4.cv_results_['params'])
print(gsearch4.best_params_)
print(gsearch4.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64010812 0.64176273 0.6400948  0.6425954  0.63989893 0.6394855
 0.63906326 0.63781823 0.63947754 0.64093328 0.64134413 0.63989313
 0.63906648 0.63907056 0.64176293 0.63968631]
[{'colsample_bytree': 0.6, 'subsample': 0.6}, {'colsample_bytree': 0.6, 'subsample': 0.7}, {'colsample_bytree': 0.6, 'subsample': 0.8}, {'colsample_bytree': 0.6, 'subsample': 0.9}, {'colsample_bytree': 0.7, 'subsample': 0.6}, {'colsample_bytree': 0.7, 'subsample': 0.7}, {'colsample_bytree': 0.7, 'subsample': 0.8}, {'colsample_bytree': 0.7, 'subsample': 0.9}, {'colsample_bytree': 0.8, 'subsample': 0.6}, {'colsample_bytree': 0.8, 'subsample': 0.7}, {'colsample_bytree': 0.8, 'subsample': 0.8}, {'colsample_bytree': 0.8, 'subsample': 0.9}, {'colsample_bytree': 0.9, 'subsample': 0.6}, {'colsample_bytree': 0.9, 'subsample': 0.7}, {'colsample_bytree': 0.9, 'subsample': 0.8}, {'colsample_bytree': 0.9, 'subsample': 0.9}]
{'colsample_bytree': 0.6, 'subsample': 0.9}
0.6425953996668066


In [10]:
param_test5 = {
    'subsample':[i/10.0 for i in range(3, 7)],
    'colsample_bytree':[i/10.0 for i in range(8, 11)]
}
start_time = time.time()
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test5, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch5.fit(x_train, y_train)
print(gsearch5.cv_results_['mean_test_score'])
print(gsearch5.cv_results_['params'])
print(gsearch5.best_params_)
print(gsearch5.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64322211 0.64031366 0.63741101 0.63947754 0.64094403 0.63658373
 0.63927566 0.63906648 0.64136155 0.63927695 0.63948292 0.63678753]
[{'colsample_bytree': 0.8, 'subsample': 0.3}, {'colsample_bytree': 0.8, 'subsample': 0.4}, {'colsample_bytree': 0.8, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'subsample': 0.6}, {'colsample_bytree': 0.9, 'subsample': 0.3}, {'colsample_bytree': 0.9, 'subsample': 0.4}, {'colsample_bytree': 0.9, 'subsample': 0.5}, {'colsample_bytree': 0.9, 'subsample': 0.6}, {'colsample_bytree': 1.0, 'subsample': 0.3}, {'colsample_bytree': 1.0, 'subsample': 0.4}, {'colsample_bytree': 1.0, 'subsample': 0.5}, {'colsample_bytree': 1.0, 'subsample': 0.6}]
{'colsample_bytree': 0.8, 'subsample': 0.3}
0.6432221057615536


 ## Tuning Regularization Parameters

In [12]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
start_time = time.time()
gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test6, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch6.fit(x_train, y_train)
print(gsearch6.cv_results_['mean_test_score'])
print(gsearch6.cv_results_['params'])
print(gsearch6.best_params_)
print(gsearch6.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64134413 0.63989528 0.63989593 0.63844536 0.45239661]
[{'reg_alpha': 1e-05}, {'reg_alpha': 0.01}, {'reg_alpha': 0.1}, {'reg_alpha': 1}, {'reg_alpha': 100}]
{'reg_alpha': 1e-05}
0.6413441267225569
Elapsed time: %s seconds... 53.7113


In [13]:
param_test7 = {
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
start_time = time.time()
gsearch7 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
                                                  min_child_weight=1, gamma=0, subsample=0.8, 
                                                  colsample_bytree=0.8, objective='multi:softprob',
                                                  nthread=4, scale_pos_weight=1, seed=27),
                        param_grid = param_test7, n_jobs=4, scoring='accuracy', iid=False, cv=5)
gsearch7.fit(x_train, y_train)
print(gsearch7.cv_results_['mean_test_score'])
print(gsearch7.cv_results_['params'])
print(gsearch7.best_params_)
print(gsearch7.best_score_)
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

[0.64134413 0.64197041 0.64010211 0.63989528 0.63927138]
[{'reg_alpha': 0}, {'reg_alpha': 0.001}, {'reg_alpha': 0.005}, {'reg_alpha': 0.01}, {'reg_alpha': 0.05}]
{'reg_alpha': 0.001}
0.6419704077305247
Elapsed time: %s seconds... 63.778


## Apply classifier with tuned parameters

In [20]:
## Optimized parameters
opt_max_depth = 9
opt_min_child_weight = 1
opt_gamma = 0.2
opt_colsample_bytree = 0.6
opt_subsample = 0.9
opt_reg_alpha = 0.001

In [34]:
start_time = time.time()
model2 = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=opt_max_depth, 
                       min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample,
                       colsample_bytree=opt_colsample_bytree, reg_alpha=opt_reg_alpha,
                       nthread=4, scale_pos_weight=1, seed=27).fit(x_train, y_train)
predicts = model2.predict(x_val)
print("Precision: %s" %round(precision_score(y_val, predicts, average='macro'), 4))
print("Recall: %s" %round(recall_score(y_val, predicts, average='macro'), 4))
print("F1 score: %s" %round(f1_score(y_val, predicts, average='macro'), 4))
print("Accuracy: %s" %round(accuracy_score(y_val, predicts), 4))
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

Precision: 0.7535
Recall: 0.5396
F1 score: 0.604
Accuracy: 0.6542
Elapsed time: %s seconds... 8.9601


## Predict on test dataset

In [55]:
test_articles_df = pd.read_csv('input/test_v2.csv', index_col='article_id')
test_articles_df.head()

Unnamed: 0_level_0,title,url,publisher,hostname,timestamp
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,White House plays down speedy role for US natu...,http://www.thestar.com.my/News/World/2014/03/0...,The Star Online,www.thestar.com.my,1390000000000.0
2,Asian Stocks Broadly Higher After Selloff,http://www.nasdaq.com/article/asian-stocks-bro...,NASDAQ,www.nasdaq.com,1390000000000.0
3,Herbalife Ltd. (HLF) Probe Earns Bill Ackman B...,http://www.valuewalk.com/2014/03/herbalife-ltd...,ValueWalk,www.valuewalk.com,1390000000000.0
4,BOE to Get Fourth Deputy Governor as Carney Fi...,http://www.businessweek.com/news/2014-03-11/bo...,Businessweek,www.businessweek.com,1390000000000.0
5,Pilots get scrutiny,http://www.dispatch.com/content/stories/nation...,Columbus Dispatch,www.dispatch.com,1400000000000.0


In [53]:
x_test = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3),
                         max_features=3500).fit_transform(test_articles_df['title'].tolist())
test_predicts = model1.predict(x_test)
test_predicts_df = pd.DataFrame(data=test_predicts, columns=['category'], index=test_articles_df.index)
test_predicts_df.to_csv('title_prediction.csv')

In [56]:
np.bincount(test_predicts)

array([  61,  481,  168,   73, 3043])

## Reproduce existing model

In [65]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=3500)
tfidf_vect_ngram.fit(articles_info_df["title"])
train_tfidf_ngram = tfidf_vect_ngram.transform(articles_info_df["title"])
test_tfidf_ngram = tfidf_vect_ngram.transform(test_articles_df["title"])
# model3 = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram.to_csc(),
#                                                                              articles_info_df['category'])
model3 = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(x_train, y_train)
predicts = model3.predict(x_train)
print("Accuracy on training set %s" %round(accuracy_score(y_train, predicts), 4))
predicts = model3.predict(x_val)
print("Accuracy on validation set %s" %round(accuracy_score(y_val, predicts), 4))

Accuracy on training set 0.742
Accuracy on validation set 0.6716


In [68]:
test_articles_df['category'] = model3.predict(test_tfidf_ngram)
test_predicts_df = pd.DataFrame(test_articles_df, columns=['category'])
test_predicts_df.to_csv('../predictions/title_prediction.csv')

In [69]:
np.bincount(test_articles_df['category'])

array([ 645,  217,  738,  260, 1966])