In [1]:
import os
import numpy as np
import pandas as pd
import time
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

# Prepare article bodies for training and test

In [2]:
print("Loading article bodies for training...")
train_articles_dir = '../output'
train_articles_df = pd.read_csv('input/train_v2.csv', index_col='article_id')
train_article_ids = []
train_article_bodies = []
train_article_classes = []
for article in os.listdir(train_articles_dir):
    article_id = article.split('.')[0]
    if article_id.isdigit():
        with open(os.path.join(train_articles_dir, article), 'r') as f:
            paragraphs = f.readlines()
        f.close()
        if paragraphs:
            train_article_ids.append(article_id)
            train_article_bodies.append(' '.join(paragraphs))
            train_article_classes.append(train_articles_df.loc[int(article_id), 'category'])
        
print("%s articles loaded for training." %len(train_article_ids))

Loading article bodies for training...
4566 articles loaded for training.


### Split into training and validation dataset

In [3]:
train_article_bodies = np.array(train_article_bodies)
train_article_classes = np.array(train_article_classes)
x_train, x_val, y_train, y_val = train_test_split(train_article_bodies, train_article_classes, test_size = 0.2)

# Build baseline model

In [4]:
print("Constructing TF-IDF matrix for articles.")
start_time = time.time()
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=3500)
tfidf_vect_ngram.fit(train_article_bodies)
train_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
val_tfidf_ngram = tfidf_vect_ngram.transform(x_val)
base_model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, y_train)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(y_train, predicts), 4))
predicts = base_model.predict(val_tfidf_ngram)
print("Accuracy on validation set %s" %round(accuracy_score(y_val, predicts), 4))
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

Constructing TF-IDF matrix for articles.
Accuracy on training set 0.8927
Accuracy on validation set 0.6357
Elapsed time: %s seconds... 60.4177


# Predict on test data

In [5]:
test_articles_dir = '../output_test'
test_articles_df = pd.read_csv('input/test_v2.csv', index_col='article_id')
test_article_ids = []
test_article_bodies = []
test_article_classes = []
for article in os.listdir(test_articles_dir):
    article_id = article.split('.')[0]
    if article_id.isdigit():
        with open(os.path.join(test_articles_dir, article), 'r') as f:
            paragraphs = f.readlines()
        f.close()
        if paragraphs:
            test_article_ids.append(article_id)
            test_article_bodies.append(' '.join(paragraphs))
        
print("%s articles loaded for training." %len(test_article_ids))

2893 articles loaded for training.


In [6]:
train_article_bodies = np.array(train_article_bodies)
train_article_classes = np.array(train_article_classes)

print("Constructing TF-IDF matrix for articles.")
train_tfidf_ngram = tfidf_vect_ngram.transform(train_article_bodies)

start_time = time.time()
model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, train_article_classes)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(train_article_classes, predicts), 4))

Constructing TF-IDF matrix for articles.
Accuracy on training set 0.8412


In [7]:
test_tfidf_ngram = tfidf_vect_ngram.transform(test_article_bodies)
test_predicts = model.predict(test_tfidf_ngram)
print(test_predicts)
np.bincount(test_predicts)

[2 4 0 ... 4 4 4]


array([ 224,  115,  716,  155, 1683])

In [8]:
prob_predicts = model.predict_proba(test_tfidf_ngram)
prob_predicts_df = pd.DataFrame(data=prob_predicts, index=test_article_ids)
#prob_predicts_df.to_csv('../predictions/article_body_prediction.csv')
prob_predicts_df.head()

Unnamed: 0,0,1,2,3,4
1768,0.013928,0.005259,0.934608,0.037846,0.00836
3667,0.06689,0.164792,0.193029,0.045876,0.529412
2339,0.892867,0.002526,0.040985,0.006861,0.056761
2463,0.125468,0.017371,0.494597,0.049553,0.31301
111,0.075928,0.051367,0.139156,0.047051,0.686499


# Merge training set with confident test set prediction

In [13]:
prob_predicts = model.predict_proba(test_tfidf_ngram)
prob_mask = np.amax(prob_predicts, axis=1) > 0.6
selected_test_sample_ids = [article_id[0] for article_id in np.argwhere(prob_mask)]
test_articles_df.head()
# selected_test_articles = test_articles_df.loc[selected_test_sample_ids]
# selected_test_articles['category'] = np.take(test_predicts, selected_test_sample_ids)
# id_mapper = {}
# for i in selected_test_articles.index:
#     id_mapper[i] = 'x' + str(i)
# selected_test_articles = selected_test_articles.rename(id_mapper)
# selected_test_articles.head()

Unnamed: 0_level_0,title,url,publisher,hostname,timestamp
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,White House plays down speedy role for US natu...,http://www.thestar.com.my/News/World/2014/03/0...,The Star Online,www.thestar.com.my,1390000000000.0
2,Asian Stocks Broadly Higher After Selloff,http://www.nasdaq.com/article/asian-stocks-bro...,NASDAQ,www.nasdaq.com,1390000000000.0
3,Herbalife Ltd. (HLF) Probe Earns Bill Ackman B...,http://www.valuewalk.com/2014/03/herbalife-ltd...,ValueWalk,www.valuewalk.com,1390000000000.0
4,BOE to Get Fourth Deputy Governor as Carney Fi...,http://www.businessweek.com/news/2014-03-11/bo...,Businessweek,www.businessweek.com,1390000000000.0
5,Pilots get scrutiny,http://www.dispatch.com/content/stories/nation...,Columbus Dispatch,www.dispatch.com,1400000000000.0


(2893, 5)
(933, 5)


[[0.0, 0.0, 1.0],
 [0.01, 0.0, 0.99],
 [0.02, 0.0, 0.98],
 [0.03, 0.0, 0.97],
 [0.04, 0.0, 0.96]]

array([[0.2538004 , 0.03950063, 0.21367529, 0.02257236, 0.47045133],
       [0.07819359, 0.15902844, 0.2682253 , 0.06086434, 0.43368834],
       [0.07713429, 0.04451456, 0.24779618, 0.06003981, 0.57051516],
       ...,
       [0.0265868 , 0.02076217, 0.08952271, 0.67264193, 0.19048634],
       [0.02334936, 0.6398452 , 0.11418417, 0.03444034, 0.18818094],
       [0.8091364 , 0.01016325, 0.07695226, 0.01566792, 0.08808017]])