In [1]:
import os
import numpy as np
import pandas as pd
import time
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

# Prepare article bodies for training and test

In [6]:
print("Loading article bodies for training...")
train_articles_dir = '../output'
train_articles_df = pd.read_csv('input/train_v2.csv', index_col='article_id')
train_article_ids = []
train_article_bodies = []
train_article_classes = []
for article in os.listdir(train_articles_dir):
    article_id = article.split('.')[0]
    if article_id.isdigit():
        with open(os.path.join(train_articles_dir, article), 'r') as f:
            paragraphs = f.readlines()
        f.close()
        if paragraphs:
            train_article_ids.append(article_id)
            train_article_bodies.append(' '.join(paragraphs))
            train_article_classes.append(train_articles_df.loc[int(article_id), 'category'])
        
print("%s articles loaded for training." %len(train_article_ids))

Loading article bodies for training...
4566 articles loaded for training.


### Split into training and validation dataset

In [8]:
train_article_bodies = np.array(train_article_bodies)
train_article_classes = np.array(train_article_classes)
x_train, x_val, y_train, y_val = train_test_split(train_article_bodies, train_article_classes, test_size = 0.2)

# Build baseline model

In [11]:
print("Constructing TF-IDF matrix for articles.")
start_time = time.time()
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=3500)
tfidf_vect_ngram.fit(train_article_bodies)
train_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
val_tfidf_ngram = tfidf_vect_ngram.transform(x_val)
base_model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, y_train)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(y_train, predicts), 4))
predicts = base_model.predict(val_tfidf_ngram)
print("Accuracy on validation set %s" %round(accuracy_score(y_val, predicts), 4))
print("Elapsed time: %s seconds...", round(time.time() - start_time, 4))

Constructing TF-IDF matrix for articles.
Accuracy on training set 0.8883
Accuracy on validation set 0.6521
Elapsed time: %s seconds... 84.1054


# Predict on test data

In [18]:
test_articles_dir = '../output_test'
test_articles_df = pd.read_csv('input/test_v2.csv', index_col='article_id')
test_article_ids = []
test_article_bodies = []
test_article_classes = []
for article in os.listdir(test_articles_dir):
    article_id = article.split('.')[0]
    if article_id.isdigit():
        with open(os.path.join(test_articles_dir, article), 'r') as f:
            paragraphs = f.readlines()
        f.close()
        if paragraphs:
            test_article_ids.append(article_id)
            test_article_bodies.append(' '.join(paragraphs))
        
print("%s articles loaded for training." %len(test_article_ids))

2893 articles loaded for training.


In [19]:
train_article_bodies = np.array(train_article_bodies)
train_article_classes = np.array(train_article_classes)

print("Constructing TF-IDF matrix for articles.")
train_tfidf_ngram = tfidf_vect_ngram.transform(train_article_bodies)

start_time = time.time()
model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, train_article_classes)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(train_article_classes, predicts), 4))

Constructing TF-IDF matrix for articles.
Accuracy on training set 0.841


In [20]:
test_tfidf_ngram = tfidf_vect_ngram.transform(test_article_bodies)
test_predicts = model.predict(test_tfidf_ngram)
print(test_predicts)
np.bincount(test_predicts)

[4 0 4 ... 2 2 1]


array([ 224,  115,  716,  155, 1683])

In [22]:
prob_predicts = model.predict_proba(test_tfidf_ngram)
prob_predicts_df = pd.DataFrame(data=prob_predicts, index=test_article_ids)
prob_predicts_df.to_csv('../predictions/article_body_prediction.csv')
prob_predicts_df.head()

Unnamed: 0,0,1,2,3,4
3644,0.102493,0.043537,0.223313,0.058132,0.572524
1053,0.947547,0.00246,0.015113,0.004086,0.030794
1735,0.051891,0.021625,0.26405,0.09848,0.563954
3122,0.022754,0.010115,0.723289,0.035783,0.208059
2228,0.063622,0.033124,0.493486,0.022314,0.387455


# Late fusion
Combine the prediction from three classifiers

In [42]:
title_prob_df = pd.read_csv('../predictions/title_prediction.csv', index_col='article_id')
feature_prob_df = pd.read_csv('../predictions/features_prediction.csv', index_col='article_id')
article_body_prob_df = prob_predicts_df.copy()
print(title_prob_df.shape)
print(feature_prob_df.shape)
print(article_body_prob_df.shape)

(3826, 5)
(2893, 5)
(2893, 5)


In [40]:
matched_title_prob_df = title_prob_df.loc[feature_prob_df.index]
matched_ids = title_prob_df.index.isin(matched_title_prob_df.index)
unmatched_title_prob_df = title_prob_df[~matched_ids]
print(matched_title_prob_df.shape)
print(unmatched_title_prob_df.shape)

(2893, 5)
(933, 5)


In [33]:
## Generate multiple sets of weights for text, description, and hashtags
w_title_range = np.arange(0.0, 0.6, 0.01)
w_article_body_range = np.arange(0, 0.4, 0.01)
weight_sets = []

for w_article_body in w_article_body_range:
    for w_title in w_title_range:
        weight_sets.append([w_title, w_article_body, 1 - w_title - w_article_body])
weight_sets[:5]

[[0.0, 0.0, 1.0],
 [0.01, 0.0, 0.99],
 [0.02, 0.0, 0.98],
 [0.03, 0.0, 0.97],
 [0.04, 0.0, 0.96]]

In [41]:
## Search for optimal weights to be assigned to individual classifier
y_num = np.array([int(cls[:-1]) for cls in y])
accuracies = []
for i in range(len(weight_sets)):
    w_title, w_article_body, w_features = weight_sets[i]
    combined_prob = w_title * matched_title_prob_df.values + w_article_body * article_body_prob_df.values + w_features * feature_prob_df.values
    combined_pred = np.apply_along_axis(np.argmax, 1, combined_prob)

	avg_p = accuracy_score(y_num, combined_pred)
	avg_r = recall_score(y_num, combined_pred, average='macro')
	avg_f1 = f1_score(y_num, combined_pred, average='macro')

	precisions.append(avg_p)
	recalls.append(avg_r)
	f1_scores.append(avg_f1)


array([[0.2538004 , 0.03950063, 0.21367529, 0.02257236, 0.47045133],
       [0.07819359, 0.15902844, 0.2682253 , 0.06086434, 0.43368834],
       [0.07713429, 0.04451456, 0.24779618, 0.06003981, 0.57051516],
       ...,
       [0.0265868 , 0.02076217, 0.08952271, 0.67264193, 0.19048634],
       [0.02334936, 0.6398452 , 0.11418417, 0.03444034, 0.18818094],
       [0.8091364 , 0.01016325, 0.07695226, 0.01566792, 0.08808017]])