In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, fbeta_score, make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
from sklearn.decomposition import PCA

In [2]:
class Timer(object):
    def __init__ (self):
        self.start_time = time.time()
        self.end_time = 0.0
        self.duration = 0.0
    def stop_timer(self):
        self.end_time = time.time()
        self.duration = self.end_time - self.start_time
    def get_duration(self):
        self.stop_timer()
        return self.duration

In [3]:
train_data = pd.read_csv('data/train_processed.csv')
test_data = pd.read_csv('data/test_processed.csv')

In [4]:
X_train = train_data[['publisher','processed_titles','processed_texts', 'url_status']].copy()
y_train_final = train_data['category'].as_matrix()
X_test = test_data[['publisher','processed_titles','processed_texts', 'url_status']].copy()

In [5]:
title_vectorizer = TfidfVectorizer(strip_accents='unicode')
text_vectorizer = TfidfVectorizer(strip_accents='unicode')
label_encoder = LabelEncoder()
url_status_encoder = OneHotEncoder(handle_unknown='ignore')
publisher_encoder = OneHotEncoder(handle_unknown='ignore')

In [6]:
X_all = pd.concat([X_train.copy(),X_test.copy()])
title_vectorizer.fit(X_all['processed_titles'])
text_vectorizer.fit(X_all['processed_titles'])
url_status_encoder.fit(X_all['url_status'].values.reshape(-1, 1))
publisher_encoder.fit(label_encoder.fit_transform(X_all['publisher']).reshape(-1, 1))
X_all.head()

Unnamed: 0,publisher,processed_titles,processed_texts,url_status
0,NASDAQ,forex pound drop one month low euro,forex pound drop one month low euro nasdaq com...,200
1,Fox Business,hertz exit equip rental busi 2 5b spinoff,,404
2,Resource Investor,gold etf inflow return,,404
3,BGR,hacker call mt gox ceo liar say still control ...,,404
4,Forbes,gold climb to near 6 month high on concern abo...,gold climb to near 6 month high on concern abo...,200


In [7]:
X_train_title = title_vectorizer.transform(X_train['processed_titles']).todense()
X_train_text = text_vectorizer.transform(X_train['processed_texts'].fillna("")).todense()
X_train_publisher = label_encoder.transform(X_train['publisher'])
X_train_publisher = publisher_encoder.transform(X_train_publisher.reshape(-1, 1)).todense()
X_train_url_status = url_status_encoder.transform(X_train['url_status'].values.reshape(-1, 1)).todense()

In [8]:
X_test_title = title_vectorizer.transform(X_test['processed_titles']).todense()
X_test_text = text_vectorizer.transform(X_test['processed_texts'].fillna("")).todense()
X_test_publisher = label_encoder.transform(X_test['publisher'])
X_test_publisher = publisher_encoder.transform(X_test_publisher.reshape(-1, 1)).todense()
X_test_url_status = url_status_encoder.transform(X_test['url_status'].values.reshape(-1, 1)).todense()

In [12]:
X_train_final = np.concatenate([X_train_publisher, X_train_title, X_train_text, X_train_url_status], axis=1)
X_test_final = np.concatenate([X_test_publisher, X_test_title, X_test_text, X_test_url_status], axis=1)

In [13]:
feature_selector = SelectKBest(score_func=chi2,k=5000)
X_train_final = feature_selector.fit_transform(X_train_final,y_train_final)
X_test_final = feature_selector.transform(X_test_final)

In [14]:
#pca = PCA(n_components=5000)
#pca.fit(np.concatenate([X_train_final,X_test_final], axis=0))
#X_train_final = pca.transform(X_train_final)
#X_test_final = pca.transform(X_test_final)

In [15]:
X_train, X_cv, y_train, y_cv = train_test_split(X_train_final, y_train_final)

In [None]:
timer = Timer()
model = XGBClassifier(max_depth=20, silent=True, objective='multi:softmax', num_class=5, learning_rate=0.1, n_estimators=500)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_cv, y_cv)], verbose=True, early_stopping_rounds=20)
print ("Training took {:.2f} seconds". format(timer.get_duration()))

[0]	validation_0-merror:0.238717	validation_1-merror:0.394824
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.230531	validation_1-merror:0.380889
[2]	validation_0-merror:0.220575	validation_1-merror:0.378898
[3]	validation_0-merror:0.213938	validation_1-merror:0.370272
[4]	validation_0-merror:0.213496	validation_1-merror:0.370272
[5]	validation_0-merror:0.205752	validation_1-merror:0.3643
[6]	validation_0-merror:0.199115	validation_1-merror:0.366954
[7]	validation_0-merror:0.190708	validation_1-merror:0.360319
[8]	validation_0-merror:0.182965	validation_1-merror:0.357664
[9]	validation_0-merror:0.18031	validation_1-merror:0.35501
[10]	validation_0-merror:0.171018	validation_1-merror:0.347711
[11]	validation_0-merror:0.162389	validation_1-merror:0.353683
[12]	validation_0-merror:0.159513	validation_1-merror:0.353683
[13]	validation_0-merror:0.15486

In [None]:
y_pred = model.predict(X_cv)
accuracy = accuracy_score(y_cv, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
y_pred = model.predict(X_test_final)
submission = pd.DataFrame(test_data['article_id'])
submission['category'] = y_pred
submission.to_csv('results/shrey_xgboost.csv',index=False)