In [1]:
import os
import numpy as np
import pandas as pd
import time
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)

# Prepare article bodies for training and test

In [3]:
print("Loading article bodies for training...")
train_articles_dir = '../output'
# test_articles_dir = '../output_test'
train_articles_df = pd.read_csv('input/train_v2.csv', index_col='article_id')
train_article_ids = []
train_article_bodies = []
train_article_classes = []
for article in os.listdir(train_articles_dir):
    article_id = article.split('.')[0]
    if article_id.isdigit():
        with open(os.path.join(train_articles_dir, article), 'r') as f:
            paragraphs = f.readlines()
        f.close()
        train_article_ids.append(article_id)
        train_article_bodies.append(' '.join(paragraphs))
        train_article_classes.append(train_articles_df.loc[int(article_id), 'category'])

Loading article bodies for training...


In [4]:
print("Constructing TF-IDF matrix for articles.")
x = TfidfVectorizer().fit_transform(train_article_bodies)
y = np.array(train_article_classes)
print("Dimension of TF-IDF matrix: %s", x.shape)

Constructing TF-IDF matrix for articles.
Dimension of TF-IDF matrix: %s (4678, 27160)


# Build baseline model

In [5]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2)
print("Constructing TF-IDF matrix for articles.")
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=3500)
tfidf_vect_ngram.fit(train_articles_df["title"])
train_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
val_tfidf_ngram = tfidf_vect_ngram.transform(x_val)
base_model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=140).fit(train_tfidf_ngram, y_train)
predicts = base_model.predict(train_tfidf_ngram)
print("Accuracy on training set %s" %round(accuracy_score(y_train, predicts), 4))
predicts = base_model.predict(val_tfidf_ngram)
print("Accuracy on validation set %s" %round(accuracy_score(y_val, predicts), 4))

Constructing TF-IDF matrix for articles.


AttributeError: lower not found