In [1]:
import pandas as pd

In [2]:
from ast import literal_eval

In [3]:
sports_df = pd.read_csv('../data/sports_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 2)
politics_df = pd.read_csv('../data/politics_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 1)
other_df = pd.read_csv('../data/other_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 0)

In [4]:
intersection = sports_df.merge(politics_df, on='id')[['id']].assign(intersection=lambda x: 1)

In [5]:
df = pd.concat([sports_df.merge(intersection, how='left').query('intersection != intersection').drop('intersection', axis=1),
                politics_df.merge(intersection, how='left').query('intersection != intersection').drop('intersection', axis=1),
                other_df],
               axis=0)[['title', 'body', 'source', 'event_list', 'person_list', 'y']].assign(dummy=lambda x: 1)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
df.body = df.body.map(lambda x: x[:100])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), 
                                                    df.y, 
                                                    test_size=0.33, 
                                                    random_state=42, 
                                                    shuffle=True, stratify=df.y)

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
import utils_sklearn as utils

In [12]:
feature_pipelines = {'body': make_pipeline(utils.ColumnExtractor('body'), TfidfVectorizer(max_df=0.75, min_df=10)),
                     'title': make_pipeline(utils.ColumnExtractor('title'), TfidfVectorizer(max_df=0.75, min_df=10)),
                     'source': make_pipeline(utils.ColumnExtractor(['source', 'dummy']), OneHotEncoder(handle_unknown='ignore')),
                     'person': make_pipeline(utils.ColumnExtractor('person_list'), utils.MostCommonEntity()),
                     'event': make_pipeline(utils.ColumnExtractor('event_list'), utils.MostCommonEntity())}

In [13]:
def make_features(feature_list):
    return FeatureUnion([(feat, feature_pipelines[feat]) for feat in feature_list]) 

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
import numpy as np

In [18]:
%%time
feature_list = ['body', 'title', 'source', 'person', 'event']
for i in range(1, len(feature_list) + 1):
    n_col = make_features(feature_list[:i]).fit_transform(X_train).shape[1]
    model = Pipeline([('feature', make_features(feature_list[:i])),
                      ('filter', VarianceThreshold()),
                      ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1E3))])
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    print(f'f1 macro score is {np.mean(scores):0.2f} +- {np.std(scores):0.3f} when using {feature_list[:i]} (~ {n_col} individual features)')

f1 macro score is 0.79 +- 0.002 when using ['body'] (~ 3950 individual features)
f1 macro score is 0.86 +- 0.003 when using ['body', 'title'] (~ 7211 individual features)
f1 macro score is 0.89 +- 0.003 when using ['body', 'title', 'source'] (~ 10187 individual features)
f1 macro score is 0.90 +- 0.002 when using ['body', 'title', 'source', 'person'] (~ 10687 individual features)
f1 macro score is 0.90 +- 0.003 when using ['body', 'title', 'source', 'person', 'event'] (~ 11187 individual features)
CPU times: user 5min 18s, sys: 24.4 s, total: 5min 42s
Wall time: 1min 24s


In [18]:
from lightgbm import LGBMClassifier

In [37]:
model = Pipeline([('feature', make_features(['body', 'title', 'source', 'person'])),
                  ('filter', VarianceThreshold()),
                  ('clf', LGBMClassifier(objective='multiclass', random_state=42))])

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
space = {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.1, 0.05, 0.01],
    'clf__subsample': [1.0, 0.9, 0.8]
    }

grid = GridSearchCV(estimator=model, param_grid=space, cv=5, scoring='f1_macro')

In [41]:
%time search = grid.fit(X_train, y_train)

CPU times: user 3h 58min 21s, sys: 54.3 s, total: 3h 59min 15s
Wall time: 45min 29s


In [51]:
print(f'best f1 macro score is {np.mean(grid.best_score_):0.2f} +- {np.std(grid.best_score_):0.3f} [{grid.best_params_}]')

best f1 macro score is 0.88 +- 0.000 [{'clf__learning_rate': 0.1, 'clf__n_estimators': 300, 'clf__subsample': 1.0}]
