#### Notebook purpose
This notebook aims to do model selection for classical machine learning.

#### Load training and test X & y

In [7]:
import pandas as pd
from ast import literal_eval

In [8]:
data_path = '../../data/'
models_path = '../../models/'

In [9]:
X_train = pd.read_csv(f'{data_path}X_train.csv', converters={'event_list': literal_eval, 'person_list': literal_eval})
X_test = pd.read_csv(f'{data_path}X_test.csv', converters={'event_list': literal_eval, 'person_list': literal_eval})
y_train = pd.read_csv(f'{data_path}y_train.csv').values.ravel()
y_test = pd.read_csv(f'{data_path}y_test.csv').values.ravel()

#### Define feature extraction pipelines
The function make_features returns features extracted from the articles bodies and metadata:
- the body and the title are converted into tfidf scores
- the list of persons and events are onehot encoded (only the 1000 more frequent are kept)
- the source is onehot encode

In [10]:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
import phq_utils.utils_sklearn as utils

In [12]:
feature_pipelines = {'body': make_pipeline(utils.ColumnExtractor('body'), TfidfVectorizer(max_df=0.75, min_df=10)),
                     'title': make_pipeline(utils.ColumnExtractor('title'), TfidfVectorizer(max_df=0.75, min_df=10)),
                     'source': make_pipeline(utils.ColumnExtractor(['source', 'dummy']), # adding dummy is just a hack to avoid OneHotEncoder compalining
                                             OneHotEncoder(handle_unknown='ignore')), 
                     'person': make_pipeline(utils.ColumnExtractor('person_list'), utils.MostCommonEntity(1000)),
                     'event': make_pipeline(utils.ColumnExtractor('event_list'), utils.MostCommonEntity(1000))}

In [13]:
def make_features(feature_list):
    return FeatureUnion([(feat, feature_pipelines[feat]) for feat in feature_list]) 

#### Let's have a look at the evolution of the performance while adding more features

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
import numpy as np

In [17]:
%%time
feature_list = ['body', 'title', 'source', 'person', 'event']
for i in range(1, len(feature_list) + 1):
    n_col = make_features(feature_list[:i]).fit_transform(X_train).shape[1]
    model = Pipeline([('feature', make_features(feature_list[:i])),
                      ('filter', VarianceThreshold()),
                      ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1E3))])
    scores = cross_val_score(model, X_train, y_train.ravel(), cv=5, scoring='f1_macro')
    print(f'f1 macro score is {np.mean(scores):0.2f} +- {np.std(scores):0.3f} when using {", ".join(feature_list[:i])} (~ {n_col} individual features)')

f1 macro score is 0.88 +- 0.004 when using body (~ 7169 individual features)
f1 macro score is 0.90 +- 0.005 when using body, title (~ 10430 individual features)
f1 macro score is 0.91 +- 0.004 when using body, title, source (~ 13406 individual features)
f1 macro score is 0.92 +- 0.004 when using body, title, source, person (~ 14406 individual features)
f1 macro score is 0.92 +- 0.004 when using body, title, source, person, event (~ 15406 individual features)
CPU times: user 9min 9s, sys: 19.2 s, total: 9min 28s
Wall time: 2min 14s


#### Let's try grid search with a boosting model

In [18]:
from lightgbm import LGBMClassifier

In [19]:
model = Pipeline([('feature', make_features(['body', 'title', 'source', 'person'])),
                  ('filter', VarianceThreshold()),
                  ('clf', LGBMClassifier(objective='multiclass', random_state=42, num_threads=28))]) # love my AMD Ryzen Threadripper..

In [20]:
from sklearn.model_selection import GridSearchCV

In [32]:
space = {
    'clf__n_estimators': [100],
    'clf__learning_rate': [0.1, 0.01],
    'clf__subsample': [0.8, 0.4],
    'clf__feature_fraction': [0.8, 0.4]
    }

grid = GridSearchCV(estimator=model, param_grid=space, cv=5, scoring='f1_macro')

In [33]:
%time search = grid.fit(X_train, y_train)

CPU times: user 2h 24min 17s, sys: 2min 11s, total: 2h 26min 28s
Wall time: 12min 6s


In [35]:
argmax_str = ', '.join([str(k.split('__')[-1]) + '=' + str(v) for k, v in grid.best_params_.items()])
print(f'best f1 macro score is {np.mean(grid.best_score_):0.2f} +- {np.std(grid.best_score_):0.3f} with {argmax_str}')

best f1 macro score is 0.90 +- 0.000 with feature_fraction=0.4, learning_rate=0.1, n_estimators=100, subsample=0.8


#### Serialize best models

In [27]:
from sklearn.externals import joblib

In [36]:
best_logreg1 = Pipeline([('feature', make_features(['body', 'title', 'source', 'person'])),
                         ('filter', VarianceThreshold()),
                         ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1E3))])

best_logreg2 = Pipeline([('feature', make_features(['body', 'title', 'source', 'person', 'event'])),
                         ('filter', VarianceThreshold()),
                         ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1E3))])

best_boosting = Pipeline([('feature', make_features(['body', 'title', 'source', 'person'])),
                  ('filter', VarianceThreshold()),
                  ('clf', LGBMClassifier(objective='multiclass', random_state=42, n_estimators=100, learning_rate=0.1, feature_fraction=0.4, subsample=0.8))])

In [37]:
best_logreg1.fit(X_train, y_train)
best_logreg2.fit(X_train, y_train)
best_boosting.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('feature', FeatureUnion(n_jobs=None,
       transformer_list=[('body', Pipeline(memory=None,
     steps=[('columnextractor', <phq_utils.utils_sklearn.ColumnExtractor object at 0x7fe0851e3860>), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        ...   reg_lambda=0.0, silent=True, subsample=0.8,
        subsample_for_bin=200000, subsample_freq=0))])

In [38]:
joblib.dump(best_logreg1, f'{models_path}best_logreg1.pkl')
joblib.dump(best_logreg2, f'{models_path}best_logreg2.pkl') 
joblib.dump(best_boosting, f'{models_path}best_boosting.pkl') 

['../../models/best_boosting.pkl']