In [293]:
import pandas as pd

In [294]:
from ast import literal_eval

In [295]:
# event_list and person_list are parsed with literal_eval while reading;
# `y` is a constant integer label attached to every row of a file.
LIST_CONVERTERS = {'event_list': literal_eval, 'person_list': literal_eval}


def load_topic(csv_path, label):
    """Read one CSV and attach the constant class label `label` as column `y`."""
    topic_df = pd.read_csv(csv_path, converters=LIST_CONVERTERS)
    return topic_df.assign(y=label)


sports_df = load_topic('../data/sports_10k.csv', 2)
politics_df = load_topic('../data/politics_10k.csv', 1)
other_df = load_topic('../data/other_10k.csv', 0)

In [296]:
# ids that occur in both the sports and the politics frames, tagged with a
# marker column (always 1) that the following left-merges use to spot them.
shared_ids = sports_df.merge(politics_df, on='id')[['id']]
intersection = shared_ids.assign(intersection=1)

In [297]:
def _without_overlap(topic_df):
    """Left-merge against `intersection` and keep only the rows with no match."""
    flagged = topic_df.merge(intersection, how='left')
    # Rows without a partner in `intersection` carry NaN in the marker column.
    unmatched = flagged[flagged['intersection'].isna()]
    return unmatched.drop('intersection', axis=1)


model_columns = ['title', 'body', 'source', 'event_list', 'person_list', 'y']
df = pd.concat(
    [_without_overlap(sports_df), _without_overlap(politics_df), other_df],
    axis=0,
)[model_columns]

In [298]:
from sklearn.preprocessing import MultiLabelBinarizer

In [299]:
from sklearn.model_selection import train_test_split

In [300]:
# Truncate every body value to its first 100 characters
# (assumes every body is a string -- TODO confirm there are no missing bodies).
BODY_PREFIX_CHARS = 100
df['body'] = df['body'].map(lambda text: text[:BODY_PREFIX_CHARS])

In [301]:
# Stratified, seeded 67/33 split of the feature columns against the label `y`.
feature_columns = ['body', 'title', 'source', 'event_list', 'person_list']
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df.y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
    stratify=df.y,
)

In [302]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer

In [303]:
from sklearn.base import clone

In [304]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [305]:
# Token counts followed by TF-IDF re-weighting. This object is a template:
# the feature union below clones it once per text column.
text_pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

In [306]:
from sklearn.decomposition import LatentDirichletAllocation

In [307]:
def _branch(extract, processor):
    """Build a two-step pipeline: pull one column out of the frame, then encode it."""
    return Pipeline([
        ('extract', FunctionTransformer(extract, validate=False)),
        ('process', processor),
    ])


# One branch per input column; FeatureUnion combines the branch outputs.
# NOTE(review): MyLabelBinarizer / MyMultiLabelBinarizer are defined in a later
# cell of this notebook, so that cell has to be executed before this one.
pipeline = FeatureUnion([
    ('body_pipeline', _branch(lambda X: X.body, clone(text_pipe))),
    ('title_pipeline', _branch(lambda X: X.title.fillna('missing'), clone(text_pipe))),
    ('source_pipeline', _branch(lambda X: X.source.fillna('missing'), MyLabelBinarizer())),
    ('event_pipeline', _branch(lambda X: X.event_list, MyMultiLabelBinarizer())),
    ('person_pipeline', _branch(lambda X: X.person_list, MyMultiLabelBinarizer())),
])

In [308]:
from sklearn.linear_model import LogisticRegression

In [309]:
from sklearn.base import TransformerMixin #gives fit_transform method for free
class MyLabelBinarizer(TransformerMixin):
    """Adapter exposing a wrapped LabelBinarizer through fit(x, y) / transform(x, y).

    Constructor arguments are forwarded unchanged to LabelBinarizer. The ``y``
    argument of ``fit`` and ``transform`` is accepted but never used.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return this object."""
        encoder = self.encoder
        encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` as encoded by the wrapped encoder."""
        encoded = self.encoder.transform(x)
        return encoded
    
class MyMultiLabelBinarizer(TransformerMixin):
    """Adapter exposing a wrapped MultiLabelBinarizer through fit(x, y) / transform(x, y).

    Constructor arguments are forwarded unchanged to MultiLabelBinarizer. The
    ``y`` argument of ``fit`` and ``transform`` is accepted but never used.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return this object."""
        encoder = self.encoder
        encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` as encoded by the wrapped encoder."""
        encoded = self.encoder.transform(x)
        return encoded

In [310]:
# Baseline model: the feature union feeding a LogisticRegression with default settings.
baseline = Pipeline(steps=[
    ('processing', pipeline),
    ('clf', LogisticRegression()),
])

In [311]:
# Fit the full pipeline (feature processing + classifier) on the training split.
baseline.fit(X_train, y_train)  



Pipeline(memory=None,
     steps=[('processing', FeatureUnion(n_jobs=None,
       transformer_list=[('body_pipeline', Pipeline(memory=None,
     steps=[('extract', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x1ab264d9d8>, inv_kw_args=None,
          inverse_func=None, kw_...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [312]:
# Predict labels for the held-out test split.
y_hat = baseline.predict(X_test) 

  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))


In [313]:
import numpy as np

In [80]:
from sklearn.metrics import classification_report

In [81]:
from sklearn.model_selection import cross_val_score

In [84]:
# 5-fold cross-validation of the baseline on the training split, scored with macro F1.
# NOTE(review): this cell's execution count (84) is lower than that of the cells
# above it, so the scores shown are likely from an earlier run -- re-run to refresh.
cross_val_score(baseline, X_train, y_train, cv=5, scoring='f1_macro')



array([0.88582613, 0.88503052, 0.88547125, 0.88895849, 0.88228579])

In [314]:
# Per-class precision / recall / F1 of the test-split predictions.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.85      0.95      0.89      3300
           1       0.97      0.92      0.95      3299
           2       0.96      0.90      0.93      3298

   micro avg       0.92      0.92      0.92      9897
   macro avg       0.93      0.92      0.92      9897
weighted avg       0.93      0.92      0.92      9897



In [288]:
# NOTE(review): same statement as the report cell above, but with an
# out-of-sequence execution count (288); the output below is presumably left
# over from an earlier version of the model -- consider removing this cell.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88      3300
           1       0.92      0.93      0.92      3299
           2       0.94      0.90      0.92      3298

   micro avg       0.91      0.91      0.91      9897
   macro avg       0.91      0.91      0.91      9897
weighted avg       0.91      0.91      0.91      9897



In [230]:
# NOTE(review): repeated report cell with an out-of-sequence execution count
# (230); the output below is presumably from an earlier version of the model
# -- consider removing this cell.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87      3300
           1       0.92      0.93      0.92      3299
           2       0.92      0.91      0.91      3298

   micro avg       0.90      0.90      0.90      9897
   macro avg       0.90      0.90      0.90      9897
weighted avg       0.90      0.90      0.90      9897



In [83]:
# NOTE(review): repeated report cell with an out-of-sequence execution count
# (83); the support totals below differ from the other reports, so this output
# presumably comes from an earlier data/model state -- consider removing this cell.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      3300
           1       0.91      0.90      0.91      3298
           2       0.92      0.89      0.90      3298

   micro avg       0.89      0.89      0.89      9896
   macro avg       0.89      0.89      0.89      9896
weighted avg       0.89      0.89      0.89      9896

