In [1]:
import pandas as pd

In [2]:
from ast import literal_eval

In [3]:
sports_df = pd.read_csv('../data/sports_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 2)
politics_df = pd.read_csv('../data/politics_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 1)
other_df = pd.read_csv('../data/other_10k.csv', converters={'event_list': literal_eval, 'person_list': literal_eval}).assign(y=lambda x: 0)

In [4]:
intersection = sports_df.merge(politics_df, on='id')[['id']].assign(intersection=lambda x: 1)

In [5]:
df = pd.concat([sports_df.merge(intersection, how='left').query('intersection != intersection').drop('intersection', axis=1),
                politics_df.merge(intersection, how='left').query('intersection != intersection').drop('intersection', axis=1),
                other_df],
               axis=0)[['title', 'body', 'source', 'y']].assign(dummy=lambda x: 1)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
df.body = df.body.map(lambda x: x[:100])
df = df.fillna('missing')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('y', axis=1), 
                                                    df.y, 
                                                    test_size=0.33, 
                                                    random_state=42, 
                                                    shuffle=True, stratify=df.y)

In [9]:
from keras.preprocessing.text import Tokenizer

t_body = Tokenizer(num_words=1E4, lower=False, oov_token='OOV')
t_title = Tokenizer(num_words=1E4, lower=False, oov_token='OOV')
t_source = Tokenizer(num_words=1E4, lower=False, oov_token='OOV')

t_body.fit_on_texts(X_train.body)
t_title.fit_on_texts(X_train.title.fillna('missing'))
t_source.fit_on_texts(X_train.title.fillna('missing'))

enc_body = t_body.texts_to_sequences(X_train.body)
enc_title = t_title.texts_to_sequences(X_train.title.fillna('missing'))
enc_source = t_source.texts_to_sequences(X_train.source)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
import numpy as np

In [11]:
max_len_body = np.max([len(seq) for seq in enc_body])
max_len_title = np.max([len(seq) for seq in enc_title])
max_len_source = np.max([len(seq) for seq in enc_source])

In [12]:
from keras.preprocessing.sequence import pad_sequences

In [13]:
dat_body = pad_sequences(enc_body, maxlen=max_len_body)
dat_title = pad_sequences(enc_title, maxlen=max_len_title)
dat_source = pad_sequences(enc_source, maxlen=max_len_source)

In [14]:
from keras.models import Model
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Input, Dropout, BatchNormalization
from keras.layers import concatenate

In [51]:
embedding_dim = 300

body = Input(shape=(max_len_body, )) 
title = Input(shape=(max_len_title, )) 
source = Input(shape=(max_len_source, ))

body_emb = Embedding(int(t_body.num_words), embedding_dim, input_length=max_len_body)(body)
title_emb = Embedding(int(t_title.num_words), embedding_dim, input_length=max_len_title)(title)
source_emb = Embedding(int(t_source.num_words), embedding_dim, input_length=max_len_source)(source)

cnn_body = Conv1D(150, 3, activation='relu')(body_emb)
pool_body = GlobalMaxPooling1D()(cnn_body)

cnn_title = Conv1D(150, 3, activation='relu')(title_emb)
pool_title = GlobalMaxPooling1D()(cnn_title)

cnn_source = Conv1D(150, 3, activation='relu')(source_emb)
pool_source = GlobalMaxPooling1D()(cnn_source)

concat = concatenate([pool_body, pool_title, pool_source])

out = Dense(10, activation='relu')(concat)
out = Dropout(0.5)(out)
out = Dense(3, activation='softmax')(out)

model = Model(inputs=[body, title, source], outputs=out)

In [52]:
from keras.optimizers import Adam

In [57]:
model.compile(optimizer=Adam(lr=0.0001), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

In [58]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [59]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('../models/keras_early_stopping.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

In [60]:
model.fit([dat_body, dat_title, dat_source], y_train, 
          batch_size=10, epochs=50, validation_split=0.1,
          verbose=0, callbacks=[es, mc])


Epoch 00001: val_loss improved from inf to 0.49449, saving model to ../models/keras_early_stopping.h5

Epoch 00002: val_loss did not improve from 0.49449

Epoch 00003: val_loss did not improve from 0.49449

Epoch 00004: val_loss did not improve from 0.49449

Epoch 00005: val_loss did not improve from 0.49449

Epoch 00006: val_loss did not improve from 0.49449
Epoch 00006: early stopping


<keras.callbacks.History at 0x1a55c9c668>

In [61]:
enc_body_test = t_body.texts_to_sequences(X_test.body)
enc_title_test = t_title.texts_to_sequences(X_test.title)
enc_source_test = t_source.texts_to_sequences(X_test.source)

In [62]:
dat_body_test = pad_sequences(enc_body_test, maxlen=max_len_body)
dat_title_test = pad_sequences(enc_title_test, maxlen=max_len_title)
dat_source_test = pad_sequences(enc_source_test, maxlen=max_len_source)

In [63]:
y_hat = model.predict([dat_body_test, dat_title_test, dat_source_test]).argmax(axis=1)

In [64]:
from sklearn.metrics import f1_score

In [65]:
f1_score(y_test, y_hat, average='macro')

0.8889712727673142