In [None]:
import pandas as pd
import numpy as np
import re
import os
import pickle

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import keras.backend as K
from keras.models import Model, Sequential, load_model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.core import Dense, Flatten
from keras.layers import Dropout
from keras.layers import Input
from keras.utils import to_categorical
from keras.layers import concatenate
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import label_binarize, OneHotEncoder
# onehot_encoder = OneHotEncoder(sparse=False)

import tensorflow as tf

In [None]:
# Directory that holds the tab-separated train/test files ("" = working directory).
folder = ""


def _read_split(filename):
    """Read one tab-separated, UTF-8 data split from `folder`.

    The first column of the file is used as the DataFrame index.
    """
    # os.path.join adds the path separator when it is missing, so `folder`
    # works with or without a trailing slash (plain `folder + filename` did not).
    return pd.read_csv(os.path.join(folder, filename),
                       sep='\t',
                       encoding='utf-8',
                       index_col=0)


train_data = _read_split('train_data.csv')
test_data = _read_split('test_data.csv')

In [None]:
# Distinct label values found in the training set; these fix the column
# order used when the labels are binarized further down.
hyperp_classes = train_data.hyperpartisan.unique()
bias_classes = train_data.bias.unique()

# Widths of the two softmax output layers.
num_hyperp_catergories = 2
num_bias_catergories = 5

In [None]:
# Display the first rows of the training frame as a quick sanity check of the load.
train_data.head()

In [None]:
# Train and Test data
# `classes` is passed by keyword: it is keyword-only in current scikit-learn
# releases, and the keyword form is accepted by older releases as well.
# The test labels are encoded with the class lists taken from the training
# set so that both splits share the same column layout.
trainX = train_data.textbody
trainBiasY = label_binarize(train_data.bias, classes=bias_classes) # one hot encoding
trainHyperpY = label_binarize(train_data.hyperpartisan, classes=hyperp_classes)
# For a two-class problem label_binarize yields a single column; append its
# complement to obtain a two-column one-hot matrix for the softmax head.
trainHyperpY = np.hstack((trainHyperpY, 1 - trainHyperpY)) # convert to one hot encoding

testX = test_data.textbody
testBiasY = label_binarize(test_data.bias, classes=bias_classes)
testHyperpY = label_binarize(test_data.hyperpartisan, classes=hyperp_classes)
testHyperpY = np.hstack((testHyperpY, 1 - testHyperpY)) # convert to one hot encoding

# trainXtitle = train_data.title
# testXtitle = test_data.title

In [None]:
# Build text vocabulary
maxLength = 400            # every document is padded / truncated to this many tokens
max_vocab_size = 500000    # word cap handed to the Tokenizer

# NOTE(review): the tokenizer is fitted on train AND test text together.
allX = pd.concat([trainX, testX])

input_tokenizer = Tokenizer(max_vocab_size)
input_tokenizer.fit_on_texts(allX)

word_index = input_tokenizer.word_index
input_vocab_size = len(word_index) + 1
print("input_vocab_size:",input_vocab_size)


def _to_padded_ids(texts):
    """Turn raw texts into a fixed-width integer matrix (post-padded, post-truncated)."""
    sequences = input_tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=maxLength,
                           padding='post', truncating='post')
    return np.array(padded)


trainX_tokens = _to_padded_ids(trainX)
testX_tokens = _to_padded_ids(testX)

In [None]:
# Show how many rows (padded training sequences) trainX_tokens holds.
len(trainX_tokens)

In [None]:
# Save train tokens
# The training matrix is written as two halves, one pickle file each.
trainX_tokens_0, trainX_tokens_1 = np.array_split(trainX_tokens, 2)

# `with` closes each file deterministically, so the data is flushed to disk
# and no handle is leaked in the long-lived kernel.
with open("trainX_tokens_0.p", "wb") as token_file:
    pickle.dump(trainX_tokens_0, token_file)
with open("trainX_tokens_1.p", "wb") as token_file:
    pickle.dump(trainX_tokens_1, token_file)

In [None]:
# Save test tokens
# `with` guarantees the file is flushed and closed once the dump completes.
with open("testX_tokens.p", "wb") as token_file:
    pickle.dump(testX_tokens, token_file)

In [None]:
# Load tokens
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# written by this notebook, never pickles from an untrusted source.
# Each file is opened in a `with` block so its handle is closed after reading.
with open("trainX_tokens_0.p", "rb") as token_file:
    trainX_tokens_0 = pickle.load(token_file)
with open("trainX_tokens_1.p", "rb") as token_file:
    trainX_tokens_1 = pickle.load(token_file)

# Stitch the two saved halves back into one training matrix.
trainX_tokens = np.concatenate((trainX_tokens_0, trainX_tokens_1), axis=0)

with open("testX_tokens.p", "rb") as token_file:
    testX_tokens = pickle.load(token_file)

maxLength=400
max_vocab_size = 500000
input_vocab_size = 906855

In [None]:
# Text classifier: embedding -> two stacked GRU layers -> softmax head(s).
# As configured below only the hyperpartisan head is wired into the model;
# the commented-out alternatives switch it to the multi-task
# (multi-output, multi-loss) variant that also predicts bias.

embedding_dim = 100  # working values:50 or 100

# main input
main_input = Input(shape=(maxLength, ), name='main_input')
# The embedding table is sized from input_vocab_size (set by the vocabulary /
# token-loading cells) instead of repeating the literal value here, so the
# two can never drift apart.
x = Embedding(input_vocab_size, embedding_dim, input_length = maxLength)(main_input)

# x = Embedding(len(word_index) + 1,
#                             embedding_dim,
#                             weights=[embedding_matrix],
#                             input_length=maxLength,
#                             trainable=False)(main_input)


# title input
# title_input = Input(shape=(title_maxLength, ), name='title_input')
# title_embedding = Embedding(title_input_vocab_size, embedding_dim, input_length = title_maxLength)(title_input)


# title_input = Input(shape=(100, ), name='title_input')
# main_input = Input(shape=(100, ), name='main_input')

# merge inputs
# x = concatenate([x, title_embedding])
# x = concatenate([main_input, title_input])
# x = main_input

#### shared layers

# Simple NN:
# x = Flatten()(x)
# x = Dense(input_dim = 100, units = 100, activation = 'relu')(x)
# x = Dropout(0.9)(x)
# x = Dense(units = 50, activation = 'relu')(x)
# x = Dropout(0.9)(x)
# x = Dense(units = 30, activation = 'relu')(x)
# x = Dropout(0.8)(x)
# x = Dense(units = 100, activation = 'relu')(x)
# x = Dropout(0.6)(x)

# Recurrent Units:
# The layers are applied to an existing tensor, so their input shape is
# inferred from it; the former `input_shape=(embedding_dim,)` argument did not
# describe the real (maxLength, embedding_dim) input and has been dropped.
# x = GRU(64, dropout=0.8, recurrent_dropout=0.3, return_sequences=True)(x)
# x = Dropout(0.7)(x)
x = GRU(32, dropout=0.9, recurrent_dropout=0.3, return_sequences=True)(x)
x = Dropout(0.6)(x)
x = GRU(32, dropout=0.8, recurrent_dropout=0.3, return_sequences=True)(x)
# x = GRU(128, dropout=0.9)(x)

# Both GRUs return full sequences; flatten them into one feature vector.
x = Flatten()(x)


#### output layers 
# bias layer (only used when the multi-output variant below is enabled)
bias_output = Dense(num_bias_catergories, activation='softmax', name="bias_output")(x)
# bias_output = Dense(num_bias_catergories, activation='relu', name="bias_output")(x)

# hyperp layer
hyperp_output = Dense(num_hyperp_catergories, activation='softmax', name="hyperp_output")(x)
# hyperp_output = Dense(num_hyperp_catergories, activation='relu', name="hyperp_output")(x)


model = Model(#inputs = [main_input, title_input], 
                inputs = main_input, 
#               outputs = [bias_output, hyperp_output],
                outputs = hyperp_output,
              name = "bias_classifier")


# define losses (consumed only by the commented-out multi-task compile call)
losses = {
    "bias_output": "categorical_crossentropy",
    "hyperp_output": "categorical_crossentropy",
}
lossWeights = {"bias_output": 1.0, "hyperp_output": 1.0}

                
# model.compile(loss=losses, loss_weights=lossWeights, optimizer='adam', metrics=['accuracy'])
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [None]:
# fit model (works for the single-output and the multi-task configuration)
# Targets are prepared for every head, but only those the current model
# actually exposes are handed to fit(). Passing a target for an output the
# model does not have relies on Keras silently dropping the unknown key.
train_targets = {"bias_output": trainBiasY, "hyperp_output": trainHyperpY}
test_targets = {"bias_output": testBiasY, "hyperp_output": testHyperpY}
active_outputs = list(model.output_names)

# NOTE(review): the test split doubles as validation data here.
history = model.fit(#{'main_input': trainX_tokens, 'title_input': trainXtitle_tokens},
                    {'main_input': trainX_tokens},
                    {name: train_targets[name] for name in active_outputs},
                    validation_data=(
#                         {'main_input': testX_tokens, 'title_input': testXtitle_tokens},
                        {'main_input': testX_tokens},
                        {name: test_targets[name] for name in active_outputs}),
                    batch_size=512, 
                    epochs=1, 
                    shuffle = True,
                    verbose=1)

In [None]:
# Write the current model to disk under the placeholder name "model_XXXX".
model.save("model_XXXX")

In [None]:
# Persist the object returned by model.fit next to the saved model.
# `with` ensures the file is flushed and closed once the dump completes.
# NOTE(review): pickling `history.history` (the plain metrics dict) instead of
# the whole callback object may be more portable -- confirm with consumers.
with open("history-model_XXXX.p", "wb") as history_file:
    pickle.dump(history, history_file)

In [None]:
# load model
# model = load_model('model_XXXX')

In [None]:
# Predict labels
# Run the model over the padded test sequences in large batches.
test_inputs = {'main_input': testX_tokens}
predictHyperOutputs = model.predict(test_inputs,
                                    batch_size=8192,
                                    verbose=1)

# Index of the highest-scoring output column for every test row.
# pred_classes_bias = np.argmax(predictBiasOutputs, axis=1)
pred_classes_hyperp = np.argmax(predictHyperOutputs, axis=1)

In [None]:
# Convert hyperpartisan predictions to one-hot encoding
# The column index is recomputed from the raw model output so that this cell
# can be re-run safely (it used to overwrite its own input), and a dedicated
# name is used so the Keras tensor `x` from the model cell is not clobbered.
predicted_column = np.argmax(predictHyperOutputs, axis=1)
# Row i of the identity matrix is the one-hot vector for column i.
pred_classes_hyperp = np.eye(predictHyperOutputs.shape[1], dtype=int)[predicted_column]

In [None]:
# Generate performance measures for prediction
# Both metrics compare the one-hot test labels with the one-hot predictions.
print("\nHyperp performance:")

hyperp_report = classification_report(testHyperpY, pred_classes_hyperp)
print(hyperp_report)

hyperp_macro_f1 = f1_score(testHyperpY, pred_classes_hyperp, average='macro')
print("Macro F1", hyperp_macro_f1)