Steps:
1. In Colab, select Runtime -> Change runtime type -> Hardware accelerator -> GPU.

In [None]:
# To get some stats about the GPU
# Both lines are shell commands (leading `!`): the first queries the NVIDIA driver,
# the second prints the version of the CUDA compiler.
!nvidia-smi
!nvcc --version

In [None]:
# Mount Google Drive at /content/drive; the project files are read from a folder below it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base directory of the project on the mounted Drive. Paths are built by plain string
# concatenation (`folder + 'file.csv'`), so the trailing slash is required.
folder = '/content/drive/My Drive/ire-major-project/' # use in python code

# Functions for converting text to vectors

In [None]:
from time import time

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
class ConvertText2Vec():
    """Turn raw texts into fixed-length sequences of word ids.

    A Keras ``Tokenizer`` is fitted once, in the constructor. Every later call to
    ``convert_to_vector`` reuses that vocabulary, so all data splits share the same
    word -> id mapping.
    """

    def __init__(self, max_nb_words, max_sequence_length, embedding_length,
            df_data_column_values):
        """Fit the tokenizer on a corpus.

        Args:
            max_nb_words: forwarded to ``Tokenizer(num_words=...)``.
            max_sequence_length: length every sequence is padded/truncated to.
            embedding_length: stored on the instance only; not used by this class.
            df_data_column_values: iterable of texts to fit on, typically
                `df_data['article_text'].values`.
        """
        self.max_nb_words = max_nb_words
        self.max_sequence_length = max_sequence_length
        self.embedding_length = embedding_length
        # The backslash is written as `\\` so the literal holds the same characters as
        # before without relying on an invalid `\]` escape sequence.
        self.tokenizer = Tokenizer(
            num_words=max_nb_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
        self.tokenizer.fit_on_texts(df_data_column_values)
        print('Found %s unique tokens.' % len(self.tokenizer.word_index))

    def convert_to_vector(self, df_data_column_values):
        """Encode texts with the fitted tokenizer and pad them to a common length.

        Args:
            df_data_column_values: iterable of texts, of the form
                `df_data['article_text'].values`.

        Returns:
            A tuple ``(x, word_index)``: ``x`` is the output of ``pad_sequences`` with
            ``maxlen=self.max_sequence_length`` (library-default padding/truncation),
            and ``word_index`` is the tokenizer's word -> id mapping built in the
            constructor (the same mapping on every call).
        """
        sequences = self.tokenizer.texts_to_sequences(df_data_column_values)
        x = pad_sequences(sequences, maxlen=self.max_sequence_length)
        print('Shape of data tensor:', x.shape)
        return x, self.tokenizer.word_index

# Preprocessing
Use one of these filenames to select the article dataset:
    # [smallest] articles-training-byarticle-20181122.xml - 3 MB
    # articles-validation-bypublisher-20181122.xml - 894 MB
    # articles-training-bypublisher-20181122.xml - 3 GB
Use the matching filename to select the ground-truth dataset:
    # [smallest] ground-truth-training-byarticle-20181122.xml - 109 KB
    # ground-truth-validation-bypublisher-20181122.xml - 24 MB
    # ground-truth-training-bypublisher-20181122.xml - 100 MB

Run the following cell **ONLY ONCE** to save all the parsed dataset files in CSV format to your Drive (loading directly from the raw XML files takes a long time and, for reasons not yet understood, also uses a lot of extra memory).

### 1. Load (Training and Testing) data ~ 36 secs

In [None]:
# Training split: article texts and their ground-truth labels, read from the CSV files on Drive.
df_data = pd.read_csv(folder+'data_training_bypublisher.csv')
df_truth = pd.read_csv(folder+'ground_truth_training_bypublisher.csv')

# Column summary with the real ("deep") memory footprint, articles first, then labels.
for _frame in (df_data, df_truth):
    _frame.info(memory_usage='deep')

Load the held-out (validation) data separately, for the case where it is not split off from the training data ~ 10 secs

In [None]:
# Validation split: article texts and their ground-truth labels, read from the CSV files on Drive.
df_val_data = pd.read_csv(folder+'data_validation_bypublisher.csv')
df_val_truth = pd.read_csv(folder+'ground_truth_validation_bypublisher.csv')

# Column summary with the real ("deep") memory footprint, articles first, then labels.
for _frame in (df_val_data, df_val_truth):
    _frame.info(memory_usage='deep')

### 2. Set the Parameters
Do some analysis and set them accordingly.

In [None]:
# Earlier settings, kept for reference only (not active):
# # The maximum number of words to be used. (most frequent)
# MAX_NB_WORDS = 50000
# # Max number of words in each complaint.
# MAX_SEQUENCE_LENGTH = 500
# # This is fixed.
# EMBEDDING_DIM = 100

# Active settings. All three are handed to ConvertText2Vec in the conversion cell.
MAX_NB_WORDS = 50000  # vocabulary size given to the tokenizer (its `num_words`)
MAX_SEQUENCE_LENGTH = 600  # number of tokens every article is padded/truncated to
EMBEDDING_DIM = 300  # dimensionality of the embedding vector (50, 100, 200, 300); must match the GloVe file that is loaded

### 3. Converting documents to vectors
- Total time ~ 15 mins

In [None]:
# Fit the tokenizer on the training articles and report how long the cell took.
# Both conversion variants below are commented out, so the elapsed time printed at the
# end covers the tokenizer fit only.
start_time = time()

text_to_vec_converter = ConvertText2Vec(
    MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM,
    df_data['article_text'].values
)


# Convert training and testing data to vectors of fixed length (Use one of the 2 cases below)

# Case 1) Test data is split from training data only
# NOTE(review): convert_to_vector returns a (x, word_index) tuple, so the first line of
# this case needs to unpack two values before it can be re-enabled.
#############################################################################################
# x = text_to_vec_converter.convert_to_vector(df_data['article_text'].values)
# y_tmp = np.array(df_truth['hyperpartisan'].values)
# y = np.array([1 if x == 'true' else 0 for x in y_tmp])
# x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.10, random_state = 42)


# Case 2) Test data is separately loaded
# (the same three calls are executed, uncommented, in the cells that follow)
#############################################################################################
# x_train, word_index_train = text_to_vec_converter.convert_to_vector(df_data['article_text'].values)
# x_val, word_index_val = text_to_vec_converter.convert_to_vector(df_val_data['article_text'].values)
# x_test, word_index_test = text_to_vec_converter.convert_to_vector(df_test_data['article_text'].values)

print("converting to vectors took",time()-start_time,"to complete")

In [None]:
# Encode the training articles; word_index_train is the tokenizer's word -> id mapping.
x_train, word_index_train = text_to_vec_converter.convert_to_vector(df_data['article_text'].values)

In [None]:
# Encode the validation and test articles with the tokenizer that was fitted on the training texts.
# NOTE(review): `df_test_data` is not created by any cell visible in this notebook -
# load the test articles before running this cell.
x_val, word_index_val = text_to_vec_converter.convert_to_vector(df_val_data['article_text'].values)
x_test, word_index_test = text_to_vec_converter.convert_to_vector(df_test_data['article_text'].values)

In [None]:
def _as_binary_labels(flags):
    """Map a column of hyperpartisan flags to a 0/1 integer array.

    Real booleans and their text form ('true' / 'True') are both accepted, so one rule
    is applied to all three splits regardless of how the column was parsed.
    """
    def _is_true(flag):
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        return bool(flag == True)  # noqa: E712 - also covers numpy.bool_ and 1
    return np.array([1 if _is_true(flag) else 0 for flag in flags])


# Training split (labels come from df_truth, the frame loaded next to df_data).
y_train = _as_binary_labels(df_truth['hyperpartisan'].values)
y_bias_kind = df_truth.bias.values

# Validation split.
y_val = _as_binary_labels(df_val_truth['hyperpartisan'].values)
y_val_bias_kind = df_val_truth.bias.values

# Test split.
# NOTE(review): `df_test_truth` is not created by any cell visible in this notebook -
# load the test ground truth before running this cell.
y_test = _as_binary_labels(df_test_truth['hyperpartisan'].values)
y_test_bias_kind = df_test_truth.bias.values

# Class counts are taken from the training labels.
NUM_CLASSES_BIAS = len(np.unique(y_train))
NUM_CLASSES_BIAS_KIND = len(np.unique(y_bias_kind))

In [None]:
# Class balance per split: for train, validation and test (in that order) print the
# shape of the positive (1) subset, then of the negative (0) subset.
for _labels in (y_train, y_val, y_test):
    for _cls in (1, 0):
        print(_labels[_labels == _cls].shape)

In [None]:
from keras.utils import to_categorical

# One-hot encode the 0/1 hyperpartisan labels for a softmax output.
# The sources are the integer arrays y_train / y_val / y_test, so no name is read before
# it is assigned and the cell gives the same result when it is re-run.
y_bias = to_categorical(y_train, num_classes=NUM_CLASSES_BIAS)
y_val_bias = to_categorical(y_val, num_classes=NUM_CLASSES_BIAS)
y_test_bias = to_categorical(y_test, num_classes=NUM_CLASSES_BIAS)

In [None]:
# Map the bias-kind labels to integer class ids; the encoder's vocabulary is the set of
# labels present in the training split.
labelEncoder = LabelEncoder()
labelEncoder.fit(np.unique(y_bias_kind))
# labelEncoder.classes_

# All three splits are encoded with the same encoder (validation included, so every
# split uses the same representation).
# The arrays are overwritten in place: re-run the label cell before re-running this one,
# otherwise the encoder is re-fitted on the integer ids and the class names are lost.
y_bias_kind = labelEncoder.transform(y_bias_kind)
y_val_bias_kind = labelEncoder.transform(y_val_bias_kind)
y_test_bias_kind = labelEncoder.transform(y_test_bias_kind)

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# The file name is derived from EMBEDDING_DIM so the vectors always have the width the
# embedding matrix expects (for EMBEDDING_DIM = 300 this is 'glove.6B.300d.txt').
glove_path = folder + 'glove.6B.%dd.txt' % EMBEDDING_DIM

embeddings_index = {}
# `with` closes the file even if a line fails to parse.
with open(glove_path, 'r', encoding='utf8') as glove_file:
    for line in glove_file:
        # each line starts with a word; rest of the line is the vector
        values = line.split()
        if not values:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
print(f'Found {len(embeddings_index)} word vectors in glove file.')

In [None]:
# Build the weight matrix for the embedding layer: row i holds the GloVe vector of the
# word whose tokenizer id is i. One extra row is allocated because the matrix is indexed
# directly by the ids in word_index_train.
vocab_rows = len(word_index_train) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index_train.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[row] = vector
print("embedding_matrix shape:", np.shape(embedding_matrix))

In [None]:
from keras.layers import Embedding

# Embedding layer initialised from `embedding_matrix` and excluded from training
# (trainable=False). It has one row per entry of word_index_train plus one extra row.
loaded_embeddings = Embedding(
    input_dim=len(word_index_train) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Model Definition
Link to original implementation of CuDNNLSTM - https://github.com/ShawnyXiao/TextClassification-Keras/tree/master/model/TextRNN.

It is based on the paper "Recurrent Neural Network for Text Classification with Multi-Task Learning" (https://arxiv.org/pdf/1605.05101.pdf)

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

class TextRNN(Model):
    """Recurrent text classifier: Embedding -> CuDNNLSTM(128) -> Dense head.

    Args:
        maxlen: sequence length every input batch must have (checked in `call`).
        max_features: input dimension of the embedding layer.
        embedding_dims: output dimension of the embedding layer.
        class_num: number of units in the final dense layer.
        last_activation: activation of the final dense layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super().__init__()
        # Keep the hyper-parameters on the instance; `call` reads `maxlen` for validation.
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation
        # Layers, in the order `call` applies them.
        self.embedding = Embedding(max_features, embedding_dims, input_length=maxlen)
        self.rnn = CuDNNLSTM(128)  # LSTM or GRU
        self.classifier = Dense(class_num, activation=last_activation)

    def call(self, inputs):
        """Run a forward pass.

        Raises:
            ValueError: if `inputs` is not rank 2, or its second dimension differs
                from `self.maxlen`.
        """
        shape = inputs.get_shape()
        rank = len(shape)
        if rank != 2:
            raise ValueError('The rank of inputs of TextRNN must be 2, but now is %d' % rank)
        steps = shape[1]
        if steps != self.maxlen:
            raise ValueError('The maxlen of inputs of TextRNN must be %d, but now is %d' % (self.maxlen, steps))
        return self.classifier(self.rnn(self.embedding(inputs)))

# Building and Training the Model

Build (define) the model.

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from keras import Sequential, Model, Input

# Local aliases for the configuration constants, plus the training hyper-parameters
# (batch_size and epochs are passed to model.fit in the training cell).
max_features = MAX_NB_WORDS
maxlen = MAX_SEQUENCE_LENGTH
embedding_dims = EMBEDDING_DIM
batch_size = 500
epochs = 5

print('Build model...')
# Models (1)-(3) are commented-out alternatives; only Model (4) is actually built.
# Models (1) and (2) create their own Embedding layer; Models (3) and (4) reuse the
# pre-built `loaded_embeddings` layer.
# Model (1) Using CuDNN based approach
# -----------------------------------------------------------
# model = TextRNN(maxlen, max_features, embedding_dims)
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])



# Model (2) Using plain LSTM based approach
# -----------------------------------------------------------
# model = Sequential()
# model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# model.add(SpatialDropout1D(0.2))
# model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model (3)
# input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# embedding_layer = loaded_embeddings(input_layer)
# embedding_layer = Dropout(0.5)(embedding_layer)

# hidden_layer = LSTM(64, recurrent_dropout=0.5)(embedding_layer)
# hidden_layer = Dropout(0.5)(hidden_layer)

# output_layer = Dense(1, activation='sigmoid')(hidden_layer)

# model = Model(input_layer, output_layer)
# model.compile(loss='binary_crossentropy',
#               optimizer='adamax',
#               metrics=['accuracy'])
# print(model.summary())

# Model (4)
# Multi-task network: one shared encoder (loaded_embeddings -> dropout -> LSTM(64) ->
# dropout) feeding two softmax heads.
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

embedding_layer = loaded_embeddings(input_layer)
embedding_layer = Dropout(0.5)(embedding_layer)

hidden_layer = LSTM(64, recurrent_dropout=0.5)(embedding_layer)
hidden_layer = Dropout(0.5)(hidden_layer)

# Task 1
# Two-class head.
output_bias = Dense(2, activation='softmax')(hidden_layer)

# Task 2
# NOTE(review): the width 5 is hard-coded - presumably it has to equal the number of
# distinct bias kinds (NUM_CLASSES_BIAS_KIND); confirm against the data.
output_bias_kind = Dense(5, activation='softmax')(hidden_layer)


# The model returns its outputs in this order: [output_bias, output_bias_kind].
model = Model(input_layer, [output_bias, output_bias_kind])

# NOTE(review): a single loss / metric spec is given for a two-output model; presumably
# Keras applies it to both heads and reports per-head metric names - confirm the exact
# log keys before monitoring one of them.
model.compile(loss='categorical_crossentropy', 
              optimizer='adamax', 
              metrics=['acc'])

# print(model.summary())

Training step

In [None]:
print('Train...')


def _fit_targets(hyperpartisan_labels, bias_kind_names):
    """Return the training target(s) in the layout `model`'s output layer(s) expect.

    A single-output model receives the 0/1 labels unchanged. The two-head model
    receives a list with one one-hot matrix per head ([bias, bias_kind]), each sized
    from the width of the corresponding output so targets and heads cannot disagree.

    Args:
        hyperpartisan_labels: 1-D array of 0/1 labels.
        bias_kind_names: raw bias-kind labels, as stored in the ground-truth frame.
    """
    heads = getattr(model, 'outputs', None) or []
    if len(heads) < 2:
        return hyperpartisan_labels
    n_bias, n_bias_kind = (int(head.shape[-1]) for head in heads)
    bias_kind_ids = labelEncoder.transform(bias_kind_names)
    return [to_categorical(hyperpartisan_labels, num_classes=n_bias),
            to_categorical(bias_kind_ids, num_classes=n_bias_kind)]


# 'val_loss' is logged for every model variant, whatever its outputs and metrics are
# called, so early stopping always has a value to watch.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, mode='min')
model.fit(x_train, _fit_targets(y_train, df_truth.bias.values),
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_val, _fit_targets(y_val, df_val_truth.bias.values)))



Testing

In [None]:
# Run the trained model on the test articles and report the wall-clock time.
# `result` keeps the raw output of model.predict.
print('Test...')
start_time = time()
result = model.predict(x_test)
print("took",time()-start_time,"to complete")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Turn the raw predictions in `result` into 0/1 class ids for the hyperpartisan task.
# `result` is either a list with one array per output (first entry = bias head) or a
# single array of scores from a single-output model.
bias_scores = np.asarray(result[0] if isinstance(result, (list, tuple)) else result)
if bias_scores.ndim == 2 and bias_scores.shape[1] > 1:
    # softmax output: pick the most probable class
    y_pred = np.argmax(bias_scores, axis=1)
else:
    # single sigmoid score per article: threshold at 0.5
    y_pred = (bias_scores.ravel() > 0.5).astype(int)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [None]:
# Naming convention for saved weights:
# <dataset_name>_<model_type>_<MAX_NB_WORDS/max_features>_<MAX_SEQUENCE_LENGTH/maxlen>_<EMBEDDING_DIM/embedding_dims>
# model.save_weights(folder+'model/bypublisher_training_CuDNNLSTM_50000_250_100')
# model.save_weights(folder+'model/bypublishervalidation_CuDNNLSTM_50000_600_100',save_format='tf')
# NOTE(review): the active path says CuDNN / 5000_350_100 / 5000_250_100, which does not
# match the settings used in this notebook (MAX_NB_WORDS=50000, MAX_SEQUENCE_LENGTH=600,
# EMBEDDING_DIM=300). If it is renamed, the load cell must be changed to the same path.
model.save_weights(folder+'model/CuDNN/5000_350_100/bypublisher_training_CuDNNLSTM_5000_250_100')

# Loading back a saved model

In [None]:
# loaded_model.load_weights(folder+'model/bypublishervalidation_CuDNNLSTM_50000_600_100.data-00000-of-00001')

# loaded_model = TextRNN(maxlen, max_features, embedding_dims)
# loaded_model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
# Rebuild a single-output network (Embedding -> SpatialDropout1D -> LSTM(128) -> sigmoid)
# and restore weights into it.
# NOTE(review): the path below is the one the save cell writes from `model`; loading
# presumably only works if this architecture matches the network the weights were saved
# from - confirm which model produced the checkpoint.
loaded_model = Sequential()
loaded_model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
loaded_model.add(SpatialDropout1D(0.2))
loaded_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
loaded_model.add(Dense(1, activation='sigmoid'))
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
loaded_model.load_weights(folder+'model/CuDNN/5000_350_100/bypublisher_training_CuDNNLSTM_5000_250_100')

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Score the reloaded model on the test set. The predictions are computed here, so the
# numbers below always describe `loaded_model` and not a leftover `y_pred`.
loaded_scores = loaded_model.predict(x_test)
loaded_scores = np.asarray(
    loaded_scores[0] if isinstance(loaded_scores, (list, tuple)) else loaded_scores)
if loaded_scores.ndim == 2 and loaded_scores.shape[1] > 1:
    # softmax output: pick the most probable class
    y_pred = np.argmax(loaded_scores, axis=1)
else:
    # single sigmoid score per article: threshold at 0.5
    y_pred = (loaded_scores.ravel() > 0.5).astype(int)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [None]:
# The two-head model returns one prediction array per output, in the order
# [bias, bias_kind]. The test matrix is named `x_test` (lower-case) in this notebook.
y_pred_bias, y_pred_bias_kind = model.predict(x_test)

In [None]:
def _class_ids(labels):
    """Return integer class ids from either one-hot rows or already-integer labels."""
    labels = np.asarray(labels)
    return labels if labels.ndim == 1 else np.argmax(labels, axis=1)


# Task 1: hyperpartisan vs. not. `labels` pins the class order so the two names line up
# even if one class is missing from the test set.
print(classification_report(_class_ids(y_test_bias),
                            np.argmax(y_pred_bias, axis=1),
                            labels=[0, 1],
                            target_names=['unbiased','biased']))

# Task 2: kind of bias. The display names are the encoder's classes, one per class id
# (target_names must hold one entry per class, not one per sample).
bias_kind_names = [str(name) for name in labelEncoder.classes_]
print(classification_report(_class_ids(y_test_bias_kind),
                            np.argmax(y_pred_bias_kind, axis=1),
                            labels=np.arange(len(bias_kind_names)),
                            target_names=bias_kind_names))

In [None]:
# Save the whole model to 'lstm_multitask.h5'. The path is relative, so the file is
# written to the current working directory rather than under `folder`.
# NOTE(review): copy it to Drive if it has to outlive the Colab session.
model.save('lstm_multitask.h5')