In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [80]:
#training params
batch_size = 256 
num_epochs = 100

#model parameters
num_filters = 256
embed_dim = 300 
weight_decay = 1e-4
network_type = 'LSTM' #'CNN'

m = tf.keras.metrics.AUC(
    num_thresholds=200,
    curve="ROC",
    summation_method="interpolation",
)

In [64]:
import pandas as pd
csv_path = 'path'
df = pd.read_csv(csv_path)
train_texts = list(df.texts.values)
train_labels = list(df.labels.values)

In [60]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

import re
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = text.replace("\\", " ").replace(u"╚", " ").replace(u"╩", " ").replace("«", ' ').replace("»", ' ').replace("…", ' ')
    text = text.lower()
    text = re.sub('\-\s\r\n\s{1,}|\-\s\r\n|\r\n', ' ', text) 
    text = re.sub('[.,:;<>_%©?–*,!@#$%^&()\d]|[+=]|[[]|[]]|[/]|"|\s{2,}|-', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    '''
    new_text = []
    for word in text.split():
      if word not in stops:
        new_text.append(word)
    
    return ' '.join(new_text)
    '''
    return text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
train_texts = [clean_text(text) for text in train_texts]
train_texts[:5]

['low income countries will continue to be the most affected by human induced climate change over the next century they will experience gradual sea level rises stronger cyclones warmer days and nights more unpredictable rainfall and larger and longer heatwaves according to a recent report the last major united nations un assessment in predicted temperature rises of c or more by the end of the century that is now thought unlikely by scientists but average land and sea temperatures are expected to continue rising throughout this century possibly reaching c above present levels enough to devastate crops and make life in many cities unbearably hot as temperatures rise and oceans warm tropical and subtropical regions will see sharp changes in annual rainfall says the intergovernmental panel on climate change ipcc report released in stockholm and published online in september east africa can expect increased short rainfalls and west africa should expect heavier monsoons burma bangladesh and 

In [52]:
import numpy as np

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, LSTM, Bidirectional
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Input, Embedding, Activation, Flatten, Dense, concatenate
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer 
import os, re, math, codecs

np.random.seed(0)

import zipfile
with zipfile.ZipFile("/content/gdrive/MyDrive/fiction_previews/embs/wiki-news-300d-1M.vec.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/1")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#load embeddings
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('/content/1/wiki-news-300d-1M.vec', encoding='utf-8')
#f = codecs.open('/content/1/glove.6B.200d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

loading word embeddings...


999995it [02:00, 8289.97it/s]

found 999995 word vectors





In [67]:
num_classes = 3

MAX_NB_WORDS = 20000
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

print("pre-processing train data...")
processed_docs_train = []
for doc in tqdm(train_texts):
    tokens = tokenizer.tokenize(doc)
    tokens = doc.split()
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train)  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

max_seq_len = 0
for t in word_seq_train:
  if len(t)>max_seq_len:
    max_seq_len = len(t)

print(max_seq_len)

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)

pre-processing train data...


100%|██████████| 567/567 [00:00<00:00, 7944.42it/s]

tokenizing input data...





dictionary size:  15524
763


In [81]:
#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
nb_words = MAX_NB_WORDS
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 5780


In [82]:
# architecture
def create_model(network_type):
  if network_type == 'CNN':
    print("training CNN ...")
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
    model.add(Conv1D(num_filters, 2, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 2, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[m])
    #model.summary()
    
  if network_type == 'LSTM':
    print("training LSTM ...")
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim,
              weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
    #model.add(Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2, return_sequences = True)))
    model.add(Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2)))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[m])
  return model

In [71]:
#define callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=20, verbose=1)
callbacks_list = [early_stopping]

In [83]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import tensorflow as tf

seed = 42

f1=[]
recall=[]
prec=[]

for fold in range(5):
  model = create_model(network_type)

  texts_splitted = np.array_split(word_seq_train, 5)
  classes_splitted = np.array_split(train_labels, 5)
  test_fold_texts = texts_splitted[fold]
  test_fold_classes = classes_splitted[fold]

  train_fold_texts = []
  train_fold_classes = []
  for i,el in enumerate(texts_splitted):
    if i!=fold:
      train_fold_texts.extend(el)
      train_fold_classes.extend(classes_splitted[i])

  train_fold_classes = tf.keras.utils.to_categorical(train_fold_classes)
  test_fold_classes = tf.keras.utils.to_categorical(test_fold_classes)

  #model training
  #callbacks=callbacks_list,
  hist = model.fit(np.array(train_fold_texts), np.array(train_fold_classes), batch_size=batch_size, callbacks=callbacks_list,epochs=num_epochs, validation_split=0.1, shuffle=True, verbose=2)
  y_predict= model.predict(np.array(test_fold_texts))

  y_predict = y_predict.argmax(axis = 1)
  test_fold_classes = test_fold_classes.argmax(axis=1)

  f1.append(f1_score(y_predict, test_fold_classes, average='weighted')*100)
  prec.append(precision_score(y_predict, test_fold_classes, average = 'weighted')*100)
  recall.append(recall_score(y_predict, test_fold_classes, average = 'weighted')*100)

  print(f1)

training LSTM ...
Epoch 1/100
2/2 - 94s - loss: 1.1074 - auc_5: 0.5064 - val_loss: 1.1032 - val_auc_5: 0.5229
Epoch 2/100
2/2 - 19s - loss: 1.1001 - auc_5: 0.5457 - val_loss: 1.1040 - val_auc_5: 0.4994
Epoch 3/100
2/2 - 18s - loss: 1.1018 - auc_5: 0.5298 - val_loss: 1.1042 - val_auc_5: 0.4986
Epoch 4/100
2/2 - 19s - loss: 1.0987 - auc_5: 0.5460 - val_loss: 1.1043 - val_auc_5: 0.5096
Epoch 5/100
2/2 - 19s - loss: 1.0984 - auc_5: 0.5493 - val_loss: 1.1045 - val_auc_5: 0.4976
Epoch 6/100
2/2 - 19s - loss: 1.1003 - auc_5: 0.5289 - val_loss: 1.1043 - val_auc_5: 0.4926
Epoch 7/100
2/2 - 18s - loss: 1.0942 - auc_5: 0.5598 - val_loss: 1.1040 - val_auc_5: 0.5024
Epoch 8/100
2/2 - 19s - loss: 1.0827 - auc_5: 0.6129 - val_loss: 1.1037 - val_auc_5: 0.5013
Epoch 9/100
2/2 - 19s - loss: 1.0708 - auc_5: 0.6516 - val_loss: 1.1039 - val_auc_5: 0.5063
Epoch 10/100
2/2 - 18s - loss: 1.0783 - auc_5: 0.6039 - val_loss: 1.1035 - val_auc_5: 0.5096
Epoch 11/100
2/2 - 19s - loss: 1.0663 - auc_5: 0.6282 - val_l

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/100
2/2 - 25s - loss: 1.1073 - auc_5: 0.4930 - val_loss: 1.1036 - val_auc_5: 0.5224
Epoch 2/100
2/2 - 19s - loss: 1.1084 - auc_5: 0.4800 - val_loss: 1.1053 - val_auc_5: 0.4956
Epoch 3/100
2/2 - 19s - loss: 1.1070 - auc_5: 0.4982 - val_loss: 1.1073 - val_auc_5: 0.4793
Epoch 4/100
2/2 - 19s - loss: 1.0997 - auc_5: 0.5423 - val_loss: 1.1099 - val_auc_5: 0.4589
Epoch 5/100
2/2 - 19s - loss: 1.0998 - auc_5: 0.5409 - val_loss: 1.1107 - val_auc_5: 0.4449
Epoch 6/100
2/2 - 18s - loss: 1.0981 - auc_5: 0.5514 - val_loss: 1.1096 - val_auc_5: 0.4549
Epoch 7/100
2/2 - 19s - loss: 1.0965 - auc_5: 0.5580 - val_loss: 1.1066 - val_auc_5: 0.4783
Epoch 8/100
2/2 - 19s - loss: 1.0872 - auc_5: 0.6052 - val_loss: 1.1039 - val_auc_5: 0.4935
Epoch 9/100
2/2 - 19s - loss: 1.0894 - auc_5: 0.6039 - val_loss: 1.1009 - val_auc_5: 0.5162
Epoch 10/100
2/2 - 19s - loss: 1.0890 - auc_5: 0.5855 - val_loss: 1.0988 - val_auc_5: 0.5165
Epoch 11/100
2/2 - 19s - loss: 1.0721 - auc_5: 0.6476 - val_loss: 1.0933 - val_

In [79]:
print(sum(f1)/5)
print(np.std(np.array(f1)))
print(sum(prec)/5)
print(np.std(np.array(prec)))
print(sum(recall)/5)
print(np.std(np.array(recall)))

32.555559395919445
8.950460235908238
37.884257715332
12.27658143981291
30.352429746933705
6.5212865995919715
