In [1]:
import keras, os, pickle, re, sklearn, string, tensorflow
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.layers import Embedding
from keras.optimizers import Adadelta
from sklearn.model_selection import train_test_split


# Record the library versions this notebook is being executed with.
print('Keras version: \t\t%s' % keras.__version__)
print('Scikit version: \t%s' % sklearn.__version__)
print('TensorFlow version: \t%s' % tensorflow.__version__)

Using TensorFlow backend.


Keras version: 		2.2.0
Scikit version: 	0.19.2
TensorFlow version: 	1.5.0


### Parameters

In [2]:
# EMBEDDING
# Passed as `num_words` to the Tokenizer and as the vocabulary size of the embedding / model.
MAX_NUM_WORDS  = 50000 #15000
# Width of each word vector.
EMBEDDING_DIM  = 300
# Every sequence is padded or truncated to this many tokens by pad_sequences.
MAX_SEQ_LENGTH = 700 #200
# When True, the training cell builds the embedding layer with create_glove_embeddings().
USE_GLOVE      = False

# MODEL
# These three values are forwarded unchanged to cnn_model.build_cnn.
FILTER_SIZES   = [3,4,5]
FEATURE_MAPS   = [10,10,10]
DROPOUT_RATE   = 0.5

# LEARNING
BATCH_SIZE     = 200
NB_EPOCHS      = 40
# NOTE: the training and evaluation cells reassign RUNS = 1, so this value is not the one they use.
RUNS           = 5
# Passed as `test_size` to train_test_split when carving out the validation set.
VAL_SIZE       = 0.2

### Preprocessing

In [3]:
def clean_doc(doc, stop_words=None):
    """
    Normalise a raw document into a space-separated string of tokens.

    Steps, in order:
        - Lowercase
        - Remove digits
        - Split on whitespace
        - Remove stopwords (checked both before and after punctuation is
          stripped, so stopwords that contain punctuation and stopwords that
          are merely glued to punctuation, e.g. "the," are both dropped)
        - Remove punctuation
        - Remove tokens shorter than two characters

    Parameters
    ----------
    doc : str
        Raw document text.
    stop_words : collection of str, optional
        Words to discard. When omitted, the NLTK English stopword list is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Build the translation table once per call instead of once per token.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Lowercase, then remove numbers
    doc = re.sub(r"[0-9]+", "", doc.lower())

    cleaned = []
    for token in doc.split():
        # First pass: the token as written (catches stopwords containing punctuation).
        if token in stop_words:
            continue
        token = token.translate(punctuation_table)
        # Second pass: the token after stripping (catches "the," -> "the").
        # Tokens with less than two characters are ignored.
        if len(token) > 1 and token not in stop_words:
            cleaned.append(token)
    return ' '.join(cleaned)


def read_files(path):
    """
    Load and clean the documents found at `path`.

    - If `path` is a directory, every regular file directly inside it is read
      as one document, in sorted filename order.
    - If `path` is a file, every line is one document and the file is decoded
      as ISO-8859-1.

    Each document is passed through clean_doc before being collected.

    Returns
    -------
    list of str
        The cleaned documents; empty when `path` is neither a directory nor a file.
    """
    documents = []

    if os.path.isdir(path):
        # os.listdir returns entries in arbitrary order; sort so the document
        # order (and anything derived from it) is reproducible.
        for filename in sorted(os.listdir(path)):
            file_path = os.path.join(path, filename)
            # Skip sub-directories and other non-files; open() would raise on them.
            if not os.path.isfile(file_path):
                continue
            with open(file_path) as f:
                documents.append(clean_doc(f.read()))
    elif os.path.isfile(path):
        # Read in all lines in a txt file
        with open(path, encoding='iso-8859-1') as f:
            for line in f:
                documents.append(clean_doc(line))
    return documents

In [4]:
import pandas as pd

# Single place to change the dataset location; the three CSV files below are
# resolved relative to it.
PAN13_DIR = '/home/rafael/datasets/preprocessed/pan13'

# Load data
train = pd.read_csv(os.path.join(PAN13_DIR, 'training-en.csv'))
test = pd.read_csv(os.path.join(PAN13_DIR, 'test1-en.csv'))
test2 = pd.read_csv(os.path.join(PAN13_DIR, 'test2-en.csv'))

# Missing texts are replaced by the placeholder "_NaN_" so that every entry
# is a string before tokenisation.
docs_train = train["text"].fillna("_NaN_").values
y_train = train.target.values

docs_test = test["text"].fillna("_NaN_").values
y_test = test.target.values

docs_test2 = test2["text"].fillna("_NaN_").values
y_test2 = test2.target.values

In [5]:
from sklearn import preprocessing

# Fit ONE encoder on the training labels and reuse it for both test sets, so
# the label -> integer mapping is guaranteed to be the same for every split.
# Separately fitted encoders only agree when each split happens to contain
# exactly the same set of labels; with a shared encoder an unseen test label
# raises an error instead of being silently mapped to the wrong class.
le = preprocessing.LabelEncoder()
le.fit(train.target.values)
y = le.transform(train.target.values)

# Kept as aliases so that any code referring to le_test / le_test2 still works.
le_test = le
le_test2 = le

# Labels are read from the dataframes rather than from y_test / y_test2 so
# that this cell gives the same result when it is re-run.
y_test = le.transform(test.target.values)
y_test2 = le.transform(test2.target.values)

In [6]:
## Sentence polarity dataset v1.0
#negative_docs = read_files('data/rt-polarity.neg')
#positive_docs = read_files('data/rt-polarity.pos')

## IMDB
#negative_docs = read_files('data/imdb/train/neg')
#positive_docs = read_files('data/imdb/train/pos')
#negative_docs_test = read_files('data/imdb/test/neg')
#positive_docs_test = read_files('data/imdb/test/pos')

## Yelp
#negative_docs = read_files('data/yelp/train/neg')
#positive_docs = read_files('data/yelp/train/pos')
#negative_docs_test = read_files('data/yelp/test/neg')
#positive_docs_test = read_files('data/yelp/test/pos')

#docs   = negative_docs + positive_docs
#labels = [0 for _ in range(len(negative_docs))] + [1 for _ in range(len(positive_docs))]

# Report how many documents each split contains.
print('Training samples: %i' % len(docs_train))
print('Test samples: %i' % len(docs_test))
print('Test2 samples: %i' % len(docs_test2))

Training samples: 236600
Test samples: 21200
Test2 samples: 25440


## Tokenizer

In [7]:
def max_length(lines):
    """
    Calculate the maximum document length, measured in whitespace-separated tokens.

    Parameters
    ----------
    lines : iterable of str
        The documents to measure.

    Returns
    -------
    int
        Token count of the longest document, or 0 when `lines` is empty.
    """
    # A generator avoids building a throwaway list; `default` covers empty input.
    return max((len(s.split()) for s in lines), default=0)

# The vocabulary is built from the training documents only; the same fitted
# tokenizer is later applied to the test documents.
# num_words=MAX_NUM_WORDS is passed to limit the vocabulary (see the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs_train)
# Training documents as lists of integer word indices.
sequences = tokenizer.texts_to_sequences(docs_train)

In [8]:
# Token count of every training document, computed once and reused below.
result = [len(x.split()) for x in docs_train]
# Same value max_length(docs_train) returns, without splitting the corpus a second time.
length = max(result)
word_index = tokenizer.word_index

print('Text informations:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (np.max(result),
                                                                                np.min(result),
                                                                                np.mean(result),
                                                                                MAX_SEQ_LENGTH))
print('vacobulary size: %i / limit: %i' % (len(word_index), MAX_NUM_WORDS))

# Padding all sequences to same length of `MAX_SEQ_LENGTH`
# NOTE(review): only `padding` is set here; `truncating` keeps its Keras
# default, so which end of an over-long document is cut is decided by that
# default — confirm it is the intended one.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')

Text informations:
max length: 60580 / min length: 1 / mean length: 761 / limit length: 700
vacobulary size: 1106728 / limit: 50000


## Embeddings

In [9]:
def create_glove_embeddings():
    """
    Build a trainable Keras Embedding layer initialised from pre-trained word vectors.

    NOTE: despite the function name and the printed messages, the vectors are
    read from '/home/rafael/embeddings/wiki.en.vec', not from a GloVe file.

    Reads the module-level `tokenizer`, `MAX_NUM_WORDS`, `EMBEDDING_DIM` and
    `MAX_SEQ_LENGTH`. Words whose index is >= MAX_NUM_WORDS are ignored; words
    without a pre-trained vector keep an all-zero row.

    Returns
    -------
    keras.layers.Embedding
        Layer whose initial weights are the assembled embedding matrix.
    """
    print('Pretrained embeddings GloVe is loading...')

    # Number of coefficients per line in the vector file.
    vector_dim = 300

    embeddings_index = {}
    # `with` guarantees the file is closed even if parsing a line raises.
    with open(r'/home/rafael/embeddings/wiki.en.vec', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Skip lines too short to hold a word plus `vector_dim` coefficients
            # (e.g. a header or blank line); they would otherwise be stored
            # under the empty-string key.
            if len(values) <= vector_dim:
                continue
            # Everything before the last `vector_dim` fields is the word itself.
            word = ''.join(values[:-vector_dim])
            coefs = np.asarray(values[-vector_dim:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors in GloVe embedding' % len(embeddings_index))

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

    for word, i in tokenizer.word_index.items():
        # Indices outside the matrix belong to words beyond the vocabulary limit.
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM,
                     input_length=MAX_SEQ_LENGTH,
                     weights=[embedding_matrix],
                     trainable=True
                    )

## Model definition

## Training

In [None]:
# cnn_model is a project-local module; build_cnn is defined there, not in this notebook.
import cnn_model

# One history dict (per-epoch metrics) is collected per run.
histories = []
# Overrides the RUNS value set in the parameter cell.
RUNS = 1
for i in range(RUNS):
    print('Running iteration %i/%i' % (i+1, RUNS))
    
    # random_state is fixed, so every iteration receives the same split.
    # NOTE(review): this rebinds y_train, which held the raw training labels
    # after the data-loading cell.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SIZE, random_state=42)
    
    # None is passed to build_cnn unless pre-trained embeddings are enabled.
    emb_layer = None
    if USE_GLOVE:
        emb_layer = create_glove_embeddings()
    
    model = cnn_model.build_cnn(
        embedding_layer=emb_layer,
        num_words=MAX_NUM_WORDS,
        embedding_dim=EMBEDDING_DIM,
        filter_sizes=FILTER_SIZES,
        feature_maps=FEATURE_MAPS,
        max_seq_length=MAX_SEQ_LENGTH,
        dropout_rate=DROPOUT_RATE
    )
    
    # NOTE(review): binary_crossentropy presumes a two-class target and a
    # matching output layer — confirm against cnn_model.build_cnn.
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adadelta(clipvalue=3),
        metrics=['accuracy']
    )
    
    # ModelCheckpoint writes model-<run>.h5 and, with save_best_only=True and
    # mode='min', keeps only the weights with the lowest val_loss so far.
    # ReduceLROnPlateau is configured with factor=0.1, patience=4 and a
    # learning-rate floor of 0.01, also monitoring val_loss.
    history = model.fit(
        X_train, y_train,
        epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
        validation_data=(X_val, y_val),
        callbacks=[ModelCheckpoint('model-%i.h5'%(i+1), monitor='val_loss',
                                   verbose=1, save_best_only=True, mode='min'),
                   ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01),
                   #EarlyStopping(monitor='val_loss', min_delta=0.1, patience=4, verbose=1)
                  ]
    )
    print()
    # Only the metrics dict is kept, not the History object itself.
    histories.append(history.history)

Running iteration 1/1
Creating CNN 0.0.1
#############################################
Embedding:    no pre-trained embedding
Vocabulary size: 50000
Embedding dim: 300
Filter sizes: [3, 4, 5]
Feature maps: [10, 10, 10]
Max sequence: 700
#############################################
Train on 189280 samples, validate on 47320 samples
Epoch 1/40

Epoch 00001: val_loss improved from inf to 0.68687, saving model to model-1.h5
Epoch 2/40

Epoch 00002: val_loss improved from 0.68687 to 0.68463, saving model to model-1.h5
Epoch 3/40

Epoch 00003: val_loss improved from 0.68463 to 0.68410, saving model to model-1.h5
Epoch 4/40

Epoch 00004: val_loss improved from 0.68410 to 0.68233, saving model to model-1.h5
Epoch 5/40
 13400/189280 [=>............................] - ETA: 26:13 - loss: 0.6838 - acc: 0.5599

In [None]:
# Persist the collected training histories to history.pkl; the Evaluation
# section starts by loading this file back.
with open('history.pkl', 'wb') as f:
    pickle.dump(histories, f)

## Evaluation

In [None]:
# Reload the training histories written by the training section.
# The context manager closes the file handle, which a bare open() inside the
# call never did.
# NOTE: unpickling can execute arbitrary code — only load a history.pkl that
# this notebook produced.
with open('history.pkl', 'rb') as f:
    histories = pickle.load(f)

In [None]:
def get_avg(histories, his_key):
    """
    Average one metric over all runs, sampled at each run's best epoch.

    For every history dict the epoch with the lowest 'val_loss' is located and
    the value of `his_key` at that epoch is taken; the mean of those values is
    returned.
    """
    best_epoch_values = [
        run[his_key][np.argmin(run['val_loss'])]
        for run in histories
    ]
    return np.mean(best_epoch_values)
    
# Loss and accuracy averaged over the runs, each taken at the run's
# lowest-val_loss epoch (see get_avg).
# NOTE(review): the history keys 'acc' / 'val_acc' are presumably tied to the
# Keras version in use — confirm before upgrading.
print('Training: \t%0.4f loss / %0.4f acc' % (get_avg(histories, 'loss'),
                                              get_avg(histories, 'acc')))
print('Validation: \t%0.4f loss / %0.4f acc' % (get_avg(histories, 'val_loss'),
                                                get_avg(histories, 'val_acc')))

In [None]:
def plot_acc_loss(title, histories, key_acc, key_loss):
    """
    Plot accuracy (left panel) and loss (right panel) curves, one line per run.

    Parameters
    ----------
    title : str
        Text shown in parentheses in both panel titles.
    histories : list of dict
        One metrics dict per run, mapping metric names to per-epoch values.
    key_acc, key_loss : str
        Keys of the accuracy and loss series to draw.
    """
    fig, axes = plt.subplots(1, 2)
    acc_axis, loss_axis = axes

    # Left panel: accuracy per epoch. Legend entries are collected while plotting.
    acc_axis.set_title('Model accuracy (%s)' % title)
    legend_labels = []
    for run_no, run in enumerate(histories, start=1):
        acc_axis.plot(run[key_acc])
        acc_axis.set_xlabel('epoch')
        legend_labels.append('Model %i' % run_no)
        acc_axis.set_ylabel('accuracy')
    acc_axis.legend(legend_labels, loc='lower right')

    # Right panel: loss per epoch, reusing the same legend entries.
    loss_axis.set_title('Model loss (%s)' % title)
    for run in histories:
        loss_axis.plot(run[key_loss])
        loss_axis.set_xlabel('epoch')
        loss_axis.set_ylabel('loss')
    loss_axis.legend(legend_labels, loc='upper right')

    fig.set_size_inches(20, 5)
    plt.show()

In [None]:
# One figure for the training curves and one for the validation curves.
plot_acc_loss('training', histories, 'acc', 'loss')
plot_acc_loss('validation', histories, 'val_acc', 'val_loss')

##### Final test (Test1 / Test2)

In [None]:
# X_test / X_test2 start out as the raw test documents; the following cell
# rebinds them to padded index sequences.
X_test = docs_test
X_test2 = docs_test2
#y_test = [0 for _ in range(len(negative_docs_test))] + [1 for _ in range(len(positive_docs_test))]

In [None]:
# Tokenise from docs_test / docs_test2 rather than from X_test / X_test2.
# This cell rebinds X_test and X_test2 to padded integer arrays, so reading
# them as input would feed those arrays back into the tokenizer if the cell
# were executed a second time.
sequences_test = tokenizer.texts_to_sequences(docs_test)
X_test = pad_sequences(sequences_test, maxlen=MAX_SEQ_LENGTH, padding='post')

sequences_test2 = tokenizer.texts_to_sequences(docs_test2)
X_test2 = pad_sequences(sequences_test2, maxlen=MAX_SEQ_LENGTH, padding='post')

In [None]:
# Evaluate every saved checkpoint (model-<run>.h5) on the first test set and
# summarise loss / accuracy across runs.
test_loss = []
test_accs = []

# Reassigned here, so the value from the parameter cell is not used.
RUNS = 1

for i in range(0,RUNS):
    cnn_ = load_model("model-%i.h5" % (i+1))
    
    # score[0] is reported as the loss and score[1] as the accuracy below.
    score = cnn_.evaluate(X_test, y_test, verbose=1)
    test_loss.append(score[0])
    test_accs.append(score[1])
    
    print('Running test with model %i: %0.4f loss / %0.4f acc' % (i+1, score[0], score[1]))
    
print('\nAverage loss / accuracy on testset: %0.4f loss / %0.4f acc' % (np.mean(test_loss),
                                                                        np.mean(test_accs)))
print('Standard deviation: (+-%0.4f) loss / (+-%0.4f) acc' % (np.std(test_loss), np.std(test_accs)))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def predictTest(model, X_test, y_test):
    """
    Print a confusion matrix and a classification report for `model` on a test set.

    The model's predictions are rounded before being compared with `y_test`.
    Nothing is returned; both results are printed.
    """
    rounded_predictions = model.predict(X_test, verbose=1).round()

    matrix = confusion_matrix(y_test, rounded_predictions)
    print("Confusion Matrix")
    print(matrix)

    print("Report")
    report = classification_report(y_test, rounded_predictions)
    print(report)

In [None]:
# Use the checkpoint loaded in the evaluation cell above (cnn_), not the
# in-memory `model` from the training loop: `model` holds the last-epoch
# weights, only exists if training ran in this kernel, and is not the network
# whose loss / accuracy was just reported.
predictTest(cnn_, X_test, y_test)

In [None]:
# Evaluate every saved checkpoint (model-<run>.h5) on the second test set and
# summarise loss / accuracy across runs. This rebinds test_loss / test_accs.
test_loss = []
test_accs = []

# Reassigned here, so the value from the parameter cell is not used.
RUNS = 1

for i in range(0,RUNS):
    cnn_ = load_model("model-%i.h5" % (i+1))
    
    # score[0] is reported as the loss and score[1] as the accuracy below.
    score = cnn_.evaluate(X_test2, y_test2, verbose=1)
    test_loss.append(score[0])
    test_accs.append(score[1])
    
    print('Running test with model %i: %0.4f loss / %0.4f acc' % (i+1, score[0], score[1]))
    
print('\nAverage loss / accuracy on testset: %0.4f loss / %0.4f acc' % (np.mean(test_loss),
                                                                        np.mean(test_accs)))
print('Standard deviation: (+-%0.4f) loss / (+-%0.4f) acc' % (np.std(test_loss), np.std(test_accs)))

In [None]:
# Use the checkpoint loaded in the evaluation cell above (cnn_), not the
# in-memory `model` from the training loop: `model` holds the last-epoch
# weights, only exists if training ran in this kernel, and is not the network
# whose loss / accuracy was just reported.
predictTest(cnn_, X_test2, y_test2)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Baseline: logistic regression fitted directly on X, the padded matrix of
# integer word indices, with the encoded training labels y.
# NOTE(review): the features are raw word ids rather than counts or
# embeddings — confirm this is the intended baseline.
clf = LogisticRegression(C=1e5)
clf.fit(X,y)

In [None]:
# Baseline predictions for the first test set.
y_pred = clf.predict(X_test)

In [None]:
# Accuracy, confusion matrix and per-class report of the baseline on the first test set.
print('ACC', accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred) 
print("Confusion Matrix")
print(cm)
print("Report")
print(classification_report(y_test, y_pred))

In [None]:
# Baseline predictions for the second test set.
y_pred2 = clf.predict(X_test2)

In [None]:
# Accuracy, confusion matrix and per-class report of the baseline on the second test set.
print('ACC', accuracy_score(y_test2, y_pred2))
cm = confusion_matrix(y_test2, y_pred2) 
print("Confusion Matrix")
print(cm)
print("Report")
print(classification_report(y_test2, y_pred2))