In [3]:
import re
import pandas as pd
import numpy as np
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Preprocessing
import nltk
from nltk.tokenize import sent_tokenize

# Modeling
import keras
from keras.models import Model, load_model
from keras.layers import Dense, Input, Dropout, MaxPooling1D, Conv1D, GlobalMaxPool1D, Bidirectional
from keras.layers import LSTM, Lambda, Bidirectional, concatenate, BatchNormalization, Embedding
from keras.layers import TimeDistributed
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K

import IPython
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

Using TensorFlow backend.


In [4]:
from Models.functions.datasets import loadTrainTest
from Models.functions.preprocessing import clean, labelEncoder, oversampling

## Modeling


In [42]:
def build_cnn(embedding_layer=None, input_dim=None,
              embedding_dim=None, filter_sizes=[3,4,5],
              feature_maps=[100,100,100], max_seq_length=100, dropout_rate = None, 
              n_classes = 2, dense_units = None, pool_size = [1,1,1], strides = [1,1,1], layers = False):

    if embedding_layer is None:
        embedding_layer = Embedding(input_dim=input_dim, output_dim=embedding_dim,
                                    input_length=max_seq_length,
                                    weights=None,
                                    trainable=True
                                   )
    
    
    x_in = Input(shape=(max_seq_length,), dtype='int32')    
    
    emb_layer = embedding_layer(x_in)
    
    channels = []
    for ix in range(len(filter_sizes)):        
        channel1 = Conv1D(feature_maps[ix], kernel_size=filter_sizes[ix], activation='relu', strides=strides[ix],
               padding='same', kernel_regularizer=regularizers.l2(0.03))(emb_layer)        
        channel1 = MaxPooling1D(pool_size=pool_size[ix], strides=strides[ix], padding='valid')(channel1)
        channels.append(channel1)
    
    # Concatenate all channels
    if len(filter_sizes) > 1:
        x = concatenate(channels)
    else:
        x = channel1

    #x = MaxPooling1D(pool_size=2, strides=1, padding='same')(x)
    x = GlobalMaxPooling1D()(x)
    #x = Flatten()(x)

    if dropout_rate is not None:
        x = Dropout(dropout_rate)(x)

    if dense_units is not None:        
        for d_units in dense_units:            
            x = Dense(units = d_units, activation = 'relu')(x)

    x = Dense(n_classes, activation='softmax')(x)
    
    return Model(inputs=x_in, outputs=x)

In [6]:
X, _, y, _ = loadTrainTest('education','brmoral','/home/rafael/Dataframe/', lang='pt')
print(display(pd.DataFrame({"text": X[0:5], "label": y[0:5]})))

print("Number of authors: {0} {1}".format(len(X), len(y)))

y, n_classes, classes_names = labelEncoder(y)

X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

Unnamed: 0,label,text
0,s012,"Como direito individual , cada ser humano tem ..."
1,s3,"Não importa o gênero , o mais importante é a a..."
2,s4,"O casamento , enquanto ato oficial , represent..."
3,s3,O casamento é além de um direito civil de asse...
4,s4,Minha religião se baseia em conceitos de que ...


None
Number of authors: 346 346


((311,), (311,), (35,), (35,))

In [39]:
n = [len(x) for x in X_train]
print(int(np.mean(n)), int(np.median(n)))
length_char = int(np.median(n))

2631 2283


In [11]:
# convert string to lower case

train_texts = []
train_texts = [i for i in X_train]

test_texts = []
test_texts = [i for i in X_test]
"""
for i in range(0, len(X_train)):
    arr = []
    for j in range(0,len(X_train[i])):
        arr.append(X_train[i][j:j+3])
    train_texts.append(arr)


for i in range(0, len(X_test)):
    arr = []
    for j in range(0,len(X_test[i])):
        arr.append(X_test[i][j:j+3])
    test_texts.append(arr)
"""
np.array(train_texts).shape, np.array(test_texts).shape

((311,), (35,))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# =======================Convert string to index================
# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)


# If we already have a character list, then replace the tk.word_index
# If not, just skip below part

# -----------------------Skip part start--------------------------
# construct a new vocabulary
alphabet2 = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1

# Use char_dict to replace the tk.word_index
#tk.word_index = char_dict.copy()
# Add 'UNK' to the vocabulary
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
#vect.vocabulary_[len(vect.vocabulary_)] = 'UNK'


# -----------------------Skip part end----------------------------

# Convert string to index
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

# Padding
train_data = pad_sequences(train_sequences, maxlen=length_char, padding='post')
test_data = pad_sequences(test_texts, maxlen=length_char, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

# =======================Get classes================


from keras.utils import to_categorical

train_classes = to_categorical(y_train)
test_classes = to_categorical(y_test)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

#vect = TfidfVectorizer(use_idf=False, analyzer='char', ngram_range=(2,3))
#vect.fit_transform(['teste isso é um teste','isso não é um teste']).toarray()

#vect.vocabulary_

In [29]:
vocab_size = 69# len(tk.word_index)
vocab_size, train_data.shape, test_data.shape

(69, (311, 2283), (35, 2283))

In [30]:
tk.word_index.items()

dict_items([('~', 62), ('u', 13), ('1', 40), (' ', 2), ('b', 24), ('ê', 34), ('s', 6), ('\x93', 54), ('n', 11), ('"', 33), ('2', 49), ('w', 61), ('g', 21), ('c', 14), ('7', 57), ('í', 31), ('\xa0', 63), ('a', 4), ('d', 9), ('e', 3), ('k', 53), ('ú', 37), ('ô', 46), ('?', 41), ('8', 48), ('õ', 36), ('h', 26), ('9', 56), ('>', 64), ('v', 18), ('3', 52), (':', 44), (',', 17), ('0', 45), ('y', 58), ('á', 30), ('r', 8), ('é', 27), ('z', 28), ('º', 60), ('p', 15), ('ã', 19), ('à', 39), ('f', 23), ('q', 20), ('m', 10), ('-', 38), ('UNK', 69), ('!', 47), ('j', 29), ('t', 12), ('â', 43), ('\x94', 55), ('o', 5), ('i', 7), ('l', 16), ("'", 59), ('6', 42), ('x', 32), ('.', 22), ('ç', 25), ('ó', 35), ('4', 51), ('5', 50)])

In [31]:
train_data[0]

array([ 4.,  6.,  2., ...,  4.,  2., 22.], dtype=float32)

In [33]:
# onehot - unigrams 
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))

for char, i in tk.word_index.items():    
    onehot = np.zeros(vocab_size)    
    onehot[i-1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [34]:
onehot

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [35]:
print(embedding_weights.shape)
embedding_weights

(65, 69)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
train_data.shape, test_data.shape

((311, 2283), (35, 2283))

In [43]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras import regularizers
from keras.layers.pooling import GlobalMaxPooling1D
from Models.functions.plot import plot_history, full_multiclass_report, plot_confusion_matrix

input_size = length_char
embedding_size = vocab_size
embedding_layer = Embedding(65, output_dim=embedding_size, input_length=input_size, weights=[embedding_weights], trainable=True)

model = build_cnn(embedding_layer=embedding_layer, input_dim=input_size, max_seq_length=input_size, embedding_dim=embedding_size, filter_sizes=[4],
                 n_classes=n_classes if n_classes > 2 else 1)
model.compile(
                loss='categorical_crossentropy',
                #loss='mean_squared_error',
                optimizer='rmsprop',
                metrics=['accuracy']
        )
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 2283)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 2283, 69)          4485      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2283, 100)         27700     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2283, 100)         0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 32,488
Trainable params: 32,488
Non-trainable params: 0
_________________________________________________________________


In [44]:
## Then train it and display the results
print("Total data: ", len(X), "train: ", len(train_data), "test: ", len(test_data))

#x_train, y_train = train_data, train_classes#oversampling(train_data, train_classes)
x_train, y_train = oversampling(train_data, train_classes)
#x_test, y_test = test_data, test_classes#oversampling(test_data, test_classes)
x_test, y_test = oversampling(test_data, test_classes)

#x_train = train_data
#y_train = train_classes
#x_test = test_data
#y_test = test_classes

history = model.fit(x_train,
                    y_train,
                    epochs=50,
                    validation_data=(x_test, y_test),
                    #batch_size=params['batch_size'],
                    verbose = 1,
                       callbacks=[
                           #ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01),
                           EarlyStopping(monitor='val_loss', min_delta=0.001, patience=4, verbose=1)
                  ])

directory='/tmp/'


plot_history(history, directory=directory, show=True)

full_multiclass_report(model,
                       x_test,
                       y_test,
                       classes=classes_names,
                       directory=directory
                      )
                       #batch_size=32,
                       #binary= )
        
        
#result = result
# get_results(model, y_test, model.predict_classes(X_test))

Total data:  346 train:  311 test:  35
Train on 354 samples, validate on 36 samples
Epoch 1/50

InvalidArgumentError: indices[14,47] = 69 is not in [0, 65)
	 [[{{node embedding_4/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_4/embeddings/read, _arg_input_4_0_0, embedding_4/embedding_lookup/axis)]]

In [13]:
x_train.shape

(1360, 17006)