In [1]:
from keras.layers import Activation, Input, Dense, Flatten, Dropout, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras import regularizers
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import itertools
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# Empty DataFrame placeholder.
# NOTE(review): `result` is not referenced by any cell visible in this section — confirm it is still needed.
result = pd.DataFrame()

Using TensorFlow backend.


In [2]:
from Models.functions.plot import plot_history, full_multiclass_report
from Models.functions.preprocessing import clean, labelEncoder


In [3]:
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import ThresholdedReLU
from keras.layers import Dropout
from keras.callbacks import TensorBoard

In [4]:
# CharCNNZhang
def build_model(
                input_size,
                alphabet_size,
                conv_layers,
                fully_connected_layers,
                embedding_size,
                threshold,
                dropout_p,
                num_of_classes,
                optimizer='adam',
                loss='sparse_categorical_crossentropy'
               ):
    """
    Build and compile the character-level CNN (CharCNNZhang) classifier.

    Args:
        input_size (int): Number of character indexes in each input sample.
        alphabet_size (int): Number of characters in the alphabet. The
            embedding is created with ``alphabet_size + 1`` rows so that
            index 0 is also a valid input.
        conv_layers (list): One ``[filters, kernel_size, pool_size]`` entry per
            convolution block. A ``pool_size`` of -1 skips the max-pooling
            step for that block.
        fully_connected_layers (list): Number of units of each dense layer
            placed after the convolution stack.
        embedding_size (int): Dimension of the character embedding vectors.
        threshold (float): Threshold passed to every ThresholdedReLU activation.
        dropout_p (float): Dropout rate applied after each dense layer.
        num_of_classes (int): Number of units of the softmax output layer.
        optimizer: Optimizer forwarded to ``model.compile``.
        loss: Loss forwarded to ``model.compile``. The default is the sparse
            categorical cross-entropy, which takes integer class labels
            (use 'categorical_crossentropy' for one-hot labels).

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    # Input layer: one integer character index per position
    inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')
    # Embedding layer
    x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)
    # Convolution blocks: convolution -> thresholded ReLU -> optional max-pooling
    for n_filters, kernel_size, pool_size in conv_layers:
        x = Convolution1D(n_filters, kernel_size)(x)
        x = ThresholdedReLU(threshold)(x)
        if pool_size != -1:
            x = MaxPooling1D(pool_size)(x)
    x = Flatten()(x)
    # Fully connected blocks: dense -> thresholded ReLU -> dropout
    for n_units in fully_connected_layers:
        x = Dense(n_units)(x)
        x = ThresholdedReLU(threshold)(x)
        x = Dropout(dropout_p)(x)
    # Output layer
    predictions = Dense(num_of_classes, activation='softmax')(x)
    # Build and compile model
    # NOTE(review): no metrics are passed to compile(), so the training history
    # only records loss values — confirm that is what the plotting code expects.
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=optimizer, loss=loss)
    print("CharCNNZhang model built: ")
    model.summary()
    return model

### Load dataset

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from Models.functions.datasets import loadTrainTest

# Only the first and third values returned by loadTrainTest are kept; the other two are discarded.
X, _, y, _ = loadTrainTest(
    "gender",
    "brblogset",
    "/home/rafael/GDrive/Data/Dataframe/",
)
# labelEncoder replaces y and also reports the number of classes and their names.
y, n_classes, classes_names = labelEncoder(y)

# Disabled word-level bag-of-words alternative:
#vect = CountVectorizer(analyzer="word", max_features=30000)
#X_tfidf = vect.fit_transform(X).toarray()
#X_tfidf.shape

### Data

In [6]:
# Sanity check: len() on a str returns its character count.
len('abc')

3

In [7]:
# Mean and median document length in characters, truncated to integers.
mean = int(np.mean(list(map(len, X.values))))
median = int(np.median(list(map(len, X.values))))
mean, median

(38567, 16186)

In [8]:
def format_data(data, input_size=1024):
    """
    Convert raw texts into fixed-length arrays of character indexes.

    Each text is lower-cased and read from its LAST character backwards: the
    final character is stored at position 0, the one before it at position 1,
    and so on, for at most ``input_size`` characters. Positions beyond the end
    of a short text, and characters that are not in the alphabet, are left as 0.

    Args:
        data: Iterable of str, one entry per document.
        input_size (int): Number of character positions kept per document.
            Defaults to 1024.

    Returns:
        tuple: ``(indices, classes, dict_char, alphabet_size)`` where
            indices (np.ndarray): int64 array with one row of ``input_size``
                character indexes per document.
            classes (np.ndarray): Always empty — no labels are produced here.
            dict_char (dict): Maps each alphabet character to its 1-based index.
            alphabet_size (int): Length of the alphabet string.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet_size = len(alphabet)
    # '-' occurs twice in the alphabet; the later position wins in the mapping,
    # while alphabet_size still counts both occurrences.
    dict_char = {char: idx + 1 for idx, char in enumerate(alphabet)}

    def str_to_indexes(s, length, dict_char):
        """
        Convert a string to character indexes based on character dictionary.

        Args:
            s (str): String to be converted to indexes
            length (int): Size of the returned array
            dict_char (dict): Character -> index mapping
        Returns:
            str2idx (np.ndarray): Indexes of the characters of s, last character first
        """
        s = s.lower()
        n_chars = min(len(s), length)
        str2idx = np.zeros(length, dtype='int64')
        for i in range(n_chars):
            # s[-1 - i] walks the string backwards; unknown characters map to 0
            str2idx[i] = dict_char.get(s[-1 - i], 0)
        return str2idx

    batch_indices = [str_to_indexes(s, input_size, dict_char) for s in data]
    # Kept in the return value for interface compatibility; never populated.
    classes = []
    return np.asarray(batch_indices, dtype='int64'), np.asarray(classes), dict_char, alphabet_size

# Encode every document as a fixed-length row of character indexes.
# `cl` is always empty here: format_data never appends to its class list; the labels live in `y`.
X_char, cl, dict_char, alphabet_size = format_data(X.values)

### Split

In [9]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Hold out 20% of the samples for the final report. Fixing random_state makes the
# split (and therefore every downstream metric) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X_char, y, test_size=0.2, random_state=42)

# Labels stay integer-encoded: build_model defaults to the
# 'sparse_categorical_crossentropy' loss, so no to_categorical conversion is applied.

x_test.shape, x_train[1].shape

((417, 1024), (1024,))

In [10]:
# Number of character positions per sample (the training split is bound to lowercase `x_train`).
x_train.shape[1]

1024

### Model

In [None]:
# Hyper-parameters for build_model plus the training settings (epochs, batch_size).
params_grid = dict(
        # The training split is named `x_train` (lowercase); `X_train` is undefined.
        input_size = x_train.shape[1],
        alphabet_size = alphabet_size,
        embedding_size = 128,
        # feature maps, kernel, maxpooling (-1 disables pooling for that block)
        conv_layers = [[256,7,3]],#,[256,7,3],[256,3,-1],[256,3,-1],[256,3,-1],[256,3,3]],
        fully_connected_layers = [1024],#, 2014],
        threshold = 1e-6,
        dropout_p = 0.5,
        num_of_classes = n_classes,
        # Upper bound only: training is cut short by the EarlyStopping callback.
        epochs = 5000,
        batch_size = 64
)

In [None]:
params = params_grid

## Build the model from the selected hyper-parameters.
## optimizer and loss are not forwarded, so build_model's defaults apply.
model = build_model(**{
    name: params[name]
    for name in (
        "input_size",
        "alphabet_size",
        "embedding_size",
        "conv_layers",
        "fully_connected_layers",
        "num_of_classes",
        "threshold",
        "dropout_p",
    )
})

## Train it, holding out 20% of the training data for validation.
## Training stops once val_loss has not improved by at least 0.01 for 4 epochs.
## (Disabled alternative: ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01))
history = model.fit(
    x_train,
    y_train,
    epochs=params['epochs'],
    batch_size=params['batch_size'],
    validation_split=0.2,
    verbose=1,
    callbacks=[
        EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1),
    ],
)

model.summary()

## Display the results
directory='/home/rafael/'

plot_history(history, directory=directory, show=True)

## Optional arguments left at their defaults: batch_size, binary
full_multiclass_report(
    model,
    x_test,
    y_test,
    classes=classes_names,
    directory=directory,
)