In [1]:
import numpy as np
import tensorflow as tf
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Switch TensorFlow (1.x API) to eager execution; the rest of the notebook
# calls .numpy() on tensors and trains with tf.GradientTape.
tf.enable_eager_execution()

In [4]:
def load_data(folder_path):
    '''
    Load every file whose name ends in "json" from a folder into parallel lists.

    Each file is parsed as a JSON array of objects carrying a 'question' key.
    All questions of one file share one integer label, which is the position of
    that file's category in the returned ``categories`` list. Files are visited
    in sorted name order so the label -> category mapping is reproducible
    (``os.listdir`` alone returns entries in arbitrary order).

    :param folder_path: the folder path that has json file for each category containing questions and answers
    :return: tuple ``(X, y, categories)`` -- list of question strings, list of
             integer labels (same length as X), list of category names
             (file names with ".json" removed)
    '''
    print("loading the data ... ")
    X = []
    y = []
    categories = []
    for f in sorted(os.listdir(folder_path)):
        if not f.endswith("json"):  # read all json files, skip everything else
            continue
        file_path = os.path.join(folder_path, f)
        # Context manager: the handle is closed even if parsing raises.
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = json.load(handle)
        # The label is the index this category is about to occupy.
        label = len(categories)
        for html_item in contents:
            X.append(html_item['question'])
            y.append(label)
        categories.append(f.replace(".json", ""))
    print("current categories: "+str(categories))
    return X, y, categories

In [5]:
def pre_process(X, y, random_state=None):
    '''
    Tokenize the texts, pad them to a fixed length and split into train/test.

    Reads the module-level settings MAX_VOCAB_SIZE (vocabulary cap for the
    tokenizer), MAX_SEQUENCE_LENGTH (padded length) and REL_PATH (output
    folder). As a side effect the fitted tokenizer and its word index are
    pickled into REL_PATH.

    :param X: full data
    :param y: full labels
    :param random_state: optional seed forwarded to ``train_test_split``; the
                         default ``None`` keeps the original unseeded split
    :return: X_train, X_test, y_train, y_test, word2idx
    '''
    # Tokenization (punctuation and digits are stripped via `filters`)
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789'
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters=filters)
    tokenizer.fit_on_texts(X)
    # Create a sequence of word indices per text
    sequences = tokenizer.texts_to_sequences(X)

    word2idx = tokenizer.word_index

    X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    y_onehot = to_categorical(y)

    # Train-test-split (1% held out)
    X_train, X_test, y_train, y_test = train_test_split(
        X_padded, y_onehot, test_size=0.01, random_state=random_state)

    # Persist the vocabulary artefacts; `with` guarantees the files are flushed
    # and closed before the function returns.
    # NOTE(review): AoLClassifier reads "tokenizers/tokenizer.pkl", not
    # REL_PATH + "/tokenizer.pkl" -- confirm which location is intended.
    with open(REL_PATH + "/word2idx.pkl", "wb") as handle:
        pickle.dump(word2idx, handle)
    with open(REL_PATH + "/tokenizer.pkl", "wb") as handle:
        pickle.dump(tokenizer, handle)
    return X_train, X_test, y_train, y_test, word2idx

In [6]:
def get_embeddings(path, word_index):
    '''
    Read pre-trained word vectors and assemble the matrix for the embedding layer.

    Uses the module-level constants MAX_VOCAB_SIZE and EMBEDDING_DIM for the
    matrix shape.

    :param path: folder containing ``vectors.txt``; each line is split on single
                 spaces into a word followed by its vector components
    :param word_index: the dictionary of the words to indices
    :return: ``(embeddings_index, embeddings_matrix)`` -- a dict mapping each word
             of ``word_index`` found in the file to its float32 vector, and a
             ``(MAX_VOCAB_SIZE, EMBEDDING_DIM)`` matrix whose row ``i`` holds the
             vector of the word with index ``i`` (rows stay zero when no vector
             was found)
    '''
    # Keep only the vectors of words that occur in our own vocabulary
    print('Loading word vectors...')
    embeddings_index = {}
    with open(path + "/vectors.txt", 'r', encoding='utf-8') as vector_file:
        for raw_line in vector_file:
            token, *components = raw_line.split(' ')
            if token not in word_index:
                continue
            embeddings_index[token] = np.array(components, dtype=np.float32)

    # Fill the matrix row by row; indices beyond the vocabulary cap are skipped
    print('Creating the emebdding matrix')
    embeddings_matrix = np.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM))
    for token, row in word_index.items():
        if row >= MAX_VOCAB_SIZE:
            continue
        known_vector = embeddings_index.get(token)
        if known_vector is None:
            continue
        embeddings_matrix[row] = known_vector
    return embeddings_index, embeddings_matrix

In [7]:
class ClassifierModel(tf.keras.Model):
    """Text classifier: embedding -> bidirectional CuDNN LSTM -> global max pool -> dense logits."""

    def __init__(self, embeddings_matrix, n_classes, max_len=500, hidden_dim=128):
        """
        :param embeddings_matrix: either a numpy array of pre-trained embeddings
                                  (loaded as frozen weights) or a
                                  ``(vocab_size, embedding_dim)`` pair, in which
                                  case a fresh trainable embedding is created
        :param n_classes: number of output classes (size of the logits vector)
        :param max_len: input length passed to the embedding layer when a
                        pre-trained matrix is supplied
        :param hidden_dim: units of each LSTM direction
        """
        super(ClassifierModel, self).__init__()

        # Construct from existing embeddings matrix
        if isinstance(embeddings_matrix, np.ndarray):
            self.embedding = tf.keras.layers.Embedding(
                                embeddings_matrix.shape[0],
                                embeddings_matrix.shape[1],
                                weights=[embeddings_matrix],
                                input_length=max_len,
                                trainable=False)

        # embeddings matrix is just a tuple with vocab size and embedding_dim
        else:
            self.embedding = tf.keras.layers.Embedding(embeddings_matrix[0], embeddings_matrix[1])
        self.lstm = tf.keras.layers.Bidirectional(tf.keras.layers.CuDNNLSTM(hidden_dim, return_sequences=True))
        self.dense = tf.keras.layers.Dense(n_classes)
        self.max_pool = tf.keras.layers.GlobalMaxPool1D()

    def call(self, inputs, probs=False):
        """Return logits for ``inputs``, or softmax probabilities when ``probs`` is True."""

        X = self.embedding(inputs)
        X = self.lstm(X)
        X = self.max_pool(X)
        logits = self.dense(X)
        if probs:
            return tf.nn.softmax(logits, axis=-1)
        return logits

    def compute_loss(self, y_true, y_pred_logits):
        """Computes the softmax cross-entropy loss between one-hot labels and predicted logits."""

        return tf.losses.softmax_cross_entropy(y_true, y_pred_logits)

    def score(self, X, y):
        """Computes and returns ``(accuracy, confusion_matrix)`` on given validation X and y.

        :param X: token-id inputs, as a numpy array or a tensor
        :param y: one-hot labels, as a numpy array, a tensor or an array-like
        """

        # Token ids are integers; convert as int32 (the same dtype
        # AoLClassifier._preprocess uses) instead of round-tripping through float32.
        if isinstance(X, np.ndarray):
            X = tf.convert_to_tensor(X, dtype=tf.int32)

        y_pred = self.call(X, probs=True).numpy()

        # sklearn needs plain arrays: unwrap tensors, coerce other array-likes
        if not isinstance(y, np.ndarray):
            y = y.numpy() if hasattr(y, "numpy") else np.asarray(y)

        true_labels = np.argmax(y, axis=1)
        predicted_labels = np.argmax(y_pred, axis=1)
        acc = accuracy_score(true_labels, predicted_labels)
        cm = confusion_matrix(true_labels, predicted_labels)
        return acc, cm
    

class AoLClassifier:
    """Inference wrapper around ClassifierModel: loads a pickled tokenizer and saved weights."""

    def __init__(self, embeddings_matrix=(30000, 300), max_len=500, n_classes=10, hidden_dim=128):
        """
        :param embeddings_matrix: forwarded to ClassifierModel (array or
                                  ``(vocab_size, embedding_dim)`` pair)
        :param max_len: length every input is padded/truncated to
        :param n_classes: number of output classes
        :param hidden_dim: units of each LSTM direction
        """

        self._max_len = max_len
        self._load(hidden_dim, n_classes, embeddings_matrix)

    def _load(self, hidden_dim, n_classes, embeddings_matrix):
        """Load tokenizers and model."""

        # ----- Load tokenizers and create word indices ----- #
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open("tokenizers/tokenizer.pkl", "rb") as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index

        # ----- Load model and weights ----- #
        # max_len is forwarded so the model and the padding in _preprocess agree.
        self.model = ClassifierModel(embeddings_matrix, n_classes,
                                     max_len=self._max_len, hidden_dim=hidden_dim)
        self.model.load_weights(os.path.join("models", "classifier_" + "bidirectional_20epochs"))

    def _preprocess(self, text):
        """Tokenize a single text, pad it to ``max_len`` and return it as an int32 tensor."""

        # Tokenize, pad and convert to tensor (a batch holding one sequence)
        inputs = self.tokenizer.texts_to_sequences([text])
        inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=self._max_len,
                                                           padding='post')
        inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)
        return inputs

    def classify_text(self, raw_text):
        """Return the class probabilities for the supplied raw text as a numpy array."""

        # Preprocess question
        processed_text = self._preprocess(raw_text)
        # Make predictions and return
        probs = self.model(processed_text, probs=True).numpy()
        return probs

    
def run_epoch(dataset, model, optimizer, p_bar):
    """Iterates once over the entire dataset, applying one optimizer step per batch.

    :param dataset: iterable yielding ``(X_batch, y_batch)`` pairs
    :param model: model exposing ``__call__(X, probs=False)``, ``compute_loss``
                  and ``trainable_variables``; its call may return bare logits
                  (ClassifierModel) or a ``(logits, attention_weights)`` pair
                  (AttentionClassifierModel)
    :param optimizer: optimizer exposing ``apply_gradients``
    :param p_bar: progress bar exposing ``update`` and ``set_postfix_str``
    """

    for X_batch, y_batch in dataset:

        p_bar.update(1)
        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as tape:

            # Make predictions; keep only the logits if extras are returned too
            outputs = model(X_batch, probs=False)
            y_pred = outputs[0] if isinstance(outputs, (tuple, list)) else outputs

            # Compute loss
            loss = model.compute_loss(y_batch, y_pred)

        # Compute gradients (outside the tape context, so the backward pass
        # itself is not recorded)
        gradients = tape.gradient(loss, model.trainable_variables)

        # Apply gradients
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Some verbose
        p_bar.set_postfix_str("Training loss: {0:.3f}".format(loss.numpy()))

In [8]:
# ---- Output location ----
REL_PATH = "classifier"        # folder that receives the pickled tokenizer / word index

# ---- Text preprocessing ----
MAX_SEQUENCE_LENGTH = 500      # every text is padded to this many tokens
MAX_VOCAB_SIZE = 30000         # vocabulary cap used by the tokenizer and the embedding matrix
EMBEDDING_DIM = 300            # width of one word vector

# ---- Optimisation ----
EPOCHS = 20
BATCH_SIZE = 128
LEARNING_RATE = 0.001

In [9]:
# Read all category files: X holds the question texts, y the integer label of
# each question, categories the label names.
X, y, categories = load_data("../Dataset/")

loading the data ... 
current categories: ['Arbeitsrecht', 'Erbrecht', 'Familienrecht', 'Kaufrecht', 'Mietrecht _ Wohnungseigentum', 'Oeffentlichesrecht', 'Sozialversicherungsrecht', 'Steuerrecht', 'Strafrecht', 'Vertragsrecht']


In [10]:
# Tokenize, pad and split; also writes the tokenizer pickles into REL_PATH.
X_train, X_test, y_train, y_test, word2idx = pre_process(X, y)

In [11]:
# Sanity check: one shape per line, in the order train/test inputs, train/test labels.
print(*(split.shape for split in (X_train, X_test, y_train, y_test)), sep="\n")

(99000, 500)
(1000, 500)
(99000, 10)
(1000, 10)


In [17]:
# Build the pre-trained embedding matrix for the fitted vocabulary
# (reads "../embeddings//vectors.txt").
embeddings_index, embeddings_matrix = get_embeddings("../embeddings/", word2idx)

Loading word vectors...
Creating the emebdding matrix


In [18]:
# Convert the inputs to a Dataset.
# The shuffle buffer spans the whole training set; batches hold BATCH_SIZE examples.
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.shuffle(X_train.shape[0]).batch(BATCH_SIZE)

In [35]:
# Max-pool classifier on the pre-trained embeddings, plus its Adam optimizer.
model = ClassifierModel(embeddings_matrix, len(categories))
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

In [37]:
# Save the weights under the name AoLClassifier._load reads back.
# NOTE(review): no training cell precedes this one in the saved cell order,
# although the name says "20epochs" -- confirm the model was trained before saving.
model.save_weights(os.path.join("models", "classifier_" + "bidirectional_20epochs"))

In [55]:
# Inference wrapper built with its defaults; loads the tokenizer pickle and saved weights.
test = AoLClassifier()

In [56]:
# Smoke test on a single sentence: returns one row of class probabilities.
test.classify_text("Hallo, mein Name is Mohhammed")

array([[0.04171598, 0.11093291, 0.07323458, 0.08582268, 0.03053827,
        0.16059865, 0.15863264, 0.10529935, 0.1893417 , 0.04388329]],
      dtype=float32)

In [12]:
class AttentionClassifierModel(tf.keras.Model):
    """Text classifier: embedding -> bidirectional CuDNN LSTM -> Bahdanau attention -> dense(64, elu) -> logits.

    Unlike ClassifierModel, ``call`` returns a pair ``(predictions, attention_weights)``.
    """

    def __init__(self, embeddings_matrix, n_classes, max_len=500, hidden_dim=128):
        """
        :param embeddings_matrix: either a numpy array of pre-trained embeddings
                                  (loaded as trainable weights) or a
                                  ``(vocab_size, embedding_dim)`` pair, in which
                                  case a fresh embedding is created
        :param n_classes: number of output classes (size of the logits vector)
        :param max_len: input length passed to the embedding layer when a
                        pre-trained matrix is supplied
        :param hidden_dim: units of each LSTM direction and of the attention layer
        """
        super(AttentionClassifierModel, self).__init__()

        # Construct from existing embeddings matrix (fine-tuned during training)
        if isinstance(embeddings_matrix, np.ndarray):
            self.embedding = tf.keras.layers.Embedding(
                                embeddings_matrix.shape[0],
                                embeddings_matrix.shape[1],
                                weights=[embeddings_matrix],
                                input_length=max_len,
                                trainable=True)

        # Construct a new embeddings matrix
        # Embeddings matrix is just a tuple with vocab size and embedding_dim
        else:
            self.embedding = tf.keras.layers.Embedding(embeddings_matrix[0], embeddings_matrix[1])
        self.lstm = tf.keras.layers.Bidirectional(
                                        tf.keras.layers.CuDNNLSTM(hidden_dim,
                                                        return_state=True,
                                                        return_sequences=True))
        self.attention = BahdanauAttention(hidden_dim)
        self.dense = tf.keras.layers.Dense(64, activation='elu')
        self.cls = tf.keras.layers.Dense(n_classes)

    def call(self, inputs, probs=False):
        """Return ``(logits, attention_weights)``, or ``(probabilities, attention_weights)`` when ``probs`` is True."""

        X = self.embedding(inputs)
        # With return_state=True the layer returns the per-step output first,
        # followed by the final states.
        X = self.lstm(X)
        output = X[0]
        # Every returned state is concatenated into the attention query.
        hiddens = tf.concat(X[1:], axis=-1)
        context, attention_weights = self.attention(hiddens, output)
        X = self.dense(context)
        logits = self.cls(X)
        if probs:
            return tf.nn.softmax(logits, axis=-1), attention_weights
        return logits, attention_weights

    def compute_loss(self, y_true, y_pred_logits):
        """Computes the softmax cross-entropy loss between one-hot labels and predicted logits."""

        return tf.losses.softmax_cross_entropy(y_true, y_pred_logits)

    def score(self, X, y):
        """Computes and returns ``(accuracy, confusion_matrix)`` on given validation X and y.

        :param X: token-id inputs, as a numpy array or a tensor
        :param y: one-hot labels, as a numpy array, a tensor or an array-like
        """

        # Token ids are integers; convert as int32 (the same dtype
        # AttentionAoLClassifier._preprocess uses) rather than float32.
        if isinstance(X, np.ndarray):
            X = tf.convert_to_tensor(X, dtype=tf.int32)

        y_pred, _ = self.call(X, probs=True)
        y_pred = y_pred.numpy()

        # sklearn needs plain arrays: unwrap tensors, coerce other array-likes
        if not isinstance(y, np.ndarray):
            y = y.numpy() if hasattr(y, "numpy") else np.asarray(y)

        true_labels = np.argmax(y, axis=1)
        predicted_labels = np.argmax(y_pred, axis=1)
        acc = accuracy_score(true_labels, predicted_labels)
        cm = confusion_matrix(true_labels, predicted_labels)
        return acc, cm

    
class BahdanauAttention(tf.keras.Model):
    """Additive attention: weights every step of ``values`` against one ``query`` and sums them."""

    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Assumes ``query`` is (batch, features) and ``values`` is
        (batch, steps, features), as passed by AttentionClassifierModel.call
        -- TODO confirm for other callers.
        """
        # Insert a length-1 axis at position 1 so the projected query can be
        # added to the projected values by broadcasting.
        query_expanded = tf.expand_dims(query, 1)

        projected_values = self.W1(values)
        projected_query = self.W2(query_expanded)
        energies = tf.nn.tanh(projected_values + projected_query)

        # self.V is Dense(1), so the last axis of the score has size 1.
        score = self.V(energies)

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values along axis 1 collapses that axis.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights
    
    
class AttentionAoLClassifier:
    """Inference wrapper around AttentionClassifierModel: loads a pickled tokenizer and saved weights."""

    def __init__(self, embeddings_matrix=(30000, 300), max_len=500, n_classes=10, hidden_dim=128):
        """
        :param embeddings_matrix: forwarded to AttentionClassifierModel (array or
                                  ``(vocab_size, embedding_dim)`` pair)
        :param max_len: length every input is padded/truncated to
        :param n_classes: number of output classes
        :param hidden_dim: units of each LSTM direction
        """

        self._max_len = max_len
        self._load(hidden_dim, n_classes, embeddings_matrix)

    def _load(self, hidden_dim, n_classes, embeddings_matrix):
        """Load tokenizers and model."""

        # ----- Load tokenizers and create word indices ----- #
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open("tokenizers/tokenizer.pkl", "rb") as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index

        # ----- Load model and weights ----- #
        # max_len is forwarded so the model and the padding in _preprocess agree.
        self.model = AttentionClassifierModel(embeddings_matrix, n_classes,
                                              max_len=self._max_len, hidden_dim=hidden_dim)
        self.model.load_weights(os.path.join("models", "classifier_attention_" + "bidirectional_15epochs"))

    def _preprocess(self, text):
        """Tokenize a single text, pad it to ``max_len`` and return it as an int32 tensor."""

        # Tokenize, pad and convert to tensor (a batch holding one sequence)
        inputs = self.tokenizer.texts_to_sequences([text])
        inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=self._max_len,
                                                           padding='post')
        inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)
        return inputs

    def classify_text(self, raw_text):
        """Return ``(probabilities, attention_weights)`` for the supplied raw text as numpy arrays."""

        # Preprocess question
        processed_text = self._preprocess(raw_text)
        # Make predictions and return
        probs, attn_weights = self.model(processed_text, probs=True)
        probs, attn_weights = probs.numpy(), attn_weights.numpy()
        return probs, attn_weights

In [61]:
# Fresh model and optimizer for the training loop below.
# NOTE(review): this repeats the earlier ClassifierModel cell, while the weights
# saved after training are named "classifier_attention_..." -- confirm which
# model class was actually trained.
model = ClassifierModel(embeddings_matrix, len(categories))
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

## Train for 5 epochs

In [79]:
# Batches per epoch, rounded up: a final partial batch is counted, and an
# evenly divisible dataset is not over-counted by one.
steps_per_epoch = (X_train.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(1, 6):

    # Show a nice progressbar during epoch
    with tqdm(total=steps_per_epoch, desc='Epoch {}'.format(epoch)) as p_bar:

        # Run single epoch
        run_epoch(dataset, model, optimizer, p_bar)

    # Validation score after epoch end
    acc, cm = model.score(X_test, y_test)
    print('Epoch: {}, accuracy: {}'.format(epoch, acc))
    print('Confusion matrix: ', cm)

In [80]:
# Save the weights under the name AttentionAoLClassifier._load reads back.
# NOTE(review): in the saved cell order `model` is a ClassifierModel trained for
# 5 epochs, yet the file name says "attention" and "15epochs" -- confirm.
model.save_weights(os.path.join("models", "classifier_attention_" + "bidirectional_15epochs"))

In [81]:
# Accuracy and confusion matrix on the held-out split.
model.score(X_test, y_test)

(0.776, array([[86,  1,  1,  1,  2,  4,  6,  3,  2,  4],
        [ 1, 90,  2,  0,  5,  0,  3,  2,  0,  2],
        [ 0,  5, 71,  1,  2,  7,  5,  2,  2,  0],
        [ 0,  1,  2, 80,  1,  0,  1,  3,  5,  6],
        [ 0,  2,  0,  1, 88,  0,  2,  1,  0,  5],
        [ 1,  0,  1,  2,  2, 74,  2,  2,  6,  3],
        [ 5,  0,  6,  2,  2,  3, 81,  2,  1,  2],
        [ 2,  5,  1,  3,  3,  8,  1, 71,  1,  0],
        [ 2,  1,  2,  2,  4,  3,  2,  1, 75,  1],
        [ 7,  0,  2, 19,  8,  5,  2,  0,  4, 60]], dtype=int64))

In [97]:
# Rebind `model` to the attention variant (replaces the ClassifierModel above).
model = AttentionClassifierModel(embeddings_matrix, len(categories))

In [98]:
# Restore the attention model's weights from the checkpoint saved earlier.
model.load_weights(os.path.join("models", "classifier_attention_" + "bidirectional_15epochs"))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x2a5e8501470>

In [90]:
# Accuracy and confusion matrix on the held-out split.
# NOTE(review): this cell's execution count is lower than that of the two cells
# above it, so its stored output may predate the weight reload -- re-run to confirm.
model.score(X_test, y_test)

(0.776, array([[86,  1,  1,  1,  2,  4,  6,  3,  2,  4],
        [ 1, 90,  2,  0,  5,  0,  3,  2,  0,  2],
        [ 0,  5, 71,  1,  2,  7,  5,  2,  2,  0],
        [ 0,  1,  2, 80,  1,  0,  1,  3,  5,  6],
        [ 0,  2,  0,  1, 88,  0,  2,  1,  0,  5],
        [ 1,  0,  1,  2,  2, 74,  2,  2,  6,  3],
        [ 5,  0,  6,  2,  2,  3, 81,  2,  1,  2],
        [ 2,  5,  1,  3,  3,  8,  1, 71,  1,  0],
        [ 2,  1,  2,  2,  4,  3,  2,  1, 75,  1],
        [ 7,  0,  2, 19,  8,  5,  2,  0,  4, 60]], dtype=int64))

In [13]:
# Inference wrapper built with its defaults; loads the tokenizer pickle and the
# saved attention-model weights.
classifier = AttentionAoLClassifier()

In [17]:
# Sample question (German, lower-cased, no punctuation) used for the demo prediction.
txt = """ich möchte meine wohnung verkaufen ein freund schilderte mir seine eigene bei der der käufer die kaufpreiszahlung um wegen angeblich eingetretenen mängeln zwischen und kaufpreiszahlung jetzt geht die sache vor gericht wie kann man sowas vermeiden etwa durch ein rücktrittsrecht für den verkäufer wegen des kaufpreises"""

In [14]:
# Accuracy and confusion matrix of the loaded attention model on the held-out split.
classifier.model.score(X_test, y_test)

(0.882, array([[101,   0,   0,   0,   0,   1,   4,   0,   3,   1],
        [  0,  77,   1,   0,   1,   1,   1,   1,   2,   1],
        [  0,   3,  89,   0,   2,   2,   4,   1,   1,   0],
        [  0,   3,   0,  98,   1,   1,   0,   2,   0,   9],
        [  0,   0,   0,   1,  83,   1,   0,   0,   0,   4],
        [  0,   2,   2,   0,   2,  83,   0,   0,   4,   3],
        [  3,   2,   4,   0,   0,   0,  93,   1,   1,   0],
        [  1,   2,   3,   0,   0,   3,   1, 103,   0,   1],
        [  1,   0,   0,   0,   0,   1,   1,   1,  83,   0],
        [  1,   2,   0,  15,   4,   2,   1,   1,   1,  72]], dtype=int64))

In [19]:
# Index of the most probable class for the sample text; element [0] of the
# result is the probability array, and the index maps into `categories`.
classifier.classify_text(txt)[0].argmax()

3