This notebook develops a deep learning model that combines structured fields and unstructured text fields for conditional hierarchical classification.

In [None]:
import os
import random
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook, tnrange

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import (
    Activation,
    BatchNormalization,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    LSTM,
    Reshape,
    UpSampling2D,
)
from keras.layers.convolutional import Conv1D
from keras.layers.core import Lambda, RepeatVector, Reshape
from keras.layers.merge import concatenate, add
from keras.layers.pooling import MaxPooling1D
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.regularizers import l2

plt.style.use("ggplot")
%matplotlib inline

# NOTE(review): per the TensorFlow docs, tf.device(...) returns a context manager; called bare
# like this (not inside a `with` block) it presumably pins nothing to "gpu:1" -- confirm, and wrap
# the model-building/training code in `with tf.device(...):` if GPU pinning is intended.
tf. device("gpu:1")


Here we have two unstructured text inputs and one structured-field input.

In [None]:
def _text_cnn_bilstm_channel(text_input, vocab_size, kernel_size, embedding_dim=200, filters=32, lstm_units=20, dropout_rate=0.2):
  """Encode a token-id sequence with one Embedding -> Conv1D -> BiLSTM x2 channel.

  Args:
    text_input: Keras tensor of integer token ids (fixed sequence length).
    vocab_size: vocabulary size passed to the Embedding layer.
    kernel_size: Conv1D kernel size, i.e. the length of the word phrases convolved over.
    embedding_dim: size of each word embedding.
    filters: number of Conv1D filters.
    lstm_units: units per direction in each bidirectional LSTM.
    dropout_rate: dropout applied after the convolution.

  Returns:
    Flattened Keras tensor holding the channel's encoding.
  """
  embedded = Embedding(vocab_size, embedding_dim)(text_input)
  conv = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(embedded)
  dropped = Dropout(dropout_rate)(conv)
  pooled = MaxPooling1D(pool_size=2)(dropped)
  # Two stacked bidirectional LSTMs read the pooled phrase features in both directions.
  # No input_shape is passed: in the functional API the shape is inferred from `pooled`.
  seq = Bidirectional(LSTM(lstm_units, return_sequences=True))(pooled)
  seq = Bidirectional(LSTM(lstm_units, return_sequences=True))(seq)
  return Flatten()(seq)


def Unstructured_structured_net(unstructured_text1,unstructured_text2, structured, length_unstructured1,length_unstructured2,vocab_size_unstructured1,vocab_size_unstructured2):
  """Build the two-output model over two text inputs and one structured input.

  Args:
    unstructured_text1: Keras input tensor of token ids for the first (longer) text field.
    unstructured_text2: Keras input tensor of token ids for the second (shorter) text field.
    structured: Keras input tensor with the structured fields.
    length_unstructured1: sequence length of text 1. Unused here (shapes are inferred from
      the input tensors); kept so existing callers keep working.
    length_unstructured2: sequence length of text 2. Unused here; kept for compatibility.
    vocab_size_unstructured1: vocabulary size for text 1.
    vocab_size_unstructured2: vocabulary size for text 2.

  Returns:
    A keras Model with inputs [unstructured_text1, unstructured_text2, structured] and
    outputs [independent_output (1 sigmoid unit), dependent_output (3 sigmoid units)].
  """
  #encoder unstructured_text1

  #channel 1: kernel size 2, i.e. embeddings for 2-word phrases
  flat1 = _text_cnn_bilstm_channel(unstructured_text1, vocab_size_unstructured1, kernel_size=2)

  #channel 2: kernel size 3, i.e. embeddings for 3-word phrases (same text, separate embedding)
  flat2 = _text_cnn_bilstm_channel(unstructured_text1, vocab_size_unstructured1, kernel_size=3)

  ### we could increase channels with larger kernel_size if we want meaning-embeddings for higher word phrases

  ### merge the channel 1 and channel 2
  merged = concatenate([flat1, flat2])
  ### interpretation
  dense1 = Dense(2000, activation='relu', kernel_regularizer=l2(0.001))(merged)
  dense2 = Dense(200, activation='relu')(dense1)

  #encoder unstructured_text2
  #In our case unstructured_text2 sequence length is much shorter than unstructured_text1, so we only take 2-word phrases.
  embedding3 = Embedding(vocab_size_unstructured2, 50)(unstructured_text2) ## 50 represents embedding size for each word
  conv3 = Conv1D(filters=16, kernel_size=2, activation='relu')(embedding3) ## kernel_size=2 means we consider embedding for 2-word phrases
  drop3 = Dropout(0.2)(conv3)
  pool3 = MaxPooling1D(pool_size=2)(drop3)
  flat3 = Flatten()(pool3)
  dense3 = Dense(50, activation='relu')(flat3)

  ## here we merge dense2 (unstructured_text1), dense3 (unstructured_text2) and the structured fields
  u7 = concatenate([dense2, dense3, structured], axis=1)
  # Reshape takes a single target-shape tuple; -1 lets Keras infer the feature width
  # (200 + 50 + number of structured fields) instead of hard-coding it.
  u7 = Reshape((1, -1))(u7)

  ### decoder path (independent output)
  f1 = Flatten()(u7)
  d1 = Dense(100, activation='relu')(f1)
  drop4 = Dropout(0.2)(d1)
  d2 = Dense(10, activation='relu')(drop4)
  drop5 = Dropout(0.2)(d2)
  output1 = Dense(1, activation='sigmoid', name='independent_output')(drop5)

  ### decoder path (dependent output): each stage also sees the matching activation of the independent path
  f2 = Flatten()(u7)
  d3 = Dense(100, activation='relu')(f2)
  p11 = concatenate([d1, d3])
  drop6 = Dropout(0.2)(p11)
  d4 = Dense(10, activation='relu')(drop6)
  p12 = concatenate([d2, d4])
  drop7 = Dropout(0.2)(p12)
  output2 = Dense(3, activation='sigmoid', name='dependent_output')(drop7) ## multiclass dependent output (not mutually exclusive hence sigmoid activation)

  model = Model(inputs=[unstructured_text1, unstructured_text2, structured], outputs=[output1, output2])
  return model


In [None]:
#### define loss function

from keras import backend as K
def weighted_categorical_crossentropy(weights):
    """
    Build a class-weighted categorical cross-entropy loss.

    Adapted from https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d

    Arguments:
        weights: numpy array of shape (C,) where C is the number of classes;
                 entry i scales the loss contribution of class i.

    Returns:
        A function loss(y_true, y_pred) suitable for model.compile.

    Example:
        weights = np.array([0.5,2,10]) # class 1 at 0.5x, class 2 at 2x, class 3 at 10x
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    class_weights = K.variable(weights)

    def loss(y_true, y_pred):
        # Renormalise so the class scores of every sample sum to 1.
        normalised = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # Keep values strictly inside (0, 1) so the log below stays finite.
        clipped = K.clip(normalised, K.epsilon(), 1 - K.epsilon())
        # Weighted log-likelihood of the true classes, summed over the class axis.
        weighted_log_likelihood = y_true * K.log(clipped) * class_weights
        return -K.sum(weighted_log_likelihood, -1)

    return loss


def weighted_binary_crossentropy(zero_weight, one_weight):
    """
    Build a binary cross-entropy loss with separate weights for the two label values.

    Arguments:
        zero_weight: factor applied to the loss where y_true is 0.
        one_weight: factor applied to the loss where y_true is 1.

    Returns:
        A function (y_true, y_pred) -> mean weighted binary cross-entropy.
    """

    def weighted_binary_crossentropy(y_true, y_pred):
        # Element-wise binary cross-entropy; the stock Keras loss would take
        # K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1) at this point.
        per_element_ce = K.binary_crossentropy(y_true, y_pred)

        # Pick one_weight where the label is 1 and zero_weight where it is 0.
        per_element_weight = y_true * one_weight + (1. - y_true) * zero_weight

        # Mean over every element of the weighted error.
        return K.mean(per_element_weight * per_element_ce)

    return weighted_binary_crossentropy

In [None]:
# Number of structured (tabular) features fed to the model alongside the two text inputs.
N_STRUCTURED_FIELDS = 14

# Model inputs: two padded token-id sequences and one structured-feature vector.
# length_unstructured1/2 and vocab_size_unstructured1/2 must be defined by an earlier cell.
input1 = Input((length_unstructured1,), name='unstructured_text1')
input2 = Input((length_unstructured2,), name='unstructured_text2')
input3 = Input((N_STRUCTURED_FIELDS,), name='structured_fields')
model = Unstructured_structured_net(input1,input2,input3, length_unstructured1,length_unstructured2,vocab_size_unstructured1,vocab_size_unstructured2)

weights1=np.array([1,1]) ### weights the 0 and 1 labels of the independent output equally
weights2=np.array([10,5,1]) ### per-class weights, by priority, for the dependent output
# NOTE(review): 'dependent_output' uses sigmoid units (labels described as not mutually
# exclusive), yet weighted_categorical_crossentropy renormalises predictions across classes.
# Confirm this pairing is intended; a per-class weighted binary cross-entropy may fit better.
model.compile(loss={'independent_output': weighted_binary_crossentropy(weights1[0],weights1[1]),
                    'dependent_output': weighted_categorical_crossentropy(weights2)},
              loss_weights={'independent_output': 1.0,
                            'dependent_output': 1.0},
              optimizer='adam',
              metrics={'independent_output': 'accuracy', 'dependent_output': 'accuracy'})

In [None]:
# Training callbacks, passed to model.fit below.
callbacks = [
    # Stop once val_loss has not improved for 15 epochs.
    EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=15,
        verbose=1,
    ),
    # Multiply the learning rate by 0.1 after 5 stagnant epochs, never going below 1e-5.
    # No monitor is passed, so the Keras default applies.
    ReduceLROnPlateau(
        factor=0.1,
        patience=5,
        min_lr=0.00001,
        verbose=1,
    ),
    # Keep only the weights of the best epoch (by val_loss) on disk.
    ModelCheckpoint(
        'unstructured_structured_deep.h5',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    ),
]

In [None]:
# Fit on the training split and validate on the held-out split; targets are keyed by output-layer name.
# NOTE(review): `sructured_train` / `sructured_valid` look like typos for `structured_*`; the names are
# kept because the cell that defines them is not visible here -- confirm and rename consistently.
results = model.fit(
    [unstructured_text1_train, unstructured_text2_train, sructured_train],
    {'independent_output': outputs1_train, 'dependent_output': outputs2_train},
    batch_size=64,
    epochs=150,
    callbacks=callbacks,
    validation_data=(
        [unstructured_text1_valid, unstructured_text2_valid, sructured_valid],
        {'independent_output': outputs1_valid, 'dependent_output': outputs2_valid},
    ),
)