In [None]:
# !pip install -U nltk
# import nltk
# nltk.download('stopwords')

In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

print(tf.__version__)

2.1.0


In [24]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [3]:
data_root_path = './data/'
skills_detection = []
skills_count = 1

skills_data_files = ['customer_care_intents.csv', 'insurance_intents.csv', 'mortgage_intents.csv']
skills_keys = {1: 'customer_care', 2: 'insurance', 3: 'mortgage'}

# skills_detection.append(['intent','text'])
for skills_file in skills_data_files:
    with open(data_root_path + '/' +skills_file, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        skill_name = skills_keys[skills_count]
        next(reader)
        for row in reader:
            skills_detection.append([skill_name, row[0]])
    skills_count = skills_count + 1
        


In [4]:
import pandas as pd

df = pd.DataFrame(skills_detection, columns = ['intent', 'text'])
df = df.sample(n=len(df), random_state=42)
df.head()

Unnamed: 0,intent,text
703,mortgage,should i refinance my mortgage
33,customer_care,I would like to add my son to my account. I wa...
300,customer_care,Can you help me find locations in the general ...
456,insurance,What's the lowest price for the insurance?
633,mortgage,i want a mortgage broker to call me


In [5]:
df.to_csv(data_root_path + '/skills_detection_data.csv', index = False)

In [6]:
articles = []
labels = []

with open(data_root_path + '/skills_detection_data.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)
print(len(labels))
print(len(articles))


731
731


In [7]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_articles))
print(len(train_labels))
print(len(validation_articles))
print(len(validation_labels))

print(train_labels[0] + ' : ' +train_articles[0])

584
584
584
147
147
mortgage : should refinance mortgage


In [8]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [9]:
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'i': 2,
 'what': 3,
 'how': 4,
 'mortgage': 5,
 'can': 6,
 'account': 7,
 'insurance': 8,
 'claim': 9,
 'policy': 10}

In [10]:
train_sequences = tokenizer.texts_to_sequences(train_articles)

In [11]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [12]:
print(len(train_sequences))
print(len(train_padded))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

584
584
16
200
5
200


In [13]:
print(train_articles[10])
print(train_sequences[0])
print(train_padded[0])

what information need apply mortgage
[173, 36, 5]
[173  36   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [14]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

147
(147, 200)


In [15]:
print(set(labels))
print(len(set(labels)))


{'insurance', 'mortgage', 'customer_care'}
3


In [16]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [17]:
print(training_label_seq[0])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq.shape)

print(np.asarray(train_padded[0]).shape)

[4]
(584,)
[4]
(147,)
(200,)


In [18]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('---')
print(train_articles[10])

what information need apply mortgage ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
what information need apply mortgage


In [19]:
print(type(train_padded))
print(type(training_label_seq))
print(train_padded[0])
print(training_label_seq[0])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[173  36   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
[4]


In [None]:
i = 0
train_data = []
for row in train_padded:
    train_data.append([np.asarray(train_padded[i]), np.asarray(training_label_seq[i])])
    i = i + 1
    
validation_data = []
j = 0
for row in validation_padded:
    validation_data.append((validation_padded[j], validation_label_seq[j]))
    j = j + 1

    
training_df = pd.DataFrame(train_data, columns=('intent', 'text'))
training_df.head()

target = training_df.pop('intent')

print(type(target.values.tolist()))

dataset = tf.data.Dataset.from_tensor_slices((training_df.values, target.values.tolist()))

# print(training_df['text'].values)

# training_dataset = (
#     tf.data.Dataset.from_tensor_slices(
#         (
#             tf.cast(training_df.values, tf.float32),
#             tf.cast(target.values, tf.float32)
#         )
#     )
# )

# target = training.pop('intent')
# print(type(training))
# print(type(target))
# validation = pd.DataFrame(validation_data, columns=('intent', 'text'))

# training =  np.array(train_data)
# validation =  np.array(validation_data)

# dataset = tf.data.Dataset.from_tensor_slices((training.values, target.values))


In [None]:
BUFFER_SIZE = 1000

train_batches = (
    training
    .shuffle(BUFFER_SIZE)
    .padded_batch(32))

test_batches = (
    validation
    .padded_batch(32))

In [None]:
def create_model():
    model = keras.Sequential([
    keras.layers.Embedding(encoder.vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(3)])

In [20]:

# model = tf.keras.Sequential([
#     # model.add(Dense(output_dim=8,init ='uniform',activation='relu', input_dim=len(train_x[0])))
#     tf.keras.layers.Dense(8, activation='relu', input_shape=(np.asarray(train_padded[0]).shape)),
#     # model.add(Dense(8, activation='relu', input_dim=(len(train_x))))
#     # model.add(Activation('relu'))
#     # model.add(Dropout(0.3))
#     tf.keras.layers.Dense(8, activation='relu'),
#     # model.add(Activation('relu'))
#     # model.add(Dropout(0.3))
#     tf.keras.layers.Dense(8, activation='relu'),
#     # model.add(Activation('relu'))
#     # model.add(Dropout(0.3))
#     tf.keras.layers.Dense(8, activation='softmax')
#     ])


model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, 64),
#     tf.keras.layers.Dense(64, activation='relu', input_shape=(np.asarray(train_padded[0]).shape)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(len(set(labels)), activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          128000    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 239,619
Trainable params: 239,619
Non-trainable params: 0
__________________________________________________

In [21]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(train_padded, tf.float32),
            tf.cast(training_label_seq, tf.int32)
        )
    )
)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [29]:
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)
# history = model.fit(train_padded, training_label_seq, epochs=num_epochs, verbose=2)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
  
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
import tensorflowjs as tfjs
tfjs_target_dir = './results/model/custom_nlc_tf2'
tfjs.converters.save_keras_model(model, tfjs_target_dir)

In [None]:
txt = ["cole faces lengthy injury lay-off aston villa s carlton cole could be out for six weeks with a knee injury.  the striker  who is on a season-long loan from chelsea  picked up the knock in an england under-21 match against holland earlier this month.  carlton will be out of action for four to six weeks after a bad challenge   said villa boss david o leary.  i won t be able to tell you whether he will need an operation until maybe next week. whether he has an operation has got to be left to chelsea.  cole  who also struggled with an ankle problem earlier in the season  was unable to rest because o leary had a shortage of strikers. the return to fitness of darius vassell after four months out with a broken ankle and the emergence of luke moore has alleviated some of the villa s manager s problems in that department."]
seq = tokenizer.texts_to_sequences(txt)
print(len(seq[0]))
print(seq[0])
padded = pad_sequences(seq, maxlen=max_length)
print(padded)
pred = model.predict(padded)
print(np.argmax(pred))
# print(set(labels))
labels = ['sport', 'bussiness', 'politics', 'tech', 'entertainment']
print(pred, labels[np.argmax(pred) - 1])

In [None]:
import json
word_index_json = tokenizer.word_index
with open('word_index.json', 'w') as f:
        json.dump(word_index_json, f)