In [248]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import random
import warnings
import tensorflow as tf

from tensorflow import keras as ks
from tensorflow.keras import layers
from keras.models import load_model
from keras import callbacks 

# Preprocessing data

We start by loading the dataset and cleaning both the training and validation data with regex replacements.

In [249]:
def clean_data(text):
    """Normalise one post body before tokenization.

    The text is lower-cased and every character outside the pattern's
    whitelist is deleted (not replaced by a space), so the text on either
    side of a removed character is joined together.

    Kept characters: ASCII letters and digits, whitespace, ``( ) * = & |``
    and everything from ``+`` through ``/`` -- inside the character class
    that span is a range, so ``,`` and ``.`` are kept along with ``+``,
    ``-`` and ``/``.

    Args:
        text: The raw string to clean.

    Returns:
        The lower-cased string with all non-whitelisted characters removed.
    """
    not_whitelisted = r'[^(a-zA-Z0-9)\s\*\+-\/\(\)=&|]'
    return re.sub(not_whitelisted, '', text.lower())

# Read the training and validation splits from disk.
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")

# Cast Body to str first (clean_data calls .lower() on each value),
# then run the regex clean-up on every row of both splits.
train['Body'] = train['Body'].astype('str').apply(clean_data)
valid['Body'] = valid['Body'].astype('str').apply(clean_data)


# Tokenize data
The Tokenizer is initialized and fitted on the training data. The cleaned columns are then converted to integer vectors, before being padded with zeroes or shortened to a uniform length of 150 tokens (`maxlength`).

In [251]:
# Every sequence is padded or cut to this many tokens.
maxlength = 150

# The vocabulary is learned from the training bodies only; the same fitted
# tokenizer is then applied to both splits.
tokenizer = ks.preprocessing.text.Tokenizer(
    num_words=10000,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n\r',
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(train['Body'])

# Text -> lists of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train['Body'])
valid_sequences = tokenizer.texts_to_sequences(valid['Body'])

# Lists of indices -> fixed-width integer matrices of width maxlength.
train_x = ks.preprocessing.sequence.pad_sequences(train_sequences, maxlen=maxlength)
valid_x = ks.preprocessing.sequence.pad_sequences(valid_sequences, maxlen=maxlength)

# One-hot encode labels
The label columns are then converted from class names to integers, before being encoded as a one-hot matrix representation.

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,i am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,i have two table mmaster and tblappointment\r\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,"pim trying to extract us states from wiki url,...",<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"im so new to c, i wanna make an application th...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array\r\n\r\n array(\...,<php>,2016-01-01 11:34:09,LQ_EDIT
...,...,...,...,...,...,...
14995,60465681,How can I align two flex boxes to follow each ...,"pi have a menu, and id like the div.right-cont...",<html><css><flexbox>,2020-02-29 13:32:56,LQ_CLOSE
14996,60467932,C++ The correct way to multiply an integer and...,pi try to multiply an integer by a double but ...,<c++>,2020-02-29 17:46:41,LQ_CLOSE
14997,60468378,WHY DJANGO IS SHOWING ME THIS ERROR WHEN I TRY...,*urls.py*\r\n //urls.py file\r\n fro...,<django><django-views><django-templates>,2020-02-29 18:35:39,LQ_EDIT
14998,60469392,PHP - getting the content of php page,pi have a controller inside which a server is ...,<javascript><php><html>,2020-02-29 20:32:14,LQ_CLOSE


In [200]:
# Integer id for each class name in the Y column; the id is also the
# column position of that class in the one-hot matrix.
label_to_index = {'LQ_CLOSE': 0, 'LQ_EDIT': 1, 'HQ': 2}


def encode_labels(labels, mapping=label_to_index):
    """One-hot encode a Series of class names.

    Args:
        labels: pandas Series holding the class names.
        mapping: dict from class name to integer class id.

    Returns:
        The one-hot matrix produced by ``ks.utils.to_categorical`` with
        ``len(mapping)`` classes.

    Raises:
        ValueError: if ``labels`` contains a value that is not a key of
            ``mapping``. ``Series.map`` would turn such a value into NaN,
            so it is rejected here with an explicit message instead of
            being passed on to ``to_categorical``.
    """
    encoded = labels.map(mapping)
    unmapped = labels[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Labels missing from the class mapping: {list(unmapped)}")
    return ks.utils.to_categorical(encoded, num_classes=len(mapping))


train_y = encode_labels(train['Y'])
valid_y = encode_labels(valid['Y'])

# Count word occurrences
Sort the words by their number of occurrences (most frequent first) in order to identify an appropriate input dimension.

In [252]:
# (word, count) pairs from the fitted tokenizer, most frequent word first.
sortedwordindex = list(tokenizer.word_counts.items())
sortedwordindex.sort(key=lambda pair: pair[1], reverse=True)
# To inspect, look at a slice such as sortedwordindex[:20] rather than
# printing the whole list.

# Defining the model
Define a Keras Sequential model. The first layer is a word embedding layer that maps each word index to a dense vector, so that similar or associated words can end up with similar vectors. Two LSTM layers with half the dimensionality of the embedding layer are then applied, followed by a final softmax layer with three possible outputs. The model is then compiled with an SGD optimizer, using categorical cross-entropy as the loss function and recording accuracy as a metric.

In [253]:
# Embedding -> two stacked LSTMs -> 3-way softmax.
model = ks.Sequential([
    # input_dim equals the tokenizer's num_words (10000); one 128-d vector per index.
    layers.Embedding(input_length=maxlength, input_dim=10000, output_dim=128),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(64),
    # One output unit per class.
    layers.Dense(3, activation="softmax"),
])

model.compile(
    optimizer=ks.optimizers.SGD(learning_rate=0.55),
    loss=ks.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 128)          1280000   
                                                                 
 lstm_4 (LSTM)               (None, 150, 64)           49408     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,362,627
Trainable params: 1,362,627
Non-trainable params: 0
_________________________________________________________________


# Model training
The model is trained on the training data; after each epoch its performance on the validation set is recorded, but the validation set is not used for training. An EarlyStopping callback terminates training when the model has stopped improving (more precisely, when the validation loss has not improved for 5 consecutive epochs) and restores the weights from the best epoch. The model is then evaluated on the validation set and the final loss and accuracy are printed.


In [254]:
# Stop when val_loss has gone 5 epochs without reaching a new minimum,
# and roll the model back to the best weights seen.
earlystopping = callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

# Train for at most 20 epochs; the validation split is scored after each epoch.
history_callback = model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=20,
    validation_data=(valid_x, valid_y),
    callbacks=[earlystopping],
)

# Per-epoch metric lists, keyed by metric name ("loss", "val_loss", ...).
loss_history = history_callback.history

# Final score of the model on the validation split.
loss, acc = model.evaluate(valid_x, valid_y, verbose=1)
print('Loss:\t\t', loss, '\nAccuracy:\t', acc)

Epoch 1/20


2022-08-05 15:07:46.419698: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:46.648357: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:46.884537: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:47.127807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:47.605654: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-08-05 15:08:17.001706: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:08:17.077625: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:08:17.273634: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Loss:		 0.32314735651016235 
Accuracy:	 0.8535333275794983


# Printing metrics
Print the loss and accuracy values recorded for both the training and validation sets at each epoch during training.

In [None]:
# Heading text paired with the loss_history key whose per-epoch list it labels.
metric_sections = (
    ("Validation loss:\n", "val_loss"),
    ("Validation accuracy:\n", "val_accuracy"),
    ("Training loss:\n", "loss"),
    ("Training accuracy:\n", "accuracy"),
)

for heading, history_key in metric_sections:
    print(heading)
    print(str(loss_history[history_key]) + "\n\n")

Validation loss:

[0.4700574278831482, 0.4641878008842468, 0.4615233540534973, 0.3943864405155182, 0.36348074674606323, 0.45264384150505066, 0.3628832995891571, 0.3902895450592041, 0.3363644480705261, 0.5575753450393677, 0.35546600818634033, 0.32191628217697144, 0.3116975426673889, 0.29423654079437256, 0.3101593852043152, 0.291271448135376, 0.2691100239753723, 0.26175805926322937, 0.25388067960739136, 0.25590038299560547]


Validation accuracy:

[0.6692000031471252, 0.6665999889373779, 0.7293333411216736, 0.7964000105857849, 0.8364666700363159, 0.6765333414077759, 0.8308666944503784, 0.8060666918754578, 0.8493333458900452, 0.7378000020980835, 0.8322666883468628, 0.8635333180427551, 0.8600000143051147, 0.8721333146095276, 0.8723333477973938, 0.8844666481018066, 0.8881333470344543, 0.8899333477020264, 0.8948667049407959, 0.8941333293914795]


Training loss:

[0.690728485584259, 0.47954124212265015, 0.46791723370552063, 0.4348618984222412, 0.3957420587539673, 0.4712510108947754, 0.3870306

In [None]:
# check if FE token and model token matched

In [218]:
# Display the `valid_test` frame.
# NOTE(review): `valid_test` is not assigned in any cell visible here -- confirm
# it is defined earlier so this cell survives Restart Kernel -> Run All.
valid_test

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,1,test,pi just enabled instant run for my project. bu...,,,
1,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,01/01/2016 1:44,LQ_EDIT
2,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,01/01/2016 8:43,LQ_EDIT
3,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,01/01/2016 9:55,HQ
4,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,01/01/2016 10:43,LQ_EDIT
...,...,...,...,...,...,...
14996,60465681,How can I align two flex boxes to follow each ...,"<p>I have a menu, and I'd like the div.right-c...",<html><css><flexbox>,29/02/2020 13:32,LQ_CLOSE
14997,60467932,C++ The correct way to multiply an integer and...,<p>I try to multiply an integer by a double bu...,<c++>,29/02/2020 17:46,LQ_CLOSE
14998,60468378,WHY DJANGO IS SHOWING ME THIS ERROR WHEN I TRY...,*URLS.PY*\r\n //URLS.PY FILE\r\n fro...,<django><django-views><django-templates>,29/02/2020 18:35,LQ_EDIT
14999,60469392,PHP - getting the content of php page,<p>I have a controller inside which a server i...,<javascript><php><html>,29/02/2020 20:32,LQ_CLOSE
