In [248]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import random
import warnings
import tensorflow as tf

from tensorflow import keras as ks
from tensorflow.keras import layers
from keras.models import load_model
from keras import callbacks 

# Preprocessing data

We start by loading the dataset and cleaning both the training and validation data with regex replacements.

In [249]:
# Characters that clean_data deletes: anything that is NOT a lower-case ASCII
# letter, a digit, whitespace, or one of ( ) * + , - . / = & |
# The hyphen is escaped and ',' / '.' are listed explicitly so that no
# accidental character range (such as '+-/') is hidden inside the class.
_DISALLOWED_CHARS = re.compile(r'[^a-z0-9\s()*+,\-./=&|]')


def clean_data(text):
    """Lower-case ``text`` and delete every character outside the allowed set.

    Allowed characters are ASCII letters and digits, whitespace, and the
    symbols ``( ) * + , - . / = & |``. Disallowed characters are removed
    outright (they are not replaced by a space).

    Args:
        text: The string to clean.

    Returns:
        The lower-cased string with all disallowed characters removed.
    """
    # Lower-casing first means the pattern only needs to allow 'a-z'.
    return _DISALLOWED_CHARS.sub('', text.lower())

# Read the training and validation splits from disk.
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")

# Apply the same two steps to both splits: cast 'Body' to str (so that
# clean_data always receives a string), then run the regex clean-up.
for _frame in (train, valid):
    _frame['Body'] = _frame['Body'].astype('str').apply(clean_data)


# Tokenize data
The Tokenizer is initialized and fitted on the training data. The cleaned columns are then converted to integer vectors, before being padded with zeroes or shortened to a uniform length of 150 tokens (`maxlength`).

In [298]:
# Every example is padded or shortened to this many token ids.
maxlength = 150

# The tokenizer is fitted on the training bodies only; the validation bodies
# are encoded with the vocabulary learned from the training split.
tokenizer = ks.preprocessing.text.Tokenizer(
    num_words=10000,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n\r',
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(train['Body'])

# Text -> list of integer token ids, one list per body.
train_sequences = tokenizer.texts_to_sequences(train['Body'])
valid_sequences = tokenizer.texts_to_sequences(valid['Body'])

# Bring every sequence to exactly `maxlength` entries.
train_x = ks.preprocessing.sequence.pad_sequences(train_sequences, maxlen=maxlength)
valid_x = ks.preprocessing.sequence.pad_sequences(valid_sequences, maxlen=maxlength)

# One-hot encode labels
The label columns are then converted from classes to integers, before being encoded as a one-hot matrix representation.

In [299]:
# Inspect one encoded example: the padded token-id vector for training row 4.
train_x[4]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,  184,   46,  627,
          2,   93,    3,  738, 1587, 1524,   30,  226,  594,  592,  138,
         48,   92,    5,   64,   51, 1528, 1044, 1278, 1844,   27, 2179,
        769, 1102, 4105, 3144,   89,  450,  138, 2475,    5,    4,  317,
         74,   75,  109,    3,   49,   11,    4], d

In [200]:
# Single source of truth for the class-name -> class-index mapping, shared by
# both splits so the two encodings cannot drift apart.
label_to_index = {'LQ_CLOSE': 0, 'LQ_EDIT': 1, 'HQ': 2}

train_y = train['Y'].map(label_to_index)
valid_y = valid['Y'].map(label_to_index)

# Series.map yields NaN for any label that is missing from the dict; fail
# loudly here instead of letting NaN flow into the one-hot encoding.
for _split_name, _raw, _encoded in (
    ('train', train['Y'], train_y),
    ('valid', valid['Y'], valid_y),
):
    _unmapped = _raw[_encoded.isna()].unique()
    if len(_unmapped):
        raise ValueError(
            f"Unmapped label(s) in {_split_name}['Y']: {list(_unmapped)}"
        )

# One-hot encode: one row per example, one column per class.
train_y = ks.utils.to_categorical(train_y, num_classes=len(label_to_index))
valid_y = ks.utils.to_categorical(valid_y, num_classes=len(label_to_index))

# Count word occurrences
Sort the words by their number of occurrences (optionally printing them) in order to identify an appropriate input dimension.

In [252]:
# Rank the (word, count) pairs collected by the tokenizer from most to least
# frequent; the ranking helps when choosing the vocabulary size.
word_count_pairs = tokenizer.word_counts.items()
sortedwordindex = sorted(word_count_pairs, key=lambda pair: pair[1], reverse=True)
# To look at the ranking, display a slice such as sortedwordindex[:20].

# Defining the model
Define a Keras Sequential model. The first layer is a word embedding layer that maps each word index to a dense vector, so that similar or associated words can end up with similar vectors. Two LSTM layers with half the dimensionality of the embedding layer are then applied before adding a final softmax activation layer with three possible outputs. The model is then compiled with an SGD optimizer, using categorical cross-entropy as the loss function and recording accuracy as a metric.

In [253]:
# Embedding -> LSTM -> LSTM -> softmax classifier over the three classes.
model = ks.Sequential([
    # input_dim matches the tokenizer's num_words (10000); each token id
    # becomes a 128-dimensional vector.
    layers.Embedding(input_length=maxlength, input_dim=10000, output_dim=128),
    # return_sequences=True hands the full sequence to the second LSTM.
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(64),
    layers.Dense(3, activation="softmax"),
])

model.compile(
    optimizer=ks.optimizers.SGD(learning_rate=(0.55)),
    loss=ks.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 128)          1280000   
                                                                 
 lstm_4 (LSTM)               (None, 150, 64)           49408     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,362,627
Trainable params: 1,362,627
Non-trainable params: 0
_________________________________________________________________


# Model training
The model is trained on the training data; after each epoch its performance is recorded on the validation set, which is not used for training. An EarlyStopping callback terminates training once the model has stopped improving (more precisely, when the validation loss has not improved for 5 consecutive epochs) and restores the best weights. The model is then evaluated on the validation set, and the final loss and accuracy are printed.


In [254]:
# Take the callback from the same `tensorflow.keras` namespace (`ks`) that
# built the model, rather than from the standalone `keras` package, so the
# model and its callbacks always come from one Keras implementation.
# Training stops once val_loss has not improved for 5 epochs, and the weights
# of the best epoch are restored.
earlystopping = ks.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

# The validation split is only evaluated after each epoch, never trained on.
history_callback = model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=20,
    validation_data=(valid_x, valid_y),
    callbacks=[earlystopping],
)

# Per-epoch metric lists, read later under the keys
# "loss", "accuracy", "val_loss" and "val_accuracy".
loss_history = history_callback.history

# Final loss and accuracy on the validation set.
loss, acc = model.evaluate(valid_x, valid_y, verbose=1)
print('Loss:\t\t', loss, '\nAccuracy:\t', acc)

Epoch 1/20


2022-08-05 15:07:46.419698: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:46.648357: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:46.884537: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:47.127807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:07:47.605654: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-08-05 15:08:17.001706: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:08:17.077625: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-05 15:08:17.273634: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Loss:		 0.32314735651016235 
Accuracy:	 0.8535333275794983


# Printing metrics
Prints the loss and accuracy values for both the training and validation sets for each epoch during training.

In [None]:
# (heading, history key) pairs, printed in this order; each heading is followed
# by the list of per-epoch values recorded during training.
metric_sections = (
    ("Validation loss:\n", "val_loss"),
    ("Validation accuracy:\n", "val_accuracy"),
    ("Training loss:\n", "loss"),
    ("Training accuracy:\n", "accuracy"),
)

for heading, history_key in metric_sections:
    print(heading)
    print(str(loss_history[history_key]) + "\n\n")

Validation loss:

[0.4700574278831482, 0.4641878008842468, 0.4615233540534973, 0.3943864405155182, 0.36348074674606323, 0.45264384150505066, 0.3628832995891571, 0.3902895450592041, 0.3363644480705261, 0.5575753450393677, 0.35546600818634033, 0.32191628217697144, 0.3116975426673889, 0.29423654079437256, 0.3101593852043152, 0.291271448135376, 0.2691100239753723, 0.26175805926322937, 0.25388067960739136, 0.25590038299560547]


Validation accuracy:

[0.6692000031471252, 0.6665999889373779, 0.7293333411216736, 0.7964000105857849, 0.8364666700363159, 0.6765333414077759, 0.8308666944503784, 0.8060666918754578, 0.8493333458900452, 0.7378000020980835, 0.8322666883468628, 0.8635333180427551, 0.8600000143051147, 0.8721333146095276, 0.8723333477973938, 0.8844666481018066, 0.8881333470344543, 0.8899333477020264, 0.8948667049407959, 0.8941333293914795]


Training loss:

[0.690728485584259, 0.47954124212265015, 0.46791723370552063, 0.4348618984222412, 0.3957420587539673, 0.4712510108947754, 0.3870306

In [None]:
# check if FE token and model token matched

In [542]:
# Cleaned body text of the third training example labelled 'LQ_EDIT'.
train.loc[train['Y'] == 'LQ_EDIT', 'Body'].iloc[2]

'i my case i am having two form form 1 and form 2.\r\nform 1 having two buttons and form 2 is having one textbox.\r\non button 1 click event i am writing my text in my form 2 textbox and on button 2 i am showing form 2.\r\nwhat is happening is when i close my form 2 using close x button and reopen it value in my form 2 textbox disappears.\r\nplease help how can i resolve this,\r\n'

In [534]:
# Show the full record (all columns) of training row 9498.
train.iloc[9498]

Id                                                       39040929
Title                       how to make a textbox show me double?
Body            i am trying to get myself used to c, currently...
Tags                                           <c#><textbox><var>
CreationDate                                  2016-08-19 13:58:31
Y                                                         LQ_EDIT
Name: 9498, dtype: object

In [535]:
# Show the cleaned 'Body' text of training row 9498 (the text the tokenizer encodes).
train.iloc[9498].Body

'i am trying to get myself used to c, currently i am using sharpdevelop for that task, anyway, i have a simple question, lets say i want to have a textbox that shows me the outcome of a mathematical code as shown below, how do i get the textbox to actually show me types like double, int or other stuff it always tells me it cant convert double to string or whatever, i am pretty noob so yeah. \r\n\t\t\r\n\r\n    void cmdwriteclick(object sender, eventargs e)\r\n\t   \t\r\n\t\t\tdouble var = 8.40\r\n\t\t\tdouble start = 9.00\r\n\t\t\tdouble end = var + start\r\n\t\t\ttextboxend.text = end\r\n\t\t\r\n\r\ni already tried to not use the .text but something like .value but didnt work, any help \r\n\r\nthanks'

In [495]:
# Inspect the padded token-id vector for training row 802.
train_x[802]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,  400,   30,   22,  525,    5,   99,  646,    8,
          5, 1345,  297,   54,    1, 4020,   14,  119,  217, 1466,   15,
       4607,  250,    3,    5, 3152,  198, 2255,    1, 1732,  467,   14,
          1, 1345,  346, 1233,    5, 2383,   14,  502,  228,    1,  198,
         19,  805,    8,    1,  109,   30,    1, 2168,   63,    2,   93,
          3,   65,  341,    3,   47, 2445,   14,   19, 1165,    4, 3670,
        217, 1294,   15,  373,    2,  221,  122,   75,  253,    6,   31,
         22,  525,   75,  362,    3,   11,    4], d

In [536]:
# Predict class probabilities for training row 9498.
# The slice 9498:9499 keeps the batch axis, so the model receives a
# (1, maxlength) array directly. This avoids wrapping a nested Python list in
# another list, which Keras may interpret as "a list of model inputs".
result = model.predict(train_x[9498:9499])
result

array([[1.9477760e-05, 9.9997973e-01, 7.6478915e-07]], dtype=float32)

In [537]:
# Index of the largest probability in `result`. np.argmax without `axis` works
# on the flattened array, which is fine here because `result` has a single row.
# Indices follow the label encoding used for train_y:
# 0 = 'LQ_CLOSE', 1 = 'LQ_EDIT', 2 = 'HQ'.
np.argmax(result)

1