In [1]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/gdrive')
# Change into the project folder; the cells below use relative file paths.
%cd /gdrive/MyDrive/Colab\ Notebooks/Question\ Tagging

Mounted at /gdrive
/gdrive/MyDrive/Colab Notebooks/Question Tagging


In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR
from torchtext.vocab import Vocab
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
import pickle
import re

In [3]:
num_workers = 2  # worker processes used by both DataLoaders
SEQ_LEN = 128  # every question body is truncated / padded to this many tokens
BATCH_SIZE = 32  # training batch size; the validation loader uses 4x this value

## Prepare Data (Train/Test Split)

In [4]:
# Load the pre-processed questions and preview the first rows.
dataset = pd.read_csv('processed_data.csv')
dataset.head()

Unnamed: 0,Body,Tags
0,asp.net site maps has anyone got experience cr...,"sql,asp.net"
1,adding scripting functionality to .net applica...,"c#,.net"
2,should i use nested classes in this case? i am...,c++
3,homegrown consumption of web services i've bee...,.net
4,automatically update version number i would li...,c#


In [5]:
# Shuffle all rows with a fixed seed so the positional split below is random but reproducible.
dataset = dataset.sample(frac=1, random_state=7).reset_index(drop=True)
dataset.head()

Unnamed: 0,Body,Tags
0,how to show a part of a string that starts wit...,php
1,leaflet - add features to a json object and pu...,javascript
2,browser javascript compliant system level js e...,javascript
3,need help mapping this data in java i am havin...,java
4,itextsharp measure chunk width / height i am t...,c#


In [6]:
# Hold out the last 20% of the (already shuffled) rows as the validation set.
split_idx = int(dataset.shape[0]*0.8)
train_df = dataset.iloc[:split_idx]
val_df = dataset.iloc[split_idx:]

# Show both shapes as a sanity check on the split.
train_df.shape, val_df.shape

((680790, 2), (170198, 2))

## Create Vocabulary

In [7]:
def get_tag_list(tags):
    """Split a comma-separated tag string into a list of tag names.

    No whitespace stripping is performed; an empty string yields [''].
    """
    separator = ","
    tag_names = tags.split(separator)
    return tag_names

In [8]:
def tokenize(text):
    """Return the runs of ASCII letters found in `text`, in order.

    Every character that is not a-z / A-Z is treated as a separator, so digits
    and punctuation are dropped. Case is left unchanged.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.split()
    return tokens

In [9]:
# Count token frequencies over the training bodies only (validation text is not used).
counter = Counter()
for body in train_df['Body'].values:
    counter.update(tokenize(body))
# Build the vocabulary from those counts, limited by max_size and min_freq.
vocab = Vocab(counter=counter, max_size=10000, min_freq=3)

In [10]:
# test vocab
print([vocab[token] for token in ['let', 'us', 'do', 'a', 'test']])
print(vocab.itos[:10])

[453, 936, 57, 7, 139]
['<unk>', '<pad>', 'the', 'gt', 'i', 'lt', 'to', 'a', 'is', 'in']


In [11]:
# Index of the padding token; used to pad token sequences and as the embedding layers' padding_idx.
PAD_IDX = vocab['<pad>']
PAD_IDX

1

In [12]:
# Persist both splits; StackDataset re-reads them from these CSV files.
train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)

# Drop the in-memory frames now that the splits are on disk.
del dataset, train_df, val_df

## Dataset

In [13]:
class StackDataset(Dataset):
    """Map-style dataset yielding (token-id tensor, multi-hot tag tensor) pairs.

    Parameters
    ----------
    dataset : str
        Path of a CSV file whose rows unpack as (body, tags), where tags is a
        comma-separated string.
    tags_pkl : str
        Path of a pickle file holding the collection of tags to encode.
    seq_len : int
        Fixed length every token sequence is truncated / padded to.

    Relies on the notebook-level `vocab`, `PAD_IDX`, `tokenize` and
    `get_tag_list`.
    """

    def __init__(self, dataset, tags_pkl, seq_len):
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at a trusted file.
        with open(tags_pkl, 'rb') as f:
            all_tags = pickle.load(f)

        self.dataset = pd.read_csv(dataset)
        # Fitting on one "sample" that contains every tag fixes the label columns.
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit([all_tags])
        self.seq_len = seq_len

    def __getitem__(self, index):
        """Return (LongTensor of shape (seq_len,), FloatTensor multi-hot tag vector)."""
        body, tags = self.dataset.iloc[index]

        num_body_list = [vocab[token] for token in tokenize(body)][:self.seq_len]
        # Pad to this instance's seq_len (the same value used for truncation),
        # not to the notebook-level SEQ_LEN, so every sample has equal length.
        num_body_list += [PAD_IDX] * (self.seq_len - len(num_body_list))
        one_hot_tags = self.mlb.transform([get_tag_list(tags)])

        # mlb.transform returns [[0, 0, ..., 1]]; drop only that leading batch
        # axis so a single-class label vector is not collapsed to a scalar.
        return torch.tensor(num_body_list), torch.FloatTensor(one_hot_tags).squeeze(0)

    def __len__(self):
        return len(self.dataset)

In [14]:
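# NOTE: unused, kept for reference only. StackDataset.__getitem__ already pads every
# sample to a fixed length, so the DataLoaders use the default collation.
# def collate_fn(batch_data):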
#     x, y = [], []
#     for body_list, one_hot_tags in batch_data:
#         x.append(body_list)
#         y.append(one_hot_tags)
#     x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
#     y = torch.cat(y, dim=0)

#     return x, y

In [15]:
# Both datasets share the same tag list so their label columns line up.
train_ds = StackDataset(dataset='train.csv', tags_pkl='top_k_tags.pkl', seq_len=SEQ_LEN)
val_ds = StackDataset(dataset='val.csv', tags_pkl='top_k_tags.pkl', seq_len=SEQ_LEN)

# Only the training loader shuffles; validation uses a 4x larger batch.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE*4, num_workers=num_workers)

## Models

In [16]:
class RNNModel(nn.Module):
    """Embedding -> RNN -> MLP over the flattened hidden states of all time steps.

    Args:
        vocab_len: number of rows of the embedding table.
        emb_size: embedding dimension.
        seq_len: length of the (padded) input sequences. Defaults to the
            notebook-level SEQ_LEN when left as None.
        hidden_size: RNN hidden size (default 24).
        num_classes: number of output logits, one per tag (default 20).

    The first linear layer takes seq_len * hidden_size features; with
    SEQ_LEN = 128 and the defaults this is the original 3072 -> 100 -> 20 head,
    so existing checkpoints still load.
    """

    def __init__(self, vocab_len, emb_size, seq_len=None, hidden_size=24, num_classes=20):
        super().__init__()
        if seq_len is None:
            seq_len = SEQ_LEN
        self.emb_layer = nn.Embedding(num_embeddings=vocab_len, embedding_dim=emb_size, padding_idx=PAD_IDX)
        self.rnn_layer = nn.RNN(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.linear_layer = nn.Sequential(
            # Every time step's hidden state is fed to the classifier, hence seq_len * hidden_size.
            nn.Linear(in_features=seq_len * hidden_size, out_features=100),
            nn.ReLU(),
            nn.Linear(in_features=100, out_features=num_classes)
        )

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, num_classes) logits."""
        bs = x.shape[0]

        out = self.emb_layer(x)
        hidden_states, _ = self.rnn_layer(out)  # final hidden state is not used
        out = self.linear_layer(hidden_states.reshape(bs, -1))

        return out

In [17]:
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> MLP over the flattened hidden states of all time steps.

    Args:
        vocab_len: number of rows of the embedding table.
        emb_size: embedding dimension.
        seq_len: length of the (padded) input sequences. Defaults to the
            notebook-level SEQ_LEN when left as None.
        hidden_size: LSTM hidden size (default 24).
        num_classes: number of output logits, one per tag (default 20).

    The first linear layer takes seq_len * hidden_size features; with
    SEQ_LEN = 128 and the defaults this is the original 3072 -> 100 -> 20 head,
    so existing checkpoints still load.
    """

    def __init__(self, vocab_len, emb_size, seq_len=None, hidden_size=24, num_classes=20):
        super().__init__()
        if seq_len is None:
            seq_len = SEQ_LEN
        self.emb_layer = nn.Embedding(num_embeddings=vocab_len, embedding_dim=emb_size, padding_idx=PAD_IDX)
        self.lstm_layer = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, batch_first=True)
        self.linear_layer = nn.Sequential(
            # Every time step's hidden state is fed to the classifier, hence seq_len * hidden_size.
            nn.Linear(in_features=seq_len * hidden_size, out_features=100),
            nn.ReLU(),
            nn.Linear(in_features=100, out_features=num_classes)
        )

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, num_classes) logits."""
        bs = x.shape[0]

        out = self.emb_layer(x)
        hidden_states, _ = self.lstm_layer(out)  # final hidden / cell states are not used
        out = self.linear_layer(hidden_states.reshape(bs, -1))

        return out

## Training

In [18]:
def fit(model, epochs, lr, ckpt_path='bestmodel.pth', max_grad_norm=None):
    """Train `model` with BCE-with-logits loss and checkpoint the best epoch.

    Batches come from the notebook-level `train_dl` / `val_dl` loaders and are
    moved to whatever device the model's parameters live on.

    Args:
        model: module mapping a batch of token ids to per-tag logits.
        epochs: number of passes over `train_dl`.
        lr: passed to Adam and used as the peak (`max_lr`) of the OneCycleLR schedule.
        ckpt_path: file the state dict is written to whenever the validation
            loss is at least as good as the best seen so far.
        max_grad_norm: if given, gradients are clipped to this total norm before
            each optimizer step; None (default) disables clipping.
    """
    device = next(model.parameters()).device

    loss_fn = nn.BCEWithLogitsLoss()
    opt = Adam(model.parameters(), lr=lr)
    # The scheduler is stepped once per batch, so it needs the total step count.
    scd = OneCycleLR(optimizer=opt, max_lr=lr, epochs=epochs, steps_per_epoch=len(train_dl))

    def _mean_loss(batch_losses):
        # Mean of the per-batch losses; NaN when the loader produced no batch.
        if not batch_losses:
            return float('nan')
        return torch.stack(batch_losses).mean().item()

    best_loss = None
    for epoch in range(epochs):
        # Collect detached per-batch losses in lists and reduce once per epoch
        # (avoids re-copying a growing tensor on every step).
        train_losses = []
        val_losses = []

        model.train()
        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)

            y_hat = model(xb)
            loss = loss_fn(y_hat, yb)

            opt.zero_grad()
            loss.backward()
            if max_grad_norm is not None:
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            opt.step()
            scd.step()

            train_losses.append(loss.detach())
        epoch_train_loss = _mean_loss(train_losses)

        model.eval()
        with torch.no_grad():
            for xb, yb in val_dl:
                xb = xb.to(device)
                yb = yb.to(device)

                y_hat = model(xb)
                val_losses.append(loss_fn(y_hat, yb).detach())
        epoch_val_loss = _mean_loss(val_losses)

        print(f'Epoch - {epoch+1} | Training Loss - {epoch_train_loss} | Validation Loss - {epoch_val_loss}')

        if best_loss is None or best_loss >= epoch_val_loss:
            best_loss = epoch_val_loss
            torch.save(model.state_dict(), ckpt_path)
            print('##### Model Saved #####')

In [19]:
epochs = 15
lr = 1e-02  # used as Adam's lr and as the OneCycleLR max_lr inside fit

# Train the RNN model, then restore the weights fit saved to bestmodel.pth
# (the epoch with the lowest validation loss).
rnn_model = RNNModel(len(vocab), emb_size=50).to('cuda')
fit(rnn_model, epochs, lr)
rnn_model.load_state_dict(torch.load('bestmodel.pth'))

Epoch - 1 | Training Loss - 0.11682187020778656 | Validation Loss - 0.09313584864139557
##### Model Saved #####
Epoch - 2 | Training Loss - 0.09462526440620422 | Validation Loss - 0.0988205224275589
Epoch - 3 | Training Loss - 0.11513382941484451 | Validation Loss - 0.13152840733528137
Epoch - 4 | Training Loss - 0.1280064582824707 | Validation Loss - 0.14458714425563812
Epoch - 5 | Training Loss - 0.13716241717338562 | Validation Loss - 0.14263316988945007
Epoch - 6 | Training Loss - 0.1368151158094406 | Validation Loss - 0.14493189752101898
Epoch - 7 | Training Loss - 0.13543705642223358 | Validation Loss - 0.12936940789222717
Epoch - 8 | Training Loss - 0.1304890662431717 | Validation Loss - 0.12918107211589813
Epoch - 9 | Training Loss - 0.1218632161617279 | Validation Loss - 0.11735323816537857
Epoch - 10 | Training Loss - 0.11502587050199509 | Validation Loss - 0.11444493383169174
Epoch - 11 | Training Loss - 0.10823406279087067 | Validation Loss - 0.10542134195566177
Epoch - 12 

<All keys matched successfully>

In [20]:
epochs = 15
lr = 1e-02  # used as Adam's lr and as the OneCycleLR max_lr inside fit

# Train the LSTM model with the same settings. fit writes to the same bestmodel.pth,
# so the RNN checkpoint on disk is overwritten; rnn_model already loaded its weights above.
lstm_model = LSTMModel(len(vocab), emb_size=50).to('cuda')
fit(lstm_model, epochs, lr)
lstm_model.load_state_dict(torch.load('bestmodel.pth'))

Epoch - 1 | Training Loss - 0.09844035655260086 | Validation Loss - 0.0753697007894516
##### Model Saved #####
Epoch - 2 | Training Loss - 0.0732794925570488 | Validation Loss - 0.07330222427845001
##### Model Saved #####
Epoch - 3 | Training Loss - 0.07142305374145508 | Validation Loss - 0.07137303054332733
##### Model Saved #####
Epoch - 4 | Training Loss - 0.07201459258794785 | Validation Loss - 0.07366851717233658
Epoch - 5 | Training Loss - 0.07495749741792679 | Validation Loss - 0.07398997992277145
Epoch - 6 | Training Loss - 0.07575125992298126 | Validation Loss - 0.07630888372659683
Epoch - 7 | Training Loss - 0.6256407499313354 | Validation Loss - 0.2548969089984894
Epoch - 8 | Training Loss - 0.22388818860054016 | Validation Loss - 0.21727341413497925
Epoch - 9 | Training Loss - 0.20260865986347198 | Validation Loss - 0.19688794016838074
Epoch - 10 | Training Loss - 0.19538144767284393 | Validation Loss - 0.19625239074230194
Epoch - 11 | Training Loss - 0.19512306153774261 | 

<All keys matched successfully>

## Evaluate Model

In [21]:
def get_pred(model, dl, threshold):
    """Run `model` over `dl` and return thresholded predictions with the labels.

    Args:
        model: module mapping a batch of inputs to per-tag logits.
        dl: iterable of (inputs, labels) batches.
        threshold: a tag is predicted when sigmoid(logit) >= threshold.

    Returns:
        (y_hat, y_true) as NumPy arrays of shape (rows, tags). y_hat holds
        0.0 / 1.0 as float32; y_true keeps the dtype of the label batches.
        Two empty 1-D arrays are returned when `dl` yields no batch.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    pred_batches = []
    true_batches = []
    model.eval()
    with torch.no_grad():
        for xb, yb in dl:
            # Labels are only collected, so they never need to leave the CPU.
            true_batches.append(yb.cpu())

            if device is not None:
                xb = xb.to(device)
            probs = torch.sigmoid(model(xb))
            pred_batches.append((probs >= threshold).float().cpu())

    if not pred_batches:
        return torch.empty(0).numpy(), torch.empty(0).numpy()

    # Concatenate once at the end rather than re-copying a growing tensor per batch.
    y_hat = torch.cat(pred_batches, dim=0)
    y_true = torch.cat(true_batches, dim=0)
    return y_hat.numpy(), y_true.numpy()  # shape - (x, 20), where x is number of rows

In [22]:
def get_optimal_threshold(model):
    """Return the threshold in 0.1, 0.2, ..., 0.9 with the best validation F1.

    For each candidate the model is evaluated on the notebook-level `val_dl`
    via `get_pred`; predictions and labels are flattened over all rows and
    tags before scoring. On ties the smallest threshold wins.
    """
    candidates = np.arange(0.1, 1, 0.1)

    per_candidate = (get_pred(model, val_dl, threshold=cand) for cand in candidates)
    f1_values = [
        f1_score(labels.ravel(), preds.ravel())
        for preds, labels in per_candidate
    ]

    best_pos = f1_values.index(max(f1_values))
    return candidates[best_pos]

In [23]:
# Decision threshold with the best validation F1 for the RNN model.
opt_rnn = get_optimal_threshold(rnn_model)
print(opt_rnn)

0.4


In [24]:
# Per-tag precision / recall / F1 of the RNN on the validation set at its chosen threshold.
# No target_names are passed, so rows are labelled by column index rather than tag name.
y_pred, y_true = get_pred(rnn_model, val_dl, threshold=opt_rnn)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.40      0.09      0.15      4836
           1       0.93      0.86      0.90     17969
           2       0.93      0.75      0.83      4036
           3       0.72      0.51      0.60      5886
           4       0.86      0.24      0.37      4739
           5       0.72      0.71      0.71     20070
           6       0.68      0.66      0.67      9480
           7       0.64      0.76      0.70      8361
           8       0.50      0.56      0.53     11770
           9       0.70      0.76      0.73      9441
          10       0.52      0.24      0.33      4341
          11       0.77      0.79      0.78     23057
          12       0.69      0.70      0.70     24819
          13       0.77      0.67      0.72     15795
          14       0.60      0.76      0.67      8408
          15       0.59      0.22      0.32      5441
          16       0.90      0.75      0.82     19955
          17       0.86    

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Decision threshold with the best validation F1 for the LSTM model.
opt_lstm = get_optimal_threshold(lstm_model)
print(opt_lstm)

0.4


In [26]:
# Per-tag precision / recall / F1 of the LSTM on the validation set at its chosen threshold.
# No target_names are passed, so rows are labelled by column index rather than tag name.
y_pred, y_true = get_pred(lstm_model, val_dl, threshold=opt_lstm)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.12      0.20      4836
           1       0.94      0.92      0.93     17969
           2       0.95      0.87      0.91      4036
           3       0.81      0.58      0.68      5886
           4       0.71      0.70      0.70      4739
           5       0.75      0.82      0.78     20070
           6       0.83      0.71      0.77      9480
           7       0.78      0.75      0.76      8361
           8       0.62      0.49      0.55     11770
           9       0.72      0.87      0.79      9441
          10       0.67      0.40      0.50      4341
          11       0.90      0.80      0.84     23057
          12       0.68      0.85      0.75     24819
          13       0.83      0.75      0.79     15795
          14       0.78      0.74      0.76      8408
          15       0.59      0.57      0.58      5441
          16       0.89      0.86      0.87     19955
          17       0.96    

  _warn_prf(average, modifier, msg_start, len(result))


## Inference

In [27]:
def predict(model, text, threshold=0.5):
    """Predict the tags of one question and return them as a comma-separated string.

    The text is tokenized, mapped to ids with the notebook-level `vocab`, and
    truncated / padded to SEQ_LEN exactly as StackDataset does. The input is
    placed on the device the model's parameters live on.

    Args:
        model: module mapping a (1, SEQ_LEN) tensor of token ids to per-tag logits.
        text: raw question text.
        threshold: a tag is emitted when sigmoid(logit) >= threshold.

    Returns:
        The predicted tag names joined with ','; an empty string if none pass.
    """
    token_ids = [vocab[token] for token in tokenize(text)][:SEQ_LEN]
    token_ids += [PAD_IDX] * (SEQ_LEN - len(token_ids))

    # Follow the model's device instead of assuming CUDA is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    text_tensor = torch.tensor(token_ids, device=device).view(1, -1)  # batch of one

    model.eval()
    with torch.no_grad():
        probs = torch.sigmoid(model(text_tensor))
    y_hat = (probs >= threshold).long().cpu().numpy()

    # The binarizer fitted in train_ds maps the 0/1 row back to tag names.
    pred_tags_list = train_ds.mlb.inverse_transform(y_hat)

    return ','.join(pred_tags_list[0])

In [28]:
# Re-read the validation split to pick a few examples for manual inspection.
df = pd.read_csv('val.csv')
df.head()

Unnamed: 0,Body,Tags
0,custom seekbar thumb not transparent on lollip...,android
1,jaxb inheritance - non-abstract base class i'm...,java
2,sencha touch list with model and store i am lo...,javascript
3,"""flattening"" a list of dictionaries so my aim ...",python
4,"snmpwalk can't walk table with ""accessible-for...",c


In [29]:
# Keep the first five validation questions that carry more than three tags.
df = df[df['Tags'].str.split(',').str.len() > 3][:5].reset_index(drop=True)
df

Unnamed: 0,Body,Tags
0,bootstrap list-items activating on embedded sp...,"javascript,jquery,html,css"
1,how can enable swipe image in slide by touch (...,"javascript,jquery,html,css"
2,how to use all php files in or outside folder ...,"javascript,php,jquery,html,css"
3,text appear below multi table cells after clic...,"php,javascript,html,css"
4,open new print dialog window with different ht...,"javascript,jquery,html,css"


Let us look at some predictions

In [30]:
# Predict each example with both models (RNN first, then LSTM, per row),
# each at its own tuned threshold.
tag_pairs = [
    (predict(rnn_model, body, opt_rnn), predict(lstm_model, body, opt_lstm))
    for body in df['Body']
]
rnn_pred = [pair[0] for pair in tag_pairs]
lstm_pred = [pair[1] for pair in tag_pairs]

# Attach the predictions next to the true tags for a side-by-side view.
df['RNN Tags'] = rnn_pred
df['LSTM Tags'] = lstm_pred

df

Unnamed: 0,Body,Tags,RNN Tags,LSTM Tags
0,bootstrap list-items activating on embedded sp...,"javascript,jquery,html,css","css,html","css,html,jquery"
1,how can enable swipe image in slide by touch (...,"javascript,jquery,html,css","javascript,jquery","javascript,jquery"
2,how to use all php files in or outside folder ...,"javascript,php,jquery,html,css",php,php
3,text appear below multi table cells after clic...,"php,javascript,html,css",html,"css,html,javascript"
4,open new print dialog window with different ht...,"javascript,jquery,html,css","html,javascript,python",javascript


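On these five hand-picked examples the LSTM model is not uniformly better than the RNN model (on the last example it recovers fewer of the true tags), but the classification report tells us that the LSTM model is clearly better on the validation data as a whole.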