![alt text](https://cdn.sstatic.net/Sites/stackoverflow/company/img/logos/se/se-logo.svg?v=d29f0785ebb7)

The objective of this notebook is to build a model that automatically predicts tags for a given StackExchange question from the text of the question, using PyTorch and TorchText.

Dataset: over 85,000 questions and over 1,300 unique tags.

The question-answering site StackOverflow allows users to assign tags to questions in order to make them easier for other people to find. Furthermore, experts on a certain topic can subscribe to tags to receive digests of new questions for which they might have an answer. Therefore, it is in the interest of both the original poster and the people interested in the answer that a question is assigned appropriate tags.


# Please upvote the notebook if you find it insightful!

# Load Data and Import Libraries

In [None]:
!pip install beautifulsoup4

In [None]:

import re 
import pandas as pd
pd.set_option('display.max_colwidth', 200)

import numpy as np
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt  

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer

import random

import torch
from torch import nn
from torch.nn import Sequential, Linear,  ReLU, Sigmoid, Dropout, BCELoss, Embedding, RNN, LSTM
from torchtext import data 

In [None]:
# load the stackoverflow questions dataset
questions_df = pd.read_csv('../input/statsquestions/Questions.csv',encoding='latin-1')

# load the tags dataset
tags_df = pd.read_csv('../input/statsquestions/Tags.csv')

In [None]:
questions_df.head()

In [None]:
questions_df.shape

In [None]:
tags_df.head()

In [None]:
# No. of unique tags
len(tags_df['Tag'].unique())

# Dataset Preparation

In [None]:
# remove "-" from the tags
tags_df['Tag'] = tags_df['Tag'].apply(lambda x:re.sub("-"," ",x))

In [None]:
# group tags Id wise
tags_df = tags_df.groupby('Id').apply(lambda x:x['Tag'].values).reset_index(name='tags')
tags_df.head()

In [None]:
# merge tags and questions
df = pd.merge(questions_df,tags_df,how='inner',on='Id')

In [None]:
# fetch required columns
df = df[['Id','Body','tags']]

In [None]:
#first 5 rows
df.head()

In [None]:
#shape of the dataset
df.shape

In [None]:
# count how many questions carry each tag
freq = {}
for question_tags in df['tags']:
  for tag in question_tags:
    # unseen tags start from zero
    freq[tag] = freq.get(tag, 0) + 1

In [None]:
# sort the dictionary in descending order
freq = dict(sorted(freq.items(), key=lambda x:x[1],reverse=True))

In [None]:
# Top 10 most frequent tags
common_tags = list(freq.keys())[:10]
print(common_tags)

We will use only those questions/queries that are associated with at least two of the top 10 tags.

In [None]:
#collect the questions that carry more than one of the common tags
x = []
y = []

for body, question_tags in zip(df['Body'], df['tags']):

  #common tags attached to this question, in their original order
  matched = [tag for tag in question_tags if tag in common_tags]

  #keep the question only when more than one common tag matched
  if len(matched) > 1:
    x.append(body)
    y.append(matched)

In [None]:
# number of questions left
len(x)

In [None]:
#first 5 tags
y[:5]

In [None]:
#combining the labels of each question into one comma-separated string
y = [ ",".join([str(j) for j in i ]) for i in y]

In [None]:
#labels after converting to string
y[:5]

In [None]:
#save to dataframe
dframe = pd.DataFrame({'query':x,'tags':y})

In [None]:
#first 5 rows
dframe.head()

In [None]:
#save to csv
dframe.to_csv('stack.csv',index=False)

# Text Preprocessing

In [None]:
def cleaner(text):
  """Strip HTML markup from ``text`` and tokenise what is left.

  Args:
    text: raw question body, possibly containing HTML.

  Returns:
    A list of lower-case tokens made up of ASCII letters only.
  """

  # name the parser explicitly: without it BeautifulSoup falls back to
  # whichever parser happens to be installed and emits a warning
  text = BeautifulSoup(text, "html.parser").get_text()

  # replace every non-alphabetic character with a space
  text = re.sub("[^a-zA-Z]", " ", text)

  # convert text to lower case
  text = text.lower()

  # splitting on whitespace also discards the runs of spaces created above
  tokens = text.split()

  return tokens

In [None]:
#define field object for query
max_len = 100
TEXT = data.Field(tokenize=cleaner, batch_first=True, fix_length=max_len)

In [None]:
#define field object for label
LABEL = data.LabelField(batch_first=True)

Next, we are going to create a list of tuples in which the first value of every tuple is a column name and the second value is a field object.

In [None]:
#define a list of tuple with field objects
fields = [('query',TEXT),('tags', LABEL)]

In [None]:
#reading the dataset
training_data = data.TabularDataset(path = 'stack.csv', format = 'csv', fields = fields, skip_header = True)

In [None]:
print(training_data)

In [None]:
#print preprocessed text
print(vars(training_data.examples[0]))

In [None]:
# random.seed() returns None, so seed the generator first and pass the
# resulting state to split() explicitly instead of relying on a side effect
random.seed(32)
train_data, valid_data = training_data.split(split_ratio=0.8, random_state=random.getstate())

In [None]:
#preparing the vocabulary for the text
TEXT.build_vocab(train_data, min_freq=3)

In [None]:
#No. of unique words
len(TEXT.vocab)

In [None]:
#word index
list(TEXT.vocab.stoi.items())[:10]

In [None]:
def fetch_text(examples):
  """Return the ``query`` attribute of every example, in iteration order."""
  return [vars(example)['query'] for example in examples]

In [None]:
train_text = fetch_text(train_data)
valid_text = fetch_text(valid_data)

In [None]:
def convert2seq(text):
  """Pad a batch of token sequences with ``TEXT`` and map the tokens to indices."""
  #pad first, then numericalize the padded batch
  return TEXT.numericalize(TEXT.pad(text))

In [None]:
X_train = convert2seq(train_text)
X_valid = convert2seq(valid_text)

In [None]:
X_train[0]

In [None]:
X_train.shape, X_valid.shape

In [None]:
def fetch_tags(data):
  """Return the ``tags`` attribute of every example in ``data.examples``."""
  return [vars(example)['tags'] for example in data.examples]

In [None]:
train_tags = fetch_tags(train_data)
valid_tags = fetch_tags(valid_data)

In [None]:
train_tags[:5]

In [None]:
#preparing the output labels 
train_tags_list=[i.split(",") for i in train_tags]
valid_tags_list=[i.split(",") for i in valid_tags]

In [None]:
mlb= MultiLabelBinarizer()
mlb.fit(train_tags_list)

In [None]:
mlb.classes_

In [None]:
y_train  = mlb.transform(train_tags_list)
y_valid  = mlb.transform(valid_tags_list)

In [None]:
y_train.shape, y_valid.shape

In [None]:
type(y_train)

In [None]:
y_train = torch.FloatTensor(y_train)
y_valid = torch.FloatTensor(y_valid)

In [None]:
type(y_train)

# Model Building for RNN

In [None]:
# define embedding layer
emb = Embedding(num_embeddings=len(TEXT.vocab), embedding_dim=50)

In [None]:
X_train[:1].shape

In [None]:
# check sample input
sample_embedding = emb(X_train[:1])

In [None]:
sample_embedding.shape

In [None]:
#define a rnn
rnn = RNN(input_size=50, hidden_size=128, batch_first=True, nonlinearity='relu')

In [None]:
#pass the input to rnn
hidden_states,last_hidden_state = rnn(sample_embedding)

In [None]:
#Hidden state of every timestep (Batch, seq_len, no. of hidden neurons)
hidden_states.shape

In [None]:
#output shape of last hidden timestep
last_hidden_state.shape

In [None]:
#reshaping the hidden states
reshaped = hidden_states.reshape(hidden_states.size(0),-1)
reshaped.shape

In [None]:
class Net(nn.Module):
    """RNN tagger: embedding -> RNN -> flatten -> dense -> sigmoid.

    The hidden states of all time steps are flattened into one vector per
    sample, so the model only accepts inputs of exactly ``seq_len`` tokens.

    Args:
        vocab_size: number of embedding rows; defaults to ``len(TEXT.vocab)``.
        embedding_dim: size of each word embedding.
        hidden_size: number of hidden units of the RNN.
        seq_len: length of every input sequence; defaults to
            ``TEXT.fix_length``.
        num_tags: number of output probabilities per sample.
    """

    #define all the layers used in model
    def __init__(self, vocab_size=None, embedding_dim=50, hidden_size=128,
                 seq_len=None, num_tags=10):

        #Constructor
        super(Net, self).__init__()

        # fall back to the notebook-level text field when sizes are not given
        if vocab_size is None:
            vocab_size = len(TEXT.vocab)
        if seq_len is None:
            seq_len = TEXT.fix_length

        self.rnn_layer = nn.Sequential(

            #embedding layer [batch_size,seq_len] -> [batch_size,seq_len,embedding_dim]
            Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim),

            #rnn layer, hidden states are [batch_size,seq_len,hidden_size]
            RNN(input_size=embedding_dim, hidden_size=hidden_size, nonlinearity='relu',batch_first=True)

            )

        self.dense_layer = nn.Sequential(

            #[batch_size,seq_len*hidden_size], derived instead of a magic 12800
            Linear(seq_len * hidden_size, 128),

            ReLU(),

            #[batch_size,128]
            Linear(128, num_tags),

            #[batch_size,num_tags]
            Sigmoid()

        )

    def forward(self, x):
        """Return per-tag probabilities for a batch of index sequences."""

        #rnn layer; the last hidden state is not used
        hidden_states, last_hidden_state = self.rnn_layer(x)

        #flatten all time steps into one vector per sample
        hidden_states = hidden_states.reshape(hidden_states.size(0),-1)

        #dense layer
        outputs=self.dense_layer(hidden_states)

        return outputs

In [None]:
#define the model
model = Net()

In [None]:
#model layers
model

In [None]:
with torch.no_grad():
  pred = model(X_train[:1])
  print(pred)

In [None]:
#define optimizer and loss
optimizer = torch.optim.Adam(model.parameters())
criterion = BCELoss()

# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

# Model Training

In [None]:
# define training function
def train(X,y,batch_size):
  """Run one training epoch over ``X``/``y`` in shuffled mini-batches.

  Relies on the notebook-level ``model``, ``optimizer`` and ``criterion``.

  Args:
    X: input tensor; rows are selected with a tensor of shuffled indices.
    y: target tensor aligned with ``X`` along the first dimension.
    batch_size: number of samples per mini-batch.

  Returns:
    The mean of the per-batch losses of the epoch.
  """

  #activate training phase
  model.train()

  #initialization
  epoch_loss = 0
  no_of_batches = 0
  use_cuda = torch.cuda.is_available()

  #visit the samples in a fresh random order
  indices = torch.randperm(len(X))

  #loading in batches
  for i in range(0,len(indices),batch_size):

    #indices for a batch
    ind = indices[i:i+batch_size]

    #batch
    batch_x = X[ind]
    batch_y = y[ind]

    #push to cuda
    if use_cuda:
      batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

    #clear gradients
    optimizer.zero_grad()

    #forward pass; the output is deliberately not squeezed, because that
    #would drop the batch dimension of a final batch holding one sample and
    #the shape would no longer match batch_y
    outputs = model(batch_x)

    #calculate loss
    loss = criterion(outputs, batch_y)

    #Backward pass
    loss.backward()

    #Update weights
    optimizer.step()

    #Keep track of the loss of the epoch
    epoch_loss = epoch_loss + loss.item()

    #No. of batches
    no_of_batches = no_of_batches+1

  return epoch_loss/no_of_batches

In [None]:
# define evaluation function
def evaluate(X,y,batch_size):
  """Compute the loss of the notebook-level ``model`` on ``X``/``y``.

  Relies on the notebook-level ``model`` and ``criterion``. No gradients are
  tracked and no weights are updated.

  Args:
    X: input tensor, sliced along its first dimension.
    y: target tensor aligned with ``X`` along the first dimension.
    batch_size: number of samples per mini-batch.

  Returns:
    The mean of the per-batch losses.
  """

  #deactivate training phase
  model.eval()

  #initialization
  epoch_loss = 0
  no_of_batches = 0
  use_cuda = torch.cuda.is_available()

  #deactivates autograd
  with torch.no_grad():

    #batches are taken in order: shuffling is pointless for evaluation and
    #would change which samples land in the smaller final batch
    for i in range(0,len(X),batch_size):

      #batch
      batch_x = X[i:i+batch_size]
      batch_y = y[i:i+batch_size]

      #push to cuda
      if use_cuda:
        batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

      #Forward pass; not squeezed, so a final batch holding one sample
      #keeps its batch dimension and still matches batch_y
      outputs = model(batch_x)

      #Calculate loss
      loss = criterion(outputs, batch_y)

      #keep track of loss of an epoch
      epoch_loss = epoch_loss + loss.item()

      #no. of batches
      no_of_batches = no_of_batches + 1

  return epoch_loss/no_of_batches

In [None]:
# define prediction function
def predict(X,batch_size):
  """Return the outputs of the notebook-level ``model`` for every row of ``X``.

  Args:
    X: input tensor, sliced along its first dimension.
    batch_size: number of samples per mini-batch.

  Returns:
    A numpy array with the per-batch model outputs concatenated along the
    first axis, in the order of ``X``.
  """

  #deactivate training phase
  model.eval()

  # initialization
  predictions = []
  use_cuda = torch.cuda.is_available()

  #deactivates autograd
  with torch.no_grad():

      for i in range(0, len(X), batch_size):

        # batch, taken in order so predictions line up with X
        batch_x = X[i:i+batch_size]

        #push to cuda
        if use_cuda:
            batch_x = batch_x.cuda()

        #Forward pass; not squeezed, otherwise a final batch holding one
        #sample becomes 1-D and can no longer be concatenated with the rest
        outputs = model(batch_x)

        # convert to numpy array
        predictions.append(outputs.detach().cpu().numpy())

  # convert to single numpy array
  predictions = np.concatenate(predictions, axis=0)

  return predictions

In [None]:
# training configuration
N_EPOCHS = 10
batch_size = 32

# lowest validation loss observed so far
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # one optimisation pass followed by a pass over the validation set
    train_loss = train(X_train, y_train, batch_size)
    valid_loss = evaluate(X_valid, y_valid, batch_size)

    print('\nEpoch :',epoch,
          'Training loss:',round(train_loss,4),
          '\tValidation loss:',round(valid_loss,4))

    # nothing to save unless this epoch matches or beats the best loss
    if not (best_valid_loss >= valid_loss):
        continue

    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'saved_weights.pt')
    print("\n----------------------------------------------------Saved best model------------------------------------------------------------------")

# Model Evaluation

In [None]:
#load weights of best model
path='saved_weights.pt'
model.load_state_dict(torch.load(path))

In [None]:
#predict probabilities
y_pred_prob = predict(X_valid, batch_size)

In [None]:
y_pred_prob[0]

In [None]:
#actual tags
y_true = y_valid.cpu().numpy()

In [None]:
#define candidate threshold values
threshold  = np.arange(0,0.5,0.01)
print(threshold)

In [None]:
# convert probabilities into classes or tags based on a threshold value
def classify(y_pred_prob, thresh):
  """Turn per-tag probabilities into 0/1 indicators.

  Args:
    y_pred_prob: 2-D array-like or torch tensor of probabilities, one row
      per sample.
    thresh: a probability greater than or equal to ``thresh`` becomes 1,
      anything lower becomes 0.

  Returns:
    An integer numpy array with the same shape as ``y_pred_prob``.
  """

  # torch tensors may live on the GPU or track gradients; bring them to host
  # memory before numpy reads them
  if hasattr(y_pred_prob, "detach"):
    y_pred_prob = y_pred_prob.detach().cpu().numpy()

  # a single vectorised comparison replaces the nested Python loops
  return (np.asarray(y_pred_prob) >= thresh).astype(int)

In [None]:
#the ground truth does not depend on the threshold, so flatten it once
y_true_1d = y_true.ravel()

#F1 score obtained with each candidate threshold
score = []

for thresh in threshold:

    #classes for this threshold, flattened to 1d
    y_pred = classify(y_pred_prob, thresh)
    y_pred_1d = y_pred.ravel()

    score.append(metrics.f1_score(y_true_1d, y_pred_1d))

In [None]:
# find the optimal threshold
opt = threshold[score.index(max(score))]
print(opt)

In [None]:
#predictions for optimal threshold
y_pred = classify(y_pred_prob, opt)

In [None]:
#converting to 1D
y_pred_1d = y_pred.ravel()

#Classification report
print(metrics.classification_report(y_true_1d, y_pred_1d))

In [None]:
#convert back to tags
y_pred_label = mlb.inverse_transform(np.array(y_pred))
y_true_label = mlb.inverse_transform(np.array(y_true))

# get all validation text
queries = [" ".join(i) for i in valid_text]

# create a dataframe to show the data and prediction side by side
df = pd.DataFrame({'Questions':queries,'Actual Tags':y_true_label,'Predicted Tags':y_pred_label})

# print first five rows
df.head()

# Show Inference

In [None]:
#raw text
text = "For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes"

In [None]:
#cleaning text
tokens = cleaner(text)
tokens[:5]

In [None]:
#first argument to the model is no. of samples
tokens = np.array(tokens).reshape(-1,len(tokens))
tokens.shape

In [None]:
#converting text to integer sequences
seq = convert2seq(tokens)
seq

In [None]:
#predictions
with torch.no_grad():
  if torch.cuda.is_available():
    seq = seq.cuda()
  pred_prob= model(seq)
  print(pred_prob)

In [None]:
#classify
pred = classify(pred_prob,opt)
pred

In [None]:
tags  = mlb.inverse_transform(pred)[0]
tags

In [None]:
def predict_tags(text):
  """Predict the tags of one raw question.

  Relies on the notebook-level ``model``, ``opt`` threshold and ``mlb``
  binarizer, and on ``cleaner``, ``convert2seq`` and ``classify``.

  Args:
    text: raw question text.

  Returns:
    The tags whose predicted probability reaches ``opt``, as returned by
    ``mlb.inverse_transform`` for the single sample.
  """

  tokens = cleaner(text)

  #a batch holding one token list; unlike reshape(-1, len(tokens)) this also
  #works when no token survives cleaning
  seq = convert2seq([tokens])

  if torch.cuda.is_available():
    seq = seq.cuda()

  #inference only: switch to eval mode and keep the forward pass inside
  #no_grad so that no autograd graph is built
  model.eval()
  with torch.no_grad():
    pred_prob = model(seq)

  #hand classify a plain host array
  pred = classify(pred_prob.cpu().numpy(), opt)

  tags = mlb.inverse_transform(pred)[0]

  return tags

In [None]:
text = "For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes"

tags = predict_tags(text)
print("Query: ", text)
print("Predicted tags:",tags)

# Model Building for LSTM

In [None]:
sample_embedding.shape

In [None]:
#define an LSTM
lstm_layer = LSTM(input_size=50, hidden_size=128, batch_first=True)

In [None]:
#pass the input to LSTM
hidden_states, (last_hidden_state,last_cell_state) = lstm_layer(sample_embedding)

In [None]:
#Hidden state of every timestep (Batch, seq_len, no. of hidden neurons)
hidden_states.shape

In [None]:
#output shape of last hidden timestep
last_hidden_state.shape

In [None]:
#output shape of last cell state
last_cell_state.shape

In [None]:
#reshaping the hidden states
reshaped = hidden_states.reshape(hidden_states.size(0),-1)
reshaped.shape

In [None]:
class Net(nn.Module):
    """LSTM tagger: embedding -> LSTM -> flatten -> dense -> sigmoid.

    The hidden states of all time steps are flattened into one vector per
    sample, so the model only accepts inputs of exactly ``seq_len`` tokens.

    Args:
        vocab_size: number of embedding rows; defaults to ``len(TEXT.vocab)``.
        embedding_dim: size of each word embedding.
        hidden_size: number of hidden units of the LSTM.
        seq_len: length of every input sequence; defaults to
            ``TEXT.fix_length``.
        num_tags: number of output probabilities per sample.
    """

    #Constructor
    def __init__(self, vocab_size=None, embedding_dim=100, hidden_size=128,
                 seq_len=None, num_tags=10):

        super(Net, self).__init__()

        # fall back to the notebook-level text field when sizes are not given
        if vocab_size is None:
            vocab_size = len(TEXT.vocab)
        if seq_len is None:
            seq_len = TEXT.fix_length

        #rnn block
        self.lstm_layer = Sequential(

            #embedding layer [batch_size,seq_len] -> [batch_size,seq_len,embedding_dim]
            Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim),

            #lstm layer, hidden states are [batch_size,seq_len,hidden_size]
            LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

            )

        #dense block
        self.dense_layer = Sequential(

            #[batch_size,seq_len*hidden_size], derived instead of a magic 12800
            Linear(seq_len * hidden_size, 128),

            ReLU(),

            Linear(128, num_tags),

            Sigmoid()

        )

    #forward pass
    def forward(self, x):
        """Return per-tag probabilities for a batch of index sequences."""

        #lstm layer; the last hidden and cell states are not used
        hidden_states, (last_hidden_state,last_cell_state) = self.lstm_layer(x)

        #flatten all time steps into one vector per sample
        hidden_states = hidden_states.reshape(hidden_states.size(0),-1)

        #dense layer
        outputs=self.dense_layer(hidden_states)

        return outputs

In [None]:
#define the model
model = Net()

In [None]:
#layers of the model
model

In [None]:
with torch.no_grad():
  pred = model(X_train[:1])
  print(pred)

In [None]:
#define optimizer and loss
optimizer = torch.optim.Adam(model.parameters())
criterion = BCELoss()

# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

In [None]:
# training configuration
N_EPOCHS = 10
batch_size = 32

# lowest validation loss observed so far
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # one optimisation pass followed by a pass over the validation set
    train_loss = train(X_train, y_train, batch_size)
    valid_loss = evaluate(X_valid, y_valid, batch_size)

    print('\nEpoch :',epoch,
          'Training loss:',round(train_loss,4),
          '\tValidation loss:',round(valid_loss,4))

    # nothing to save unless this epoch matches or beats the best loss
    if not (best_valid_loss >= valid_loss):
        continue

    # checkpoint the weights of the best epoch so far
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'saved_weights.pt')
    print("\n----------------------------------------------------Saved best model------------------------------------------------------------------")



# Model Evaluation

In [None]:
#load weights of best model
path='saved_weights.pt'
model.load_state_dict(torch.load(path))

In [None]:
#predict probabilities
y_pred_prob = predict(X_valid, batch_size)

In [None]:
y_pred_prob[0]

In [None]:
#the ground truth does not depend on the threshold, so flatten it once
y_true_1d = y_true.ravel()

#F1 score obtained with each candidate threshold
score = []

for thresh in threshold:

    #classes for this threshold, flattened to 1d
    y_pred = classify(y_pred_prob, thresh)
    y_pred_1d = y_pred.ravel()

    score.append(metrics.f1_score(y_true_1d, y_pred_1d))

In [None]:
# find the optimal threshold
opt = threshold[score.index(max(score))]
print(opt)

In [None]:
#predictions for optimal threshold
y_pred = classify(y_pred_prob, opt)

In [None]:
#converting to 1D
y_pred_1d = y_pred.ravel()

#Classification report
print(metrics.classification_report(y_true_1d, y_pred_1d))

In [None]:
y_pred_label = mlb.inverse_transform(np.array(y_pred))

In [None]:
df = pd.DataFrame({'comment':queries,'actual':y_true_label,'predictions':y_pred_label})

In [None]:
df.head()

In [None]:
text = "For example, in the case of logistic regression, the learning function is a Sigmoid function that tries to separate the 2 classes"

tags = predict_tags(text)
print("Query: ",text)
print("Predicted tags:",tags)