# Environment set up and data import

In [0]:
!pip install kaggle --upgrade

In [0]:
## Importing the necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
import torch.utils.data as data
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import random
import os
import zipfile
import torchvision.models as tvm
from torch.nn.utils.rnn import pack_padded_sequence as PACK
from torch.nn.utils.rnn import pad_packed_sequence
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
import urllib.request
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import KeyedVectors
import gensim

In [0]:
## Upload the kaggle .json file when prompted
## NOTE(review): kaggle.json holds API credentials - keep it out of version control and saved outputs.

# Colab-specific module, so this cell presumably only runs inside Google Colab - confirm.
from google.colab import files
# The download cell runs `mv kaggle.json ~/.kaggle/`, so the uploaded file must be named exactly kaggle.json.
uploaded = files.upload()

In [0]:
## Importing the data from the website

! mkdir -p ~/.kaggle/
! mv kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! mkdir -p /content/crowdflower-problem
% cd /content/crowdflower-problem

!kaggle competitions download -c crowdflower-search-relevance

## Downloading the word2vec file

!wget -P /content -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
w2vmodel = gensim.models.KeyedVectors.load_word2vec_format('/content/GoogleNews-vectors-negative300.bin.gz', binary=True) 

In [0]:
## Unzipping the data

# Absolute paths throughout, so this cell does not depend on the notebook's current working directory.
DATA_DIR = "/content/crowdflower-problem"

for split_name in ("train", "test"):
  archive_path = os.path.join(DATA_DIR, split_name + ".csv.zip")
  # The archive is deleted after extraction, so on a re-run it is simply skipped
  # instead of failing with FileNotFoundError.
  if os.path.exists(archive_path):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
      zip_ref.extractall(os.path.join(DATA_DIR, split_name))
    os.remove(archive_path)

train_csv = pd.read_csv(os.path.join(DATA_DIR, "train", "train.csv"),engine='python')
test_csv = pd.read_csv(os.path.join(DATA_DIR, "test", "test.csv"),engine='python')

# Data Pre-processing

In [0]:
###Data cleaning function

def data_cleaning(data_csv, stop_words=None):
  """Tokenise the query, title and description columns of a CrowdFlower frame.

  The frame is modified in place and also returned.

  Args:
    data_csv: DataFrame with 'query', 'product_title' and 'product_description' columns.
    stop_words: optional iterable of tokens to drop from the title and description.
      Defaults to the module-level `stop` list (NLTK English stopwords).

  Returns:
    The same DataFrame with 'product_title' / 'product_description' stripped of
    non-word characters and three new list-valued columns: 'query_cleaned',
    'product_title_cleaned' and 'product_description_cleaned'.
  """
  if stop_words is None:
    stop_words = stop
  # Set membership is O(1); the filter below runs once per token.
  stop_words = set(stop_words)

  ## A missing description falls back to the product title
  missing_desc = data_csv['product_description'].isnull()
  data_csv.loc[missing_desc, 'product_description'] = data_csv.loc[missing_desc, 'product_title']

  ### Replace every non-word character with a space.
  ### regex=True is required: since pandas 2.0 str.replace treats the pattern literally by default.
  for column in ('product_title', 'product_description'):
    data_csv[column] = data_csv[column].str.replace(r'\W', ' ', regex=True)

  ### Lowercase and split on whitespace (the query keeps its stopwords)
  data_csv['query_cleaned'] = data_csv['query'].str.lower().str.split()

  ### Lowercase, split and remove stopwords
  for column in ('product_title', 'product_description'):
    tokens = data_csv[column].str.lower().str.split()
    data_csv[column + '_cleaned'] = tokens.apply(lambda words: [word for word in words if word not in stop_words])

  return data_csv

###Query expansion function

def query_expansion(train_df, max_rank=15):
  """Expand each query with the most frequent words of its best-rated product titles.

  For every distinct query, the rows with that query's highest 'median_relevance'
  are selected. The query tokens (from the first such row) and all tokens of the
  selected titles are pooled and counted; the words whose frequency rank is
  <= max_rank form the expanded query.

  Args:
    train_df: DataFrame with 'query', 'median_relevance', 'query_cleaned' and
      'product_title_cleaned' columns (the last two list-valued).
    max_rank: largest frequency rank (average rank on ties) that is kept.

  Returns:
    DataFrame with one row per query, sorted by query, and columns
    'query' and 'expanded_query' (list of words, most frequent first).
  """
  records = []

  # One pass over the groups instead of re-filtering the whole frame per query.
  for query_name, group in train_df.groupby('query', sort=True):
    best_relevance = group['median_relevance'].max()
    top_rows = group[group['median_relevance'] == best_relevance]

    # Copy the token list: extending the list stored in the frame would
    # silently append title words to train_df['query_cleaned'].
    pooled_words = list(top_rows['query_cleaned'].iloc[0])
    for title_tokens in top_rows['product_title_cleaned']:
      pooled_words.extend(title_tokens)

    # Work on the counts Series directly; the column name produced by
    # value_counts() differs between pandas versions.
    word_counts = pd.Series(pooled_words, dtype=object).value_counts()
    word_ranks = word_counts.rank(ascending=False)
    words = word_counts[word_ranks <= max_rank].index.tolist()

    records.append({"query": query_name, "expanded_query": words})

  # Build the frame once; DataFrame.append was removed in pandas 2.0.
  return pd.DataFrame(records, columns=['query', 'expanded_query'])

### Converting data to lists as the input for the dataset class

def data_format_list(train_query_expanded):
  """Pull the model inputs out of the prepared frame as plain Python lists.

  Args:
    train_query_expanded: DataFrame holding the columns 'expanded_query',
      'product_description_cleaned', 'median_relevance' and 'relevance_variance'.

  Returns:
    (query_list, desc_list, relevance): the first three of those columns as lists.
  """
  wanted_columns = ["expanded_query","product_description_cleaned","median_relevance","relevance_variance"]
  # All four columns are selected (so a missing one raises), but only three are returned.
  selected = train_query_expanded[wanted_columns]

  query_list, desc_list, relevance = (
      selected[column].values.tolist() for column in wanted_columns[:3]
  )
  return query_list,desc_list,relevance

### Function to generate the vocabulary and word indices

def get_word_to_ix(query_list,desc_list):
  """Build the token -> integer id vocabulary.

  Ids are handed out in order of first appearance, scanning every query first
  and then every description.

  Args:
    query_list: iterable of token lists.
    desc_list: iterable of token lists.

  Returns:
    dict mapping each distinct token to its id (0 .. vocabulary size - 1).
  """
  vocabulary = {}
  for token_lists in (query_list, desc_list):
    for tokens in token_lists:
      for word in tokens:
        # setdefault leaves an existing id untouched and assigns the next free id otherwise.
        vocabulary.setdefault(word, len(vocabulary))
  return vocabulary

### Function to convert tokens to ids

def get_training_data(query_list,desc_list,relevance,word_to_ix):
  """Translate token lists into lists of vocabulary ids.

  Args:
    query_list: iterable of token lists for the queries.
    desc_list: iterable of token lists for the descriptions.
    relevance: labels, returned unchanged.
    word_to_ix: dict mapping token -> id; a token missing from it raises KeyError.

  Returns:
    (query_data, desc_data, relevance) where the first two are lists of id lists.
  """
  query_data = [[word_to_ix[word] for word in tokens] for tokens in query_list]
  desc_data = [[word_to_ix[word] for word in tokens] for tokens in desc_list]
  return query_data,desc_data,relevance

## Creating a weight matrix for our vocabulary. The weights are from the w2v model

def create_word2vec_weight_matrix (model,embedding_size,target_vocab):
  """Build the initial embedding matrix for the vocabulary.

  Args:
    model: word-vector lookup supporting model[word] and raising KeyError for
      unknown words (e.g. gensim KeyedVectors, or a plain dict).
    embedding_size: length of every word vector.
    target_vocab: dict mapping token -> row index.

  Returns:
    Array of shape (len(target_vocab) + 1, embedding_size). Rows of known words
    hold their pretrained vector, rows of unknown words are drawn from
    N(0, 0.6^2), and the extra last row is left as zeros.
  """
  weights_matrix = np.zeros((len(target_vocab)+1,embedding_size))

  for word, row in target_vocab.items():
    try:
      # Item lookup works on every gensim version; word_vec() is deprecated in gensim 4.
      weights_matrix[row] = model[word]
    except KeyError:
      # Out-of-vocabulary word: random initialisation.
      weights_matrix[row] = np.random.normal(scale=0.6, size=(embedding_size, ))

  return weights_matrix

In [0]:
## Executing the above functions
train_df = data_cleaning(train_csv)
test_df = data_cleaning(test_csv)
query_expanded = query_expansion(train_df)
# Attach each query's expansion to its rows. The merged frame must be the one passed on:
# data_format_list reads the 'expanded_query' column, which train_df alone does not have.
train_query_expanded = train_df.merge(query_expanded,on = "query", how = "left")
query_list,desc_list,relevance = data_format_list(train_query_expanded)
word_to_ix = get_word_to_ix(query_list,desc_list)

###This has to be the input to the dataset class
query_data,desc_data,relevance = get_training_data(query_list,desc_list,relevance,word_to_ix)

### Creating the weight matrix
embedding_size = 300
weights_matrix = create_word2vec_weight_matrix(w2vmodel,embedding_size,word_to_ix)

# Dataset and dataloader wrapping for pytorch model

In [0]:
### Wrapping the data into a dataset class, as for all pytorch models.

class CrowdDataset (data.Dataset):
  """Map-style dataset over parallel lists of query ids, description ids and labels."""

  def __init__(self,query_list,desc_list,relevance,word_to_ix):
    """Keep references to the id lists, the labels and the vocabulary.

    The sample count is taken from query_list once, at construction time.
    """
    super().__init__()

    self.query_list, self.desc_list = query_list, desc_list
    self.relevance = relevance
    self.word_to_ix = word_to_ix
    self.no_of_samples = len(query_list)

  def __len__(self):
    """Number of samples recorded at construction."""
    return self.no_of_samples

  def __getitem__(self,index):
    """Return (query ids as LongTensor, description ids as LongTensor, raw label)."""
    return (
        torch.LongTensor(self.query_list[index]),
        torch.LongTensor(self.desc_list[index]),
        self.relevance[index],
    )

def collate_fn(data, padding_value=None):

  '''
  Collate a list of (query_ids, desc_ids, label) samples into padded batch tensors.

  Queries and descriptions are each sorted by decreasing length (as required for
  packing) and padded to the longest sequence of the batch with pad_sequence.
  The two fields are sorted independently, so the permutation applied to each is
  returned to let the caller restore the original sample order.

  Args:
    data: list of (LongTensor, LongTensor, label) tuples.
    padding_value: id used for padding. Defaults to len(word_to_ix), i.e. the extra
      last row that create_word2vec_weight_matrix reserves beyond the vocabulary.

  Returns:
    (query_padded, desc_padded, query_lengths, desc_lengths,
     q_sorted_idx, d_sorted_idx, labels) where *_sorted_idx[i] is the position in
     `data` of the i-th sorted sequence and labels are shifted to start at 0,
     in the original sample order.
  '''
  if padding_value is None:
    # NOTE(review): read from the notebook-level vocabulary instead of a hard-coded
    # id, which is only valid for one specific vocabulary size.
    padding_value = len(word_to_ix)

  def _sort_and_pad(field):
    # A single permutation drives both the reordered sequences and the returned
    # indices, so they agree even when several sequences have the same length.
    order = sorted(range(len(data)), key=lambda i: data[i][field].shape[0], reverse=True)
    sequences = [data[i][field] for i in order]
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=padding_value)
    lengths = torch.LongTensor([len(seq) for seq in sequences])
    return padded, lengths, torch.LongTensor(order)

  query_padded, query_lengths, q_sorted_idx = _sort_and_pad(0)
  desc_padded, desc_lengths, d_sorted_idx = _sort_and_pad(1)

  # Labels in the data are 1-based; shift them to 0-based class ids.
  labels = [(x[2]-1) for x in data]

  return query_padded,desc_padded,query_lengths,desc_lengths,q_sorted_idx,d_sorted_idx,torch.tensor(labels)

###Creating validation data

## Sampling data for train and validation

def val_sampler (dataset,validation_split):
  """Randomly split the dataset indices into a training and a validation sampler.

  Args:
    dataset: any object supporting len().
    validation_split: fraction of the samples held out for validation.

  Returns:
    (train_sampler, val_sampler, number of training samples, number of validation samples).
    Both samplers are SubsetRandomSampler instances over disjoint index lists.
  """
  dataset_len = len(dataset)
  indices = list(range(dataset_len))

  val_len = int(np.floor(validation_split * dataset_len))
  # .tolist() turns the numpy integers into plain Python ints.
  validation_idx = np.random.choice(indices, size=val_len, replace=False).tolist()
  held_out = set(validation_idx)
  train_idx = [i for i in indices if i not in held_out]

  # Local names deliberately differ from the function name to avoid shadowing it.
  train_subset_sampler = SubsetRandomSampler(train_idx)
  val_subset_sampler = SubsetRandomSampler(validation_idx)

  return train_subset_sampler,val_subset_sampler,len(train_idx),val_len

In [0]:
## Calling the dataset class and the validation sampler function

dataset = CrowdDataset(query_data,desc_data,relevance,word_to_ix)
# NOTE(review): this assignment rebinds the name `val_sampler` from the function to the
# sampler it returns, so running this cell a second time fails unless the cell defining
# the function is re-run first. 0.1 is the validation fraction.
train_sampler,val_sampler,train_len,val_len = val_sampler(dataset,0.1)

In [0]:
## Creating the dataloader wrapper to batch the data

# Both loaders read the same dataset; the sampler decides which rows each one draws.
train_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=50,
    sampler=train_sampler,
    collate_fn=collate_fn,
)
val_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=50,
    sampler=val_sampler,
    collate_fn=collate_fn,
)

# Phase-keyed lookups read by the training loop.
data_loaders = dict(train=train_loader, val=val_loader)
data_lengths = dict(train=train_len, val=val_len)

In [0]:
### Function to create the embedding layer using the weight matrix created before

def create_emb_layer(weights_matrix, non_trainable=False):
    """Create an nn.Embedding initialised from a numpy weight matrix.

    Args:
        weights_matrix: 2-D numpy array, one row per embedding index.
        non_trainable: when truthy the embedding weights are frozen.

    Returns:
        (embedding layer, number of rows, embedding width).
    """
    row_count, vector_width = weights_matrix.shape
    layer = nn.Embedding(row_count, vector_width)
    # copy_ converts the numpy values to the layer's own dtype.
    layer.weight.data.copy_(torch.from_numpy(weights_matrix))
    layer.weight.requires_grad = not non_trainable
    return layer, row_count, vector_width

# Model architecture


In [0]:
'''
The model architecture is very similar to a Siamese network. There are two recurrent heads, one for the query and one for the description; each head is a GRU layer followed by an LSTM layer, and the two heads do not share recurrent weights.
The outputs of the two heads are concatenated into a single feature vector, which is passed through the fully connected layers to produce the class scores.

The entire architecture is divided into two classes: LSTMencoder and Siamese_lstm.

'''

class LSTMencoder (nn.Module):
  """Encode a padded query batch and a padded description batch into one vector per sequence.

  Both branches look tokens up in the same embedding table and then run a GRU layer
  followed by an LSTM layer (the attribute names say "lstm" for both layers).
  """

  def __init__(self, weights_matrix, hidden_dim_q_1,hidden_dim_q_2,hidden_dim_d_1,hidden_dim_d_2,device):
    """Create the shared embedding and the two two-layer recurrent branches.

    Args:
      weights_matrix: numpy matrix used to initialise the embedding table.
      hidden_dim_q_1, hidden_dim_q_2: hidden sizes of the query GRU and LSTM.
      hidden_dim_d_1, hidden_dim_d_2: hidden sizes of the description GRU and LSTM.
      device: device the extracted features are moved to in forward().
    """
    super().__init__()

    self.embedding_token, _row_count, emb_width = create_emb_layer(weights_matrix)

    # Query branch
    self.lstm_q_1 = nn.GRU(emb_width, hidden_dim_q_1, batch_first=True)
    self.lstm_q_2 = nn.LSTM(hidden_dim_q_1, hidden_dim_q_2, batch_first=True)

    # Description branch
    self.lstm_d_1 = nn.GRU(emb_width, hidden_dim_d_1, batch_first=True)
    self.lstm_d_2 = nn.LSTM(hidden_dim_d_1, hidden_dim_d_2, batch_first=True)

    self.final_hidden_q = hidden_dim_q_2
    self.final_hidden_d = hidden_dim_d_2

    self.device = device

  def forward(self,query_batch,desc_batch,q_lengths,d_lengths,q_ix,d_ix):
    """Return the last-step features of both branches, in sorted and in original order.

    Args:
      query_batch, desc_batch: padded id tensors, batch first, rows sorted by
        decreasing length (packing is called with its default enforce_sorted).
      q_lengths, d_lengths: true sequence lengths matching the sorted rows.
      q_ix, d_ix: for each sorted row, its position in the original batch.

    Returns:
      (query features sorted, description features sorted,
       query features in original order, description features in original order).
    """
    q_embedded = self.embedding_token(query_batch)
    d_embedded = self.embedding_token(desc_batch)

    # Packing makes the recurrent layers skip the padded positions.
    q_hidden, _ = self.lstm_q_1(PACK(q_embedded, q_lengths, batch_first=True))
    q_hidden, _ = self.lstm_q_2(q_hidden)

    d_hidden, _ = self.lstm_d_1(PACK(d_embedded, d_lengths, batch_first=True))
    d_hidden, _ = self.lstm_d_2(d_hidden)

    q_steps, _ = pad_packed_sequence(q_hidden, batch_first=True)
    d_steps, _ = pad_packed_sequence(d_hidden, batch_first=True)

    # The feature vector of a sequence is the output at its last real time step.
    # Build, per row, an index (length - 1) repeated across the feature dimension.
    step_axis = 1
    q_last_step = (torch.LongTensor(q_lengths) - 1).view(-1, 1)
    q_last_step = q_last_step.expand(len(q_lengths), q_steps.size(2)).unsqueeze(step_axis)
    d_last_step = (torch.LongTensor(d_lengths) - 1).view(-1, 1)
    d_last_step = d_last_step.expand(len(d_lengths), d_steps.size(2)).unsqueeze(step_axis)

    if q_steps.is_cuda or d_steps.is_cuda:
      # Both index tensors follow the device of the query outputs.
      target_gpu = q_steps.data.get_device()
      q_last_step = q_last_step.cuda(target_gpu)
      d_last_step = d_last_step.cuda(target_gpu)

    last_output_q = q_steps.gather(step_axis, q_last_step).squeeze(step_axis).to(self.device)
    last_output_d = d_steps.gather(step_axis, d_last_step).squeeze(step_axis).to(self.device)

    # Queries and descriptions were sorted independently for packing. Scatter each
    # sorted row back to its original position so row i of both outputs belongs to
    # the same sample.
    q_rows = q_ix.unsqueeze(1).expand(-1, last_output_q.shape[1])
    d_rows = d_ix.unsqueeze(1).expand(-1, last_output_d.shape[1])
    last_output_query = torch.zeros_like(last_output_q).scatter_(0, q_rows, last_output_q)
    last_output_desc = torch.zeros_like(last_output_d).scatter_(0, d_rows, last_output_d)

    return last_output_q,last_output_d,last_output_query,last_output_desc

class Siamese_lstm(nn.Module):
  """Relevance classifier: two-branch encoder followed by two linear layers."""

  def __init__(self, weights_matrix, hidden_dim_q_1,hidden_dim_q_2,hidden_dim_d_1,hidden_dim_d_2,label_size,device):
    """Build the encoder and the classification head.

    Args:
      weights_matrix: numpy matrix used to initialise the encoder's embedding table.
      hidden_dim_q_1, hidden_dim_q_2: hidden sizes of the query branch.
      hidden_dim_d_1, hidden_dim_d_2: hidden sizes of the description branch.
      label_size: number of output classes.
      device: device passed on to the encoder.
    """
    super(Siamese_lstm, self).__init__()
    self.encoder = LSTMencoder(weights_matrix,hidden_dim_q_1,hidden_dim_q_2,hidden_dim_d_1,hidden_dim_d_2,device)

    # Width of the concatenated [query, description] feature vector. Summing the two
    # sizes (rather than doubling the query size) keeps the first linear layer correct
    # when the branches end in different hidden sizes.
    self.feature_len = self.encoder.final_hidden_q + self.encoder.final_hidden_d
    self.feature_len2 = 32
    self.final_layer = nn.Linear(self.feature_len,self.feature_len2)
    self.final_layer_1 = nn.Linear(self.feature_len2,label_size)

  def forward(self,query_batch,desc_batch,q_lengths,d_lengths,q_ix,d_ix):
    """Return unnormalised class scores of shape (batch, label_size).

    The arguments are passed to the encoder unchanged; only its features in
    original sample order are used.
    """
    _,_,query_feature,desc_feature = self.encoder(query_batch,desc_batch,q_lengths,d_lengths,q_ix,d_ix)

    final_vector = torch.cat((query_feature,desc_feature), 1)

    # Two stacked linear layers with no activation in between.
    output = self.final_layer(final_vector)
    output_1 = self.final_layer_1(output)

    return output_1

# Model parameters and running the model

In [0]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Positional arguments: hidden sizes 32 -> 64 for the query branch, 32 -> 64 for the
# description branch, then 4 output classes.
model = Siamese_lstm(weights_matrix,32,64,32,64,4,device)
model = model.to(device)

In [0]:
##Giving the loss function, optimizer and the scheduler

num_epochs = 20

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# train_model calls scheduler.step() once per epoch, while CyclicLR measures its cycle in
# scheduler steps (default step_size_up=2000). With the default, 20 steps leave the learning
# rate around 1e-4 at most. Sizing the ramp in epochs lets it travel
# base_lr -> max_lr -> base_lr within the run.
exp_lr_scheduler = lr_scheduler.CyclicLR(
    optimizer_ft,
    base_lr = 0.000001,
    max_lr = 0.01,
    step_size_up = max(1, num_epochs // 2),
)

In [0]:
### Function to train the model

def train_model(model,criterion,optimizer,scheduler,num_epochs):
  """Train the model and return it loaded with the best validation weights.

  Every epoch runs a 'train' phase followed by a 'val' phase. Batches come from the
  notebook-level `data_loaders[phase]`, per-phase sample counts from
  `data_lengths[phase]`, and tensors are moved to the notebook-level `device`.

  Args:
    model: module called as model(query_batch, desc_batch, query_lengths,
      desc_lengths, q_sorted_idx, d_sorted_idx) and returning class scores.
    criterion: loss taking (scores, labels).
    optimizer: optimizer over the model parameters.
    scheduler: learning-rate scheduler; stepped once per epoch, after the train phase.
    num_epochs: number of epochs to run.

  Returns:
    The same model object, with the weights of the epoch that reached the highest
    validation accuracy loaded into it.
  """
  since = time.time()
  best_model_wts = copy.deepcopy(model.state_dict())
  best_acc = 0.0

  for epoch in range(num_epochs):

    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    print('-' * 10)

    for phase in ['train','val']:

      is_train = (phase == 'train')
      if is_train:
        model.train()
      else:
        model.eval()

      running_loss = 0.0
      # Plain Python int, so the accuracy computation also works when a loader yields no batches.
      running_corrects = 0

      ##Iterate over data

      for query_batch,desc_batch,query_lengths,desc_lengths,q_sorted_idx,d_sorted_idx,labels in data_loaders[phase]:

        # The length tensors stay on the CPU; they are only used for packing.
        query_batch = query_batch.to(device)
        desc_batch = desc_batch.to(device)
        labels = labels.to(device)
        q_sorted_idx = q_sorted_idx.to(device)
        d_sorted_idx = d_sorted_idx.to(device)

        optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
          outputs = model(query_batch,desc_batch,query_lengths,desc_lengths,q_sorted_idx,d_sorted_idx)
          _,preds = torch.max(outputs,1)

          loss = criterion(outputs,labels)

          if is_train:
            # Each batch builds a fresh graph and is back-propagated once, so the
            # graph need not be retained (retaining it only holds memory longer).
            loss.backward()
            optimizer.step()

        ##Statistics

        # The criterion averages over the batch; weight by batch size to sum per sample.
        running_loss += loss.item() * query_batch.size(0)
        running_corrects += torch.sum(preds == labels).item()

      epoch_loss = running_loss / data_lengths[phase]
      epoch_acc = running_corrects / data_lengths[phase]

      print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

      if is_train:
        scheduler.step()

      if phase == 'val' and epoch_acc > best_acc:
          best_acc = epoch_acc
          best_model_wts = copy.deepcopy(model.state_dict())

    print()

  time_elapsed = time.time() - since
  print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
  print('Best Accuracy: {:.4f}'.format(best_acc))

  model.load_state_dict(best_model_wts)
  return model

In [0]:
### Calling the model function
# train_model loads the best validation weights into the model it was given and returns
# that same object, so model_1 and model refer to one module.
model_1 = train_model(model,criterion,optimizer_ft,exp_lr_scheduler,num_epochs)