Part 1: Cosine Similarities

In [1]:
from gensim.models import FastText
from scipy import spatial

def get_sentence_embedding(model, sentence):
  # This method takes in the trained model and the input sentence
  # and returns the embedding of the sentence as the average embedding
  # of its words
  words = sentence.split(" ")
  vector = model.wv[words[0]].copy()
  for i in range(1, len(words)):
    vector += model.wv[words[i]]
  return vector/len(words)

In [2]:
import csv
from post_parser_record import PostParserRecord

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test

In [3]:
import random
def generate_negative_samples(dic, lst_of_ids):
  new_dic = {}
  lst = []
  cnt = 0
  for id in dic:
    for value in dic[id]:
      cnt += 1
      lst.append(value)
    new_dic[id] = []
    for _ in range(cnt):
      while True:
        random_item = random.choice(lst_of_ids)
        if random_item not in lst:
          break
      new_dic[id].append(random_item)
    cnt = 0
  return new_dic

In [4]:
# Get IDs for the questions we are testing for similar questions
positive_samples_id, lst_all_test = read_tsv_test_data("duplicate_questions.tsv")
post_reader = PostParserRecord("Posts_law.xml" )


In [5]:
lst_of_ids = []
for id in post_reader.map_questions:
  lst_of_ids.append(id)

In [6]:
# Helper method to get and format similar questions from id
def get_text_from_id(post_reader, dic):
  lst = []
  for id in dic:
    question = post_reader.map_questions[id]
    key = question.title + " " + question.body
    key = re.sub('<[^<]+?>', '', key)
    values = []
    for value in dic[id]:
      question = post_reader.map_questions[value]
      text = question.title + " " + question.body
      text = re.sub('<[^<]+?>', '', text)
      values.append(text)
    lst.append([key, values])
  return lst

Create a list of all the questions and answers use to train our model

In [8]:
import nltk
nltk.download('punkt')
import re
# Collecting sentences from questions and answers
lst_training_sentences = []
for question_id in post_reader.map_questions:
    if question_id in lst_all_test:
        continue
    question = post_reader.map_questions[question_id]
    title_sentences = nltk.sent_tokenize(question.title)
    processed_titles = []
    for sentence in title_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        processed_titles.append(words)
    body_sentences = nltk.sent_tokenize(question.body)
    process_bodies = []
    for sentence in body_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        process_bodies.append(words)
    # Collecting sentences from title and body
    lst_training_sentences.extend(processed_titles)
    lst_training_sentences.extend(process_bodies)

    lst_answers = question.answers
    if lst_answers is not None:
        for answer in lst_answers:
            answer_sentences = nltk.sent_tokenize(answer.body)
            processed_answers = []
            for sentence in answer_sentences:
                sentence = re.sub('<[^<]+?>', '', sentence)
                words = nltk.word_tokenize(sentence)
                processed_answers.append(words)
            lst_training_sentences.extend(processed_answers)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def train_model(lst_sentences):
  model = FastText(vector_size = 100, window = 5, min_n=1, sg = 1)
  model.build_vocab(corpus_iterable=lst_sentences)
  model.train(corpus_iterable=lst_sentences, total_examples=len(lst_sentences), epochs=10)
  return model

In [10]:
# Train your model
model = train_model(lst_training_sentences)
model.save("fast.model")

In [11]:
model = FastText.load("fast.model")

In [12]:
# Helper method to get vector representation
import numpy as np
def get_all_embeddings(model, dic, map):  
    new_dic = {}
    for id in dic:
        question = map[id]
        title = re.sub('<[^<]+?>', '', question.title)
        body = re.sub('<[^<]+?>', '', question.body)
        title_sentences = nltk.sent_tokenize(title)
        body_sentences = nltk.sent_tokenize(body)
        lst_title_embed = []
        lst_body_embed = []
        for sentence in title_sentences:
            lst_title_embed.append(get_sentence_embedding(model, sentence))
        avg_title_embed = np.mean(lst_title_embed, axis=0)
        for sentence in body_sentences:
            lst_body_embed.append(get_sentence_embedding(model, sentence))
        avg_body_embed = np.mean(lst_body_embed, axis=0)
        new_dic[id] = [avg_title_embed, avg_body_embed]
    return new_dic

In [58]:
all_embeddings = get_all_embeddings(model, post_reader.map_questions, post_reader.map_questions)
all_positive_embeddings = get_all_embeddings(model, positive_samples_id, post_reader.map_questions)

In [56]:
def cosine_similarity_model(model, all_embeddings, all_positive_embeddings, positive_samples_id):
  # Finding similar questions using fastText model
  dictionary_result_by_title = {}
  dictionary_result_by_body = {}
  for test_question_id in positive_samples_id:
    max_title_similarity = -1
    max_body_similarity = -1
    most_similar_question_id_by_title = -1
    most_similar_question_id_by_body = -1
    for question_id in all_embeddings:
      # We are not comparing a question with itself
      if question_id == test_question_id:
        continue

      # Calculate the cosine similarity between the questions
      title_similarity = 1 - spatial.distance.cosine(all_positive_embeddings[test_question_id][0], all_embeddings[question_id][0])
      body_similarity = 1 - spatial.distance.cosine(all_positive_embeddings[test_question_id][1], all_embeddings[question_id][1])

      # Save the question id with the highest cosine similarity
      if title_similarity > max_title_similarity:
        max_title_similarity = title_similarity
        most_similar_question_id_by_title = question_id

      if body_similarity > max_body_similarity:
        max_body_similarity = title_similarity
        most_similar_question_id_by_body = question_id
        
    dictionary_result_by_title[test_question_id] = most_similar_question_id_by_title
    dictionary_result_by_body[test_question_id] = most_similar_question_id_by_body

  # Calculate average P@1
  p_at_1_sum = 0
  for id in dictionary_result_by_title:
    if dictionary_result_by_title[id] in positive_samples_id[id]:
      #print(f"title match: {dictionary_result_by_title[id]}")
      p_at_1_sum += 1
  p_at_1_avg_by_title = p_at_1_sum / len(dictionary_result_by_title)
  print(f"{p_at_1_sum} matches out of {len(dictionary_result_by_title)} questions")
  print(f"p@1 average for question titles: {p_at_1_avg_by_title}")
  # Calculate average P@1
  p_at_1_sum = 0
  for id in dictionary_result_by_body:
    if dictionary_result_by_body[id] in positive_samples_id[id]:
      #print(f"body match: {dictionary_result_by_body[id]}")
      p_at_1_sum += 1
  p_at_1_avg_by_body = p_at_1_sum / len(dictionary_result_by_body)
  print(f"{p_at_1_sum} matches out of {len(dictionary_result_by_body)} questions")
  print(f"p@1 average for question bodies: {p_at_1_avg_by_body}")

In [15]:
cosine_similarity_model(model, all_embeddings, all_positive_embeddings, positive_samples_id)

32 matches out of 282 questions
p@1 average for question titles: 0.11347517730496454
9 matches out of 282 questions
p@1 average for question bodies: 0.031914893617021274


Part 2

In [70]:
import torch
import torch.nn as nn

class FeedForwardNeuralNetwork(nn.Module):
  def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, output_dim):
    super(FeedForwardNeuralNetwork, self).__init__()

    self.layer_1 = nn.Linear(input_dim, hidden_dim_1)
    self.relu_1 = nn.ReLU()

    self.layer_2 = nn.Linear(hidden_dim_1, hidden_dim_2)
    self.relu_2 = nn.ReLU()
    
    self.layer_3 = nn.Linear(hidden_dim_2, output_dim)

  def forward(self, x):
    out = self.layer_1(x)
    out = self.relu_1(out)

    out = self.layer_2(out)
    out = self.relu_2(out)
    
    out = self.layer_3(out)
    return torch.sigmoid(out)

In [71]:
import torch
import torch.optim as optim
import matplotlib.pyplot as plt

class myModel():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = 100 
    hidden_dim_1 = 128
    hidden_dim_2 = 64
    out_dim = 1 

    model = FeedForwardNeuralNetwork(input_dim, hidden_dim_1, hidden_dim_2, out_dim)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # moving to GPU if available
    model.to(device)
    criterion = criterion.to(device)

    def calculate_accuracy(self, y_true, y_pred):
        y_pred = torch.round(y_pred)
        correct = (y_true == y_pred).float()
        acc = correct.sum() / len(correct)
        return acc

    def training(self, tfidfX_train, Y_train, tfidfX_val, Y_val, num_epochs):
        batch_size = 200
        X_train_mini_batches = torch.split(tfidfX_train, batch_size)
        Y_train_mini_batches = torch.split(Y_train, batch_size)

        train_losses = []
        train_accuracies = []
        val_losses = []
        val_accuracies = []

        best_accuracy = 0

        for epoch in range(num_epochs):
            epoch_loss = 0
            epoch_accuracy = 0
            validation_loss = 0
            val_accuracy = 0
            for X_train_mini_batch, Y_train_mini_batch in zip(X_train_mini_batches, Y_train_mini_batches):
                X_train_mini_batch = X_train_mini_batch.to(self.device)
                Y_train_mini_batch = Y_train_mini_batch.to(self.device)

                # forward pass
                train_prediction = self.model.forward(X_train_mini_batch.float())

                # returns a tensor with all the dimensions of input of size 1 removed
                train_prediction = torch.squeeze(train_prediction)

                # calculate loss
                train_loss = self.criterion(train_prediction, Y_train_mini_batch)

                # clearing up acculated gradients
                self.optimizer.zero_grad()

                # getting gradients
                train_loss.backward()

                # updating parameters
                self.optimizer.step()

                # add each mini batch's loss
                epoch_loss += train_loss.item()

                # add each mini batch's accuracy
                epoch_accuracy += self.calculate_accuracy(Y_train_mini_batch, train_prediction)

            tfidfX_val = tfidfX_val.to(self.device)
            Y_val = Y_val.to(self.device)

            # Forward pass to get output
            val_prediction = self.model.forward(tfidfX_val.float())
            val_prediction = torch.squeeze(val_prediction)

            # Calculate Loss
            val_loss = self.criterion(val_prediction, Y_val)
            # print(val_loss)
            # Add each mini batch's loss
            validation_loss = val_loss.item()

            # Add each mini batch's accuracy
            val_accuracy = self.calculate_accuracy(Y_val, val_prediction)
            if val_accuracy > best_accuracy:
                torch.save(self.model.state_dict(), 'best_model_state.bin')
                best_accuracy = val_accuracy

    def __loadModel(self, ):
        self.model.load_state_dict(torch.load('best_model_state.bin'))

    def calc(self, Y_true, Y_pred):
        Y_true_list = Y_true.cpu().tolist()
        Y_pred_list = Y_pred.cpu().tolist()
        TP = TN = FP = FN = 0
        for true_label, predicted_label in zip(Y_true_list, Y_pred_list):
            predicted_label = round(predicted_label)
            if true_label == 1 and predicted_label == 1:
                TP += 1
            elif true_label == 0 and predicted_label == 0:
                TN += 1
            elif true_label == 0 and predicted_label == 1:
                FP += 1
            elif true_label == 1 and predicted_label == 0:
                FN += 1
        return TP, TN, FP, FN

    def test(self, tfidfX_test, Y_test):
        self.__loadModel()
        tfidfX_test = tfidfX_test.to(self.device)
        Y_test = Y_test.to(self.device)
        # forward pass to get output
        test_prediction = self.model.forward(tfidfX_test.float())
        test_prediction = torch.squeeze(test_prediction)

        # calculate accuracy
        test_accuracy = self.calculate_accuracy(Y_test, test_prediction)

        print(Y_test)
        print(test_prediction)

        TP, TN, FP, FN = self.calc(Y_test, test_prediction)


        print("True Positives:", TP)
        print("True Negatives:", TN)
        print("False Positives:", FP)
        print("False Negatives:", FN)
        print("Test Accuracy:", round(test_accuracy.item(), 4), "\n")

In [67]:
def get_all_embeddings_both(model, dic, map):  
    new_dic = {}
    for id in dic:
        question = map[id]
        title = re.sub('<[^<]+?>', '', question.title)
        body = re.sub('<[^<]+?>', '', question.body)
        title_sentences = nltk.sent_tokenize(title)
        body_sentences = nltk.sent_tokenize(body)
        lst_embed = []
        for sentence in title_sentences:
            lst_embed.append(get_sentence_embedding(model, sentence))
        for sentence in body_sentences:
            lst_embed.append(get_sentence_embedding(model, sentence))
        avg_embed1 = np.mean(lst_embed, axis=0)
        question = map[dic[id][0]]
        title = re.sub('<[^<]+?>', '', question.title)
        body = re.sub('<[^<]+?>', '', question.body)
        title_sentences = nltk.sent_tokenize(title)
        body_sentences = nltk.sent_tokenize(body)
        lst_embed = []
        for sentence in title_sentences:
            lst_embed.append(get_sentence_embedding(model, sentence))
        for sentence in body_sentences:
            lst_embed.append(get_sentence_embedding(model, sentence))
        avg_embed2 = np.mean(lst_embed, axis=0)
        new_dic[id] = avg_embed1 + avg_embed2
    return new_dic

In [48]:
negative_samples_id = generate_negative_samples(positive_samples_id, lst_of_ids)

In [49]:
all_positive_embeddings_both = get_all_embeddings_both(model, positive_samples_id, post_reader.map_questions)
all_negative_embeddings_both = get_all_embeddings_both(model, negative_samples_id, post_reader.map_questions)

In [72]:
import numpy as np
import random
from post_parser_record import PostParserRecord

modelFeedForward = myModel()
x = []
y = []

for id in positive_samples_id:
    x.append(all_positive_embeddings_both[id])
    y.append(1)
    x.append(all_negative_embeddings_both[id])
    y.append(0)
# Split the data into three sets
train_size = int(0.8 * len(x))
val_size = int(0.1 * len(x))

training_x = x[:train_size]
training_y = y[:train_size]

val_x = x[train_size:train_size+val_size]
val_y = y[train_size:train_size+val_size]

test_x = x[train_size+val_size:]
test_y = y[train_size+val_size:]




# (Question 1, Question 2)
# 0 = not duplicate, 1 = duplicate
training_x = torch.from_numpy(np.asarray(training_x)).type(torch.FloatTensor)
training_y = torch.from_numpy(np.asarray(training_y)).type(torch.FloatTensor)
val_x = torch.from_numpy(np.asarray(val_x)).type(torch.FloatTensor)
val_y = torch.from_numpy(np.asarray(val_y)).type(torch.FloatTensor)
print("training begins")
modelFeedForward.training(training_x, training_y, val_x, val_y, 180)

test_x = torch.from_numpy(np.asarray(test_x)).type(torch.FloatTensor)
test_y = torch.from_numpy(np.asarray(test_y)).type(torch.FloatTensor)
modelFeedForward.test(test_x, test_y)


training begins
tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0.])
tensor([1.5784e-01, 9.9880e-01, 1.9526e-01, 5.6627e-01, 4.9733e-02, 5.0210e-02,
        2.7983e-04, 4.5946e-02, 1.1795e-02, 8.9183e-01, 4.3503e-01, 4.6777e-01,
        5.5502e-01, 5.7859e-01, 4.9181e-02, 7.4047e-01, 8.7407e-01, 6.1477e-02,
        1.0639e-01, 9.8863e-01, 7.7202e-01, 5.3641e-03, 3.3013e-03, 9.9052e-01,
        2.6276e-02, 9.9962e-01, 1.4722e-01, 9.3124e-01, 5.4761e-01, 7.9023e-01,
        6.9279e-02, 9.9985e-01, 7.0472e-01, 9.8708e-01, 3.0379e-01, 7.7190e-03,
        6.4215e-02, 9.6759e-01, 2.5972e-03, 9.9031e-01, 4.0365e-02, 9.8776e-01,
        4.2870e-01, 9.9969e-01, 4.7910e-01, 5.5010e-01, 8.3100e-01, 4.8961e-01,
        6.6822e-02, 6.0577e-01, 9.9883e-01, 9.8310e-01, 9.3793e-01, 9.9759e-01,
   