Reading Law Stack Exchange Data

In [None]:
import csv
from post_parser_record import PostParserRecord
from gensim.models import FastText
import nltk
nltk.download('punkt')
import re
import numpy as np

def read_tsv_test_data(file_path):
  """Read the duplicate-question TSV file.

  Each non-empty row holds a question id in its first column followed by the
  ids of the questions that are similar to it.

  Args:
    file_path: path of the tab-separated file.

  Returns:
    A tuple ``(dic_similar_questions, lst_all_test)`` where
    ``dic_similar_questions`` maps each first-column question id to the list
    of ids that follow it on the same row, and ``lst_all_test`` lists every id
    that appears anywhere in the file (first-column ids and their similar
    ids, in file order, duplicates kept).

  Raises:
    ValueError: if a non-empty cell is not an integer.
  """
  dic_similar_questions = {}
  lst_all_test = []
  # newline="" is what the csv module requires for files it reads itself.
  with open(file_path, newline="") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
      # Drop empty cells (e.g. from a trailing tab) before converting.
      ids = [int(cell) for cell in row if cell.strip()]
      if not ids:
        # Blank line: nothing to record.
        continue
      question_id, lst_similar = ids[0], ids[1:]
      dic_similar_questions[question_id] = lst_similar
      lst_all_test.append(question_id)
      lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test


def train_model(lst_sentences, vector_size=100, window=5, min_count=5,
                epochs=10, workers=4):
  """Train a skip-gram FastText model on tokenized sentences.

  The model is created without a corpus on purpose: handing the corpus to the
  constructor makes gensim build the vocabulary and train immediately, which
  would be thrown away by the explicit ``build_vocab`` / ``train`` calls below.

  Args:
    lst_sentences: list of sentences, each sentence being a list of string
      tokens (the format gensim expects).
    vector_size: size of the word vectors.
    window: context window size.
    min_count: minimum count of words to include in the vocabulary.
    epochs: number of passes over the corpus.
    workers: number of worker threads to use.

  Returns:
    The trained FastText model.
  """
  model = FastText(
      vector_size=vector_size,
      window=window,
      min_count=min_count,
      sg=1,  # use skip-gram rather than CBOW
      workers=workers,
  )
  model.build_vocab(corpus_iterable=lst_sentences)
  # train the model (exactly once)
  model.train(
      corpus_iterable=lst_sentences,
      total_examples=len(lst_sentences),
      epochs=epochs,
  )

  return model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# separating out to save time
duplicate_file = "duplicate_questions.tsv"
post_file = "Posts_law.xml"
dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)
post_reader = PostParserRecord(post_file)
lst_training_sentences = []
embeddings = {}
# Set copy of the test ids so the per-question membership check is O(1).
set_all_test = set(lst_all_test)
for question_id in post_reader.map_questions:
  # Questions that appear in the test file are held out of the training corpus.
  if question_id in set_all_test:
    continue
  question = post_reader.map_questions[question_id]
  title = question.title
  body = question.body
  # Collect sentences here: strip HTML tags, then split into sentences.
  # NOTE(review): assumes title and body are always strings — confirm against
  # PostParserRecord.
  processed_title = re.sub('<[^<]+?>', ' ', title)
  token_title = nltk.sent_tokenize(processed_title)
  processed_body = re.sub('<[^<]+?>', ' ', body)
  token_body = nltk.sent_tokenize(processed_body)

  # gensim expects every training sentence to be a list of tokens; a raw string
  # would be consumed character by character.
  for sentence in token_title + token_body:
    lst_training_sentences.append(nltk.word_tokenize(sentence))

In [None]:
# Fit the FastText embeddings on the sentences collected above.
model = train_model(lst_training_sentences)

# Persist the fitted model to disk.
model.save("fastText.model")



In [None]:
def _embed_text(ft_model, text):
  """Return one vector for a piece of (possibly HTML) text.

  The text is stripped of HTML tags and split into sentences; each sentence
  vector is the mean of its word vectors, and the result is the mean of the
  sentence vectors. Empty text yields a zero vector of the model's size.
  """
  plain_text = re.sub('<[^<]+?>', ' ', text or '')
  sentence_vectors = []
  for sentence in nltk.sent_tokenize(plain_text):
    tokens = nltk.word_tokenize(sentence)
    if tokens:
      word_vectors = [ft_model.wv[token] for token in tokens]
      sentence_vectors.append(np.mean(word_vectors, axis=0))
  if not sentence_vectors:
    return np.zeros(ft_model.wv.vector_size)
  return np.mean(sentence_vectors, axis=0)


def main():
  """Embed every question, rank by cosine similarity and report P@1.

  Reads the notebook-level ``post_reader``, ``model`` (the trained FastText
  model), ``dic_similar_questions`` and fills the notebook-level
  ``embeddings`` dict with one vector per question (title embedding
  concatenated with body embedding).

  For each test question the most similar *other* question is the prediction;
  it counts as a hit when it is one of the labelled similar questions. Test
  questions missing from the post dump count as misses.

  Answer text is not used here: training sentences have to be collected before
  the model is trained, so extending the corpus at this point has no effect.

  Returns:
    dict mapping each evaluated test question id to its predicted duplicate id.
  """
  # get embeddings for every question (sentence vectors averaged per field)
  question_ids = list(post_reader.map_questions)
  for question_id in question_ids:
    question = post_reader.map_questions[question_id]
    title_embedding = _embed_text(model, question.title)
    body_embedding = _embed_text(model, question.body)
    embeddings[question_id] = np.concatenate((title_embedding, body_embedding))

  dictionary_result = {}
  total_p_1 = 0.0

  if len(question_ids) >= 2:
    # L2-normalise once so a dot product is the cosine similarity.
    matrix = np.vstack([embeddings[question_id] for question_id in question_ids])
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # all-zero vectors stay zero instead of dividing by 0
    unit_matrix = matrix / norms
    row_of = {question_id: row for row, question_id in enumerate(question_ids)}

    # finding Similar questions using fastText model
    for test_question_id in dic_similar_questions:
      expected_ids = dic_similar_questions[test_question_id]
      if test_question_id not in row_of:
        continue
      test_row = row_of[test_question_id]
      similarities = unit_matrix @ unit_matrix[test_row]

      for similar_question_id in expected_ids:
        if similar_question_id in row_of:
          similarity = float(similarities[row_of[similar_question_id]])
          print("Cosine Similarity between question", test_question_id, "and similar question", similar_question_id, ":", similarity)

      # A question must not be returned as its own duplicate.
      similarities[test_row] = -np.inf
      predicted_duplicate_id = question_ids[int(np.argmax(similarities))]
      if predicted_duplicate_id in expected_ids:
        total_p_1 += 1.0
      dictionary_result[test_question_id] = predicted_duplicate_id

  # calculate average P@1
  num_test_questions = len(dic_similar_questions)
  avg_p_1 = total_p_1 / num_test_questions if num_test_questions else 0.0
  print("Average P@1: {:.4f}".format(avg_p_1))
  return dictionary_result

dictionary_result = main()

Question 2: FFNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Define the neural network architecture
class FeedForwardNN(nn.Module):
    """Binary classifier with a single hidden layer.

    The network applies Linear -> ReLU -> Linear -> Sigmoid, so the final
    dimension of the output has size 1 and every value lies in (0, 1).
    """

    def __init__(self, input_size, hidden_size):
        """Create the layers.

        Args:
            input_size: number of features in each input vector.
            hidden_size: number of units in the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of input vectors to probabilities."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Define the dataset
class QuestionPairDataset(Dataset):
    """Dataset of question pairs with one label per pair."""

    def __init__(self, pairs, labels):
        """Store the pairs and their labels.

        Args:
            pairs: indexable collection; element ``i`` holds the two questions
                of pair ``i`` at positions 0 and 1.
            labels: indexable collection with the label of pair ``i`` at
                position ``i``.
        """
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with both questions and the label."""
        pair = self.pairs[idx]
        return {
            'question1': pair[0],
            'question2': pair[1],
            'label': self.labels[idx],
        }

In [None]:
# Split the data into training, validation, and test sets (80% / 10% / rest).
# NOTE(review): `pairs` and `labels` are not defined in the cells shown here;
# they are assumed to be equal-length, index-aligned sequences — confirm.
train_size = int(0.8 * len(pairs))
val_size = int(0.1 * len(pairs))
test_size = len(pairs) - train_size - val_size

train_pairs = pairs[:train_size]
train_labels = labels[:train_size]

val_pairs = pairs[train_size:train_size+val_size]
val_labels = labels[train_size:train_size+val_size]

test_pairs = pairs[train_size+val_size:]
test_labels = labels[train_size+val_size:]

# Convert data into PyTorch DataLoader.
# Only the training loader is shuffled; evaluation order does not need to vary.
train_dataset = QuestionPairDataset(train_pairs, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = QuestionPairDataset(val_pairs, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

test_dataset = QuestionPairDataset(test_pairs, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize the neural network
# NOTE(review): this rebinds `model`; if an earlier cell stored a different
# object under that name it is no longer reachable after this cell runs.
input_size = 768  # Embedding size of the input questions
hidden_size = 64
model = FeedForwardNN(input_size*2, hidden_size)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _forward_batch(batch):
    """Run one batch through `model`.

    Returns the predicted probabilities and the float labels, both shaped
    (batch_size,). `squeeze(1)` is used instead of `squeeze()` so a batch of
    size 1 keeps its batch dimension.
    """
    inputs = torch.cat((batch['question1'], batch['question2']), dim=1).float()
    batch_labels = batch['label'].float()
    outputs = model(inputs).squeeze(1)
    return outputs, batch_labels


# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs, batch_labels = _forward_batch(batch)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Accumulate loss (weighted by batch size; the last batch may be smaller)
        train_loss += loss.item() * len(batch_labels)

    # Compute validation loss for this epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            outputs, batch_labels = _forward_batch(batch)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item() * len(batch_labels)

    # Compute average loss for the epoch (guard against an empty split)
    train_loss /= max(len(train_dataset), 1)
    val_loss /= max(len(val_dataset), 1)

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

# Evaluate on the held-out test set
model.eval()
test_loss = 0.0
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        outputs, batch_labels = _forward_batch(batch)
        loss = criterion(outputs, batch_labels)

        # Accumulate loss, predictions and the labels in the same order, so the
        # accuracy below never depends on how the loader orders the samples.
        test_loss += loss.item() * len(batch_labels)
        predictions.extend(outputs.tolist())
        true_labels.extend(batch_labels.tolist())

test_loss /= max(len(test_dataset), 1)
predictions = [1 if p > 0.5 else 0 for p in predictions]
correct = sum(1 for p, l in zip(predictions, true_labels) if p == l)
accuracy = correct / max(len(true_labels), 1)

print(f"Test Loss: {test_loss:.4f} - Test Accuracy: {accuracy:.4f}")