Example of training fastText model and getting sentence embeddings

In [9]:
from gensim.models import FastText
from scipy import spatial


def get_sentence_embedding(model, sentence):
  # This method takes in the trained model and the input sentence
  # and returns the embedding of the sentence as the average embedding
  # of its words
  words = sentence.split(" ")
  vector = model.wv[words[0]]
  for i in range(1, len(words)):
    vector += model.wv[words[i]]
  return vector/len(words)


sampleTexts = ["This is example1", "This is example two", "This is example three"]
# There are parameters here that you should define
model = FastText(vector_size = 100, window = 5, min_n=1)
model.build_vocab(sampleTexts)

# training the model
model.train(sampleTexts, total_examples = len(sampleTexts), epochs = 10)

# saving the model in-case you need to reuse it
model.save("fastText.model")

vec1 = get_sentence_embedding(model, sampleTexts[0])
vec2 = get_sentence_embedding(model, sampleTexts[1])
vec3 = get_sentence_embedding(model, sampleTexts[2])

# calculating cosine similarity
result = 1 - spatial.distance.cosine(vec1, vec2)
print(result)

result = 1 - spatial.distance.cosine(vec1, vec3)
print(result)



0.7853913903236389
0.8745558857917786


Reading Law Stack Exchange Data

In [15]:
import csv
from post_parser_record import PostParserRecord
from gensim.models import FastText
import nltk
nltk.download('punkt')
import re
import numpy as np

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test


def train_model(lst_sentences):
  #model = None
  model = FastText(
        sentences=lst_sentences,
        vector_size=100,    # size of the word vectors
        window=5,    # window size for the skip-gram model
        min_count=5, # minimum count of words to include in the vocabulary
        sg=1,        # use skip-gram model
        workers=4    # number of worker threads to use
    )
  model.build_vocab(corpus_iterable=lst_sentences)
  # train the model
  model.train(
      corpus_iterable=lst_sentences,
      total_examples=len(lst_sentences),
      epochs=10
  )

  return model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:

# separating out to save time
duplicate_file = "duplicate_questions.tsv"
post_file = "Posts_law.xml"
dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)
#print(dic_similar_questions)
post_reader = PostParserRecord(post_file)
lst_training_sentences = []
embeddings = {}
for question_id in post_reader.map_questions:
  if question_id in lst_all_test:
    continue
  question = post_reader.map_questions[question_id]
  title = question.title
  body = question.body
  # Collect sentences here
  processed_title = re.sub('<[^<]+?>', ' ', title)
  token_title = nltk.sent_tokenize(processed_title)
  processed_body = re.sub('<[^<]+?>', ' ', title)
  token_body = nltk.sent_tokenize(processed_body)

  lst_training_sentences.extend(token_title)
  lst_training_sentences.extend(token_body)

In [17]:
# train your model
model = train_model(lst_training_sentences)

# save model
model.save("fastText.model");



In [18]:
def main():
  # get embeddings for each sentence and average them
  title_embedding = np.zeros(100)
  body_embedding = np.zeros(100)
  for sentence in token_title:
      title_embedding += get_sentence_embedding(model, sentence)
  title_embedding /= len(token_title)
  for sentence in token_body:
      body_embedding += get_sentence_embedding(model, sentence)
  body_embedding /= len(token_body)

  embeddings[question_id] = np.concatenate((title_embedding, body_embedding))

  lst_answers = question.answers
  if lst_answers is not None:
    for answer in lst_answers:
      answer_body = answer.body
      # Collection sentences here
      answer_body = re.sub('<[^<]+?>', ' ', answer_body)
      # Tokenize the answer body
      answer_sents = nltk.word_tokenize(answer_body)
      # Add each sentence to the list
      lst_training_sentences.extend(answer_sents)
      
# use your model and calculate the cosine similarity between the questions
# save the question id with the highest cosine similarity
# finding Similar questions using fastText model
  for test_question_id in dic_similar_questions:
    test_question_embedding = embeddings[test_question_id]
    for similar_question_id in dic_similar_questions[test_question_id]:
      similar_question_embedding = embeddings[similar_question_id]
      similarity = 1 - spatial.distance.cosine(test_question_embedding, similar_question_embedding)
      print("Cosine Similarity between question", test_question_id, "and similar question", similar_question_id, ":", similarity)
          
  # finding Similar questions using fastText model
  total_p_1 = 0.0
  for test_question_id in dic_similar_questions:
    test_question = dic_similar_questions[test_question_id]['Question']
    expected_duplicate_id = dic_similar_questions[test_question_id]['DuplicateId']
    predicted_duplicate_id = model.wv.most_similar(test_question, topn=1)[0][0]
    if predicted_duplicate_id == expected_duplicate_id:
        total_p_1 += 1.0
    else:
        total_p_1 += 0.0
    dictionary_result[test_question_id] = predicted_duplicate_id

  # calculate average P@1
  num_test_questions = len(dic_similar_questions)
  avg_p_1 = total_p_1 / num_test_questions
  print("Average P@1: {:.4f}".format(avg_p_1))

main()

KeyError: ignored