In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import sent_tokenize
import warnings

warnings.filterwarnings(action = 'ignore')

import gensim
from gensim.models import Word2Vec

In [3]:
import threading
from functools import cmp_to_key

# Word2Vec Algorithm
1. Combine all the interests of user A and user B into a large sentence.  
2. Run the Word2Vec model on the data set using the tokenized sentences.  
3. Then compare each sentence between the users, and only use the largest cosine similarity between any two sentenes.
4. Receive the mean similarity between the two users.
5. How large the mean similarity is determines whether user A and user B have similar interests.

In [4]:
def similarity(user_A, user_B):
  """
  Returns the cosine similarity between two users' interests.
  """

  """
  skipgram model because it's better at predicting context and better for smaller data sets
  https://stackoverflow.com/questions/38287772/cbow-v-s-skip-gram-why-invert-context-and-target-words
  """
  data = [user_A, user_B]
  model = gensim.models.Word2Vec(data, min_count = 1, size = 100, window = 5, sg = 1)

  total_similarity = 0

  for sentence_A in user_A:
    largest_similarity = -1

    # use the sentence pair with the largest similarity
    for sentence_B in user_B:
      similarity = model.similarity(sentence_A, sentence_B)
      if similarity > largest_similarity:
        largest_similarity = similarity

    total_similarity += largest_similarity

    # return the mean similarity between the two users
    return total_similarity / len(user_A)

# Multi-Threaded Process
We need to multi-thread the process of comparing user A with other users.

Then rank the users from highest similar (at start of list) to lowest similar (at end of list).

In [5]:
def similarity_matrix(user_A, *compare_users):
  """
  Multi-threaded process that returns the similarity of users with user A.
  """

  similarity_matrix = [0] * len(compare_users)

  def set_similarity(user_A, user_B, similarity_matrix, index):
    try:
      similarity_matrix[index] = { "index": index, "similarity": similarity(user_A, user_B) }
    except Exception as e:
      print(e)

  threads = []
  for index, compare_user in enumerate(compare_users):
    user_thread = threading.Thread(target=set_similarity, args=[user_A, compare_user, similarity_matrix, index])
    threads.append(user_thread)

  # start each thread
  for thread in threads:
    thread.start()

  # wait for each thread to finish
  for thread in threads:
    thread.join()

  def cmp_similarity(a, b):
    return 1 if a["similarity"] < b["similarity"] else -1

  return sorted(similarity_matrix, key=cmp_to_key(cmp_similarity))
  

In [6]:
user_A = ["I like to code.", "I like to hike.", "I like to cook."]
user_B = ["I like to program.", "I like to bike.", "I like to eat."]
user_C = ["Running", "Talking with people", "Movies"]
user_D = ["Sports", "Cooking", "Video Games"]

In [7]:
similarity_matrix(user_A, user_B, user_C, user_D)

[{'index': 2, 'similarity': 0.08555739124615987},
 {'index': 1, 'similarity': 0.03714261204004288},
 {'index': 0, 'similarity': 0.022247376541296642}]