The following two cells are required to download and load the model in to the system.

In [33]:
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [35]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-07-08 19:44:48--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.88.134
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.88.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-07-08 19:45:14 (60.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
# this function works without ML model
def getWordsAssiciatedWithUser(userName, returnTopThree = False):
  """
  Input:
    - UserName or the userID in the data base
    - returnTopthree (true or false)
  Output:
    - if only one word is required, return the most frequent word associated with a user
    - if top three words are required, return the top three words associated with a profile
  functionality:
    - Access the database to get the user details
    - count the words associated with a user profile 
    - return the most frequent word associated with a user profile
  """
  #imports
  from collections import Counter


  #Currently using a dummy dataset
  userDictionary  = {"Alex":'hard working,dedicated,responsible,dedicated','Tim':'hard working,dedicated,responsible,dedicated'}
  

  ''' The database 
        - we can use this as the data storage format in the database 
        - we can have the userID associated with the words
        - while entering the data, we can ensure this format follows
        - whenver the new data is added, we can append the new value with a comma (,) to the existing values
  '''
  #To Add: The database connection 

  #To Add: The data retrieval logic for a specific user

  #Optional: We can use the NLTK to remove the stopwords, but since, we get only single words, it may not be required

  #Once we access the data
  currUserListOfWords = Counter(list(userDictionary[userName].split(",")))

  # checking if more than one words for a profile are required

  if returnTopThree:
    #gets the top three most frequent words associated with a profile
    wordsFrequency = list(currUserListOfWords.items())
    wordsFrequency.sort(key=lambda x: x[1])
    topThreeWords = ' '.join([i[0] for i in wordsFrequency[:3]])
    return topThreeWords
  else:
    #returns the top most word associated with a profile
    return max(currUserListOfWords)



In [None]:
getWordsAssiciatedWithUser('Alex')

'responsible'

The following cells describe various functions used to get the word asssociated with a profile, using a trained word2vec model

In [4]:
def getCleanWord(word,model):
  """
  Input:
    - word for which the embeddings are required
  Output:
    - punctuation removed word if it is present in model vocab; else None
  functionality:
    - removes any stop words
    - spelling correction 
    - if the word is present in model vocabolary, return it
  """
  import re
  word = ''.join(list(re.sub(r'[^\w\s]','',word).split(" ")))
  if word in model.vocab:
    return word
  else:
    #spelling correction can be implemented here
    return None


In [None]:
def loadModel(modelBinaryPath = 'GoogleNews-vectors-negative300.bin.gz'):
  """
  Input:
   - file path of the model
  Output:
   - Model loaded from the file path
  """
  import gensim
  try:
    model = gensim.models.KeyedVectors.load_word2vec_format(modelBinaryPath, binary = True)
    return model
  except:
    return None

In [None]:
def getSimilarityScore(word1, word2, model):
  """
  Input:
    - both the words between whom the similarity is to be checked.
    - WOrd2vec model to obtain the embedding of both the words.
  Output:
    - similarity score between both the words.
  """
  if not word1 in model.vocab:
    word1 = getCleanWord(word1,model)
  if not word2 in model.vocab:
    word2 = getCleanWord(word2,model)

  if word1 == None or word2 == None:
    return 0
  else:
    return model.similarity(word1,word2)


In [None]:
def getWordsAsscoiatedWithTheUser(User,model):
  """
  Input:
    - UserName or the userID in the data base,
    - Word2vec model 
  Output:
    - if only one word is required, return the most frequent word associated with a user
  functionality:
    - Access the database to get the user details
    - count the words associated with a user profile 
    - return the most frequent word associated with a user profile based on word similarity
  """
  from collections import Counter
  #Currently using a dummy dataset
  userDictionary  = {"Alex":'hard working,dedicated,responsible,dedicated','Tim':'hard working,dedicated,responsible,dedicated'}

  #To Add: The database connection 

  #To Add: The data retrieval logic for a specific user

  #Once we access the data
  model = loadModel()
  currUserListOfWords = list(userDictionary[User].split(","))
  wordfrequencies = Counter(currUserListOfWords)
  currUserListOfWords = list(set(currUserListOfWords))
  succesiveUserlistOfWords = iter(currUserListOfWords)
  next(succesiveUserlistOfWords)
  simScoreList = []
  for i in range(len(currUserListOfWords)):
    for j in range(len(succesiveUserlistOfWords)):
      
      simScore = getSimilarityScore(currUserListOfWords[i],succesiveUserlistOfWords[j],model)
      if simScore > 0.4:
        wordfrequencies[currUserListOfWords[i]] += 1

  return max(wordfrequencies)
