In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Mount Google Drive at /content/drive; the data-loading cell reads its
# CSV files from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Loading

In [11]:
# Folder on the mounted Drive that holds both input CSVs; change this single
# value to point the notebook at a different copy of the data.
DATA_DIR = '/content/drive/MyDrive/nearest-phrases'

# Keywords to be matched; the `Name` column holds the keyword text.
new_keywords = pd.read_csv(f'{DATA_DIR}/new-keywords (1).csv')
# Approved topics; read with header=None, so the topic text lives in column 0.
app_topics = pd.read_csv(f'{DATA_DIR}/list-of-approved-topics (1).csv', header=None)

In [5]:
# Preview the first rows of the keywords table.
new_keywords.head()

Unnamed: 0,Name
0,storytelling interview questions
1,sales certifications
2,consultant career
3,investigator cover letter
4,hard lines vs. soft lines


In [12]:
# Preview the first rows of the approved-topics table.
app_topics.head()

Unnamed: 0,0
0,how to write a vision statement
1,performance evaluation comments
2,team leader qualities
3,creative jobs
4,employee performance review template


# Data Cleaning

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
# Fetch the NLTK resources the cleaning helpers rely on: WordNet for the
# lemmatizer, the stop-word lists, and the Punkt tokenizer models.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
# Keep the stop words in a set: remove_stop_words tests membership once per
# token, and a set answers each test in constant time instead of scanning
# the whole list.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [18]:

def initial_clean(text):
  '''
  Lowercase a string and split it into tokens.

  text: the raw string to clean.
  Returns whatever nltk.word_tokenize produces for the lowercased string.
  '''
  return nltk.word_tokenize(text.lower())


def remove_stop_words(text):
  '''
  Drop every token found in the module-level `stop_words` collection.

  text: iterable of tokens.
  Returns a new list with the remaining tokens in their original order.
  '''
  kept = []
  for token in text:
    if token in stop_words:
      continue
    kept.append(token)
  return kept


  
def stem_words(text):
  '''
  Lemmatize each token, then discard results of one character or fewer.

  text: list of tokens.
  Returns the filtered list of lemmas. If lemmatization raises IndexError,
  the input is returned unchanged.
  '''
  try:
      lemmas = list(map(lemmatizer.lemmatize, text))
      kept = [lemma for lemma in lemmas if len(lemma) > 1]
  except IndexError:
      # Best-effort fallback: hand the tokens back untouched.
      return text
  return kept


def apply_all(text):
  '''
  Run the full cleaning pipeline on one string: lowercase and tokenize,
  remove stop words, then lemmatize.

  Returns the list produced by stem_words.
  '''
  tokens = initial_clean(text)
  without_stops = remove_stop_words(tokens)
  return stem_words(without_stops)


In [21]:
# Clean every new keyword and keep the token lists in a one-column frame.
df = new_keywords['Name'].apply(apply_all).to_frame(name='tokenized')
# Clean every approved topic the same way; this one stays a Series.
d = app_topics[0].apply(apply_all)


In [22]:
# Stack the tokenized keywords on top of the tokenized topics to form one corpus frame
new_df = d.to_frame(name='tokenized')
com = pd.concat([df, new_df], ignore_index=True)

Unnamed: 0,tokenized
0,"[storytelling, interview, question]"
1,"[sale, certification]"
2,"[consultant, career]"
3,"[investigator, cover, letter]"
4,"[hard, line, vs., soft, line]"
...,...
222,"[become, process, server]"
223,"[become, video, editor]"
224,"[become, immigration, lawyer]"
225,"[become, medical, esthetician]"


# Model Training

## Universal Sentence Encoder

The Universal Sentence Encoder is a sentence-embedding technique proposed by Google.
The sentence embeddings it generates can be used for multiple tasks such as sentiment analysis, text classification, and sentence similarity.

The encoder comes in two model variants, and either one can be used:
* Transformer
* Deep Averaging Network (DAN)

## basic flow:

1. Tokenize the sentences after converting them to lowercase.
2. Depending on the type of encoder, each sentence is converted to a 512-dimensional vector:
   * The Transformer variant is similar to the encoder module of the Transformer architecture and uses the self-attention mechanism.
   * The DAN variant first computes unigram and bigram embeddings and averages them into a single embedding, which is then passed through a deep neural network to produce the final 512-dimensional sentence embedding.
3. These sentence embeddings are then used to train the encoder on various unsupervised and supervised tasks such as Skip-Thought and NLI. The trained model is then reused to generate a 512-dimensional embedding for any new sentence.

In [None]:
!pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
!pip3 install tensorflow-hub

The pre-trained model is available via TensorFlow Hub (TF Hub).

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# TF Hub handle of the encoder to load (Universal Sentence Encoder, version 4 per the URL).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
# Load the module; `model` is what later cells call to embed text.
model = hub.load(module_url)
print ("module %s loaded" % module_url)

We will generate embeddings for our sentence list as well as for each query.

In [27]:
#make sentence corpus
# apply_all returns a list of tokens per topic, but the encoder is fed plain
# strings and later cells join the stored sentences as strings. Glue each
# topic's cleaned tokens back into one space-separated sentence.
s1 = app_topics[0].apply(apply_all).apply(' '.join).tolist()

In [None]:
# Embed the whole topic corpus `s1` with a single call to the loaded encoder.
# NOTE(review): confirm every element of `s1` is a plain string before this call.
# NOTE(review): no later cell in this notebook reads `sentence_embeddings`.
sentence_embeddings = model(s1)


In [None]:
import tqdm
from tqdm.notebook import tqdm

Find the most similar approved topics for each keyword using cosine similarity.

In [None]:
# Number of approved topics to keep for each keyword.
TOP_K = 5

# Embed every approved topic exactly once, up front, instead of re-running the
# encoder on each topic for each keyword. The raw topic text is embedded (the
# same treatment the keywords get below) so both sides are encoded consistently.
topic_texts = app_topics[0].astype(str).tolist()
topic_vecs = np.asarray(model(topic_texts))
# L2-normalise the rows so that a plain dot product equals cosine similarity.
topic_vecs = topic_vecs / np.linalg.norm(topic_vecs, axis=1, keepdims=True)

# Until here `d` named the Series of tokenized topics; rebind it to a fresh
# dict so the results are keyed by keyword only and repeated keywords can be
# skipped with a real membership test.
d = {}
for query in tqdm(new_keywords['Name'].values):
  if query in d:
    # Same keyword seen earlier; its matches are already stored.
    continue
  query_vec = np.asarray(model([query]))[0]
  sims = topic_vecs @ (query_vec / np.linalg.norm(query_vec))
  # argsort is ascending, so the last TOP_K positions are the closest topics,
  # ordered from least to most similar.
  best = np.argsort(sims)[-TOP_K:]
  # Store (topic text, similarity) pairs; the next cell reads element 0 of each.
  d[query] = [(topic_texts[i], float(sims[i])) for i in best]


# Create a DataFrame of similar topics

In [None]:
# One row per (key, value) pair of `d`.
rows = d.items()
final_df = pd.DataFrame(rows)
final_df.columns = ['keyword', 'approved_topics']
# Collapse each list of matches into one string: take the first element of
# every entry and join them with "| ".
final_df['approved_topics'] = final_df['approved_topics'].map(
    lambda matches: "| ".join([match[0] for match in matches])
)

In [None]:
# Write the keyword / approved-topics table to approved_topics.csv (relative
# path), leaving out the index column.
final_df.to_csv('approved_topics.csv', index=False)