In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [4]:
module_url='../tfhub_modules/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47'

In [5]:
#Flatten lists of list to a single list
flatten = lambda l: [item for sublist in l for item in sublist]

In [6]:
#Sentence Embedding Module
embed=hub.Module(module_url)

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
#Sentence Embedding
def get_features(messages):
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(messages))
        print(message_embeddings.shape)
        return message_embeddings

#Similarity b/w sentence Vectors
def cosine_similarity(v1, v2):
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if (not mag1) or (not mag2):
        return 0
    return np.dot(v1, v2) / (mag1 * mag2)

#Semanticity Matching   
def semantic_search(text, data, vectors):
    print("Extracting features...")
    query_vec = get_features(text)
    print(query_vec.shape)
    res = []
    for i, d in enumerate(data):
        qvec = vectors[i].ravel()
        sim = cosine_similarity(query_vec, qvec)
        res.append((sim, d, i))
    return sorted(res, key=lambda x : x[0], reverse=True)

base_dir=u'/Users/kumarshubham/Desktop/1234'
print(os.listdir(base_dir))

skills=pd.read_csv(os.path.join(base_dir,u'resume_skills.csv'),header=None)[4].values
skills=[x.replace(',','\n').replace('\xe2\x80\xa2','').replace('\t','').replace('\xef\x83\x98','').replace('.','\n').replace('\r','\n').split('\n') for x in skills]
skills=np.unique(flatten(skills))[2:]
SkillVectors=get_features(skills)

for i,j in enumerate(skills):
    print(i,j)

In [1]:
messages=['C++','python','dance','singing','java']

In [10]:
MessageEmbedding=get_features(messages)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(5, 512)


In [13]:
cosine_similarity(MessageEmbedding[2],MessageEmbedding[1])

0.199968

In [14]:
semantic_search(["Good in programming"],messages,MessageEmbedding)

Extracting features...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(1, 512)
(1, 512)


[(array([0.665367], dtype=float32), 'C++', 0),
 (array([0.5668749], dtype=float32), 'python', 1),
 (array([0.5355615], dtype=float32), 'java', 4),
 (array([0.170664], dtype=float32), 'singing', 3),
 (array([0.16922483], dtype=float32), 'dance', 2)]

In [36]:
semantic_search(["personality"],messages,MessageEmbedding)

Extracting features...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(1, 512)
(1, 512)


[(array([0.2978276], dtype=float32), 'dance', 2),
 (array([0.28717256], dtype=float32), 'singing', 3),
 (array([0.24003111], dtype=float32), 'C++', 0),
 (array([0.20734593], dtype=float32), 'python', 1)]

In [37]:
semantic_search(["script"],messages,MessageEmbedding)

Extracting features...
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(1, 512)
(1, 512)


[(array([0.47211316], dtype=float32), 'python', 1),
 (array([0.4638889], dtype=float32), 'C++', 0),
 (array([0.28613144], dtype=float32), 'singing', 3),
 (array([0.26495633], dtype=float32), 'dance', 2)]

In [None]:
skills[277]

In [16]:
from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=2,random_state=0).fit(MessageEmbedding)
Clusters=kmeans.labels_
ClusterCenters=kmeans.cluster_centers_

In [25]:
def Predict_kmeans(texts,kmeans):
    embeddings=get_features(texts)
    return kmeans.predict(embeddings)

In [17]:
Clusters

array([0, 0, 1, 1], dtype=int32)

In [26]:
Predict_kmeans(['java'],kmeans)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
(1, 512)


array([0], dtype=int32)