In [59]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found (recursively) under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [60]:
# Load the candidate CSV from the Kaggle input directory into the DataFrame used by every later cell.
data = pd.read_csv('../input/potential-talents/potential-talents - Aspiring human resources - seeking human resources.csv')

In [61]:
# Store the same search phrase on every row; later cells compare each job title against it.
data['input'] = "Aspiring human resources seeking human resources"

In [62]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re 
import string 
def removestopwords(text):
    """Strip punctuation from ``text`` and return its non-stop-word tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, the result is split on whitespace, and tokens that
    appear in NLTK's English stop-word list are dropped.  Tokens are compared
    as-is, so the membership check is case-sensitive.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    english_stop_words = set(stopwords.words('english'))
    without_punctuation = re.sub(r'[^\w\s]', " ", text)
    kept_tokens = []
    for token in without_punctuation.split():
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def lemmatization(text):
    """Return ``text`` with stop words removed and the remaining tokens lemmatized.

    The tokens come from ``removestopwords``; each one is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments and the results
    are joined back into a single space-separated string.
    """
    tokens = removestopwords(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = map(wordnet_lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

In [63]:
import nltk
# Fetch every NLTK resource this notebook relies on, not only `omw-1.4`:
#   stopwords -> stopwords.words('english') in removestopwords
#   wordnet   -> WordNetLemmatizer in lemmatization
#   punkt     -> nltk.word_tokenize when building the word2vec corpus
# Resources that are already installed are reported as up to date rather than fetched again.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(nltk_resource)

In [64]:
# Replace each raw job title with its stop-word-free, lemmatized form, then display the column.
cleaned_titles = data['job_title'].apply(lemmatization)
data['job_title'] = cleaned_titles
data['job_title']

# **word2vec**

In [65]:
import nltk
import gensim
from gensim.models import Word2Vec
# Tokenize each cleaned job title and train a small Word2Vec model on the result.
tokens = data['job_title'].apply(nltk.word_tokenize)
w2v_model = Word2Vec(tokens,
                     min_count=1,       # keep every word, even those seen only once
                     window=10,
                     vector_size=250,   # must equal `num_features` used when averaging vectors
                     alpha=0.03,
                     min_alpha=0.0007,
                     # A fixed seed only yields repeatable vectors with a single worker
                     # thread; with several workers the OS thread scheduling changes the
                     # order of updates from run to run.
                     workers=1,
                     seed=42)

In [66]:
from scipy import spatial
    
# Vocabulary of the trained Word2Vec model as a set, for fast membership tests.
# NOTE(review): the same name is reassigned to the Keras tokenizer vocabulary further down,
# so re-running the word2vec cells after that point would use a different vocabulary.
indextokey_set = set(w2v_model.wv.index_to_key)
def avg_feature_vector(sentence, w2v_model, num_features, index2word_set):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into words.
    w2v_model
        Object exposing ``w2v_model.wv[word]``, which must return a 1-D vector
        of length ``num_features``.
    num_features : int
        Dimensionality of the word vectors.
    index2word_set : collection of str
        Words that have a vector; every other word is ignored.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``.  It is all zeros when no word of
        the sentence is in ``index2word_set``.
    """
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in sentence.split():
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, w2v_model.wv[word])
    # Divide exactly once, after all vectors are summed.  Dividing inside the
    # loop would rescale the running sum on every iteration and the result
    # would no longer be the arithmetic mean.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [67]:
# Mean word2vec embedding of every cleaned job title, in row order.
score1 = [
    avg_feature_vector(title, w2v_model, num_features=250, index2word_set=indextokey_set)
    for title in data['job_title']
]

In [68]:
# Mean word2vec embedding of the search phrase, one entry per row of `data`.
# Each entry is stored as a plain 1-D vector (not wrapped in a list) so it has the
# same shape as the entries of `score1`; scipy's cosine distance expects 1-D inputs.
score2 = []
for sentence in data['input']:
    s2_afv = avg_feature_vector(sentence, w2v_model, num_features=250, index2word_set=indextokey_set)
    score2.append(s2_afv)

In [69]:
# Cosine similarity between each job-title vector and the search-phrase vector (word2vec embeddings).
sim_word2vec = [
    1 - spatial.distance.cosine(title_vec, query_vec)
    for title_vec, query_vec in zip(score1, score2)
]
print(sim_word2vec)

# **TF-IDF**

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity of ``text1`` and ``text2``.

    The shared module-level ``vectorizer`` is re-fitted on just these two
    texts on every call.  TfidfVectorizer L2-normalises each row by default,
    so the dot product of the two rows is their cosine similarity.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is the explicit matrix product and `.toarray()` the documented way to
    # densify a SciPy sparse result (the `.A` shorthand is deprecated).
    return (tfidf @ tfidf.T).toarray()[0, 1]

In [71]:
# Cosine similarity between each job title and the search phrase (TF-IDF vectors).
sim_tfidf = [
    cosine_sim(title, query)
    for title, query in zip(data['job_title'], data['input'])
]
print(sim_tfidf)

# **GloVe**

In [72]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

In [73]:
# Location of the pre-trained GloVe vectors (the file name indicates 300-dimensional vectors)
# and the matching vector length used to size the embedding matrix.
GLOVE_EMB = '../input/glove6b300dtxt/glove.6B.300d.txt'
EMBEDDING_DIM = 300

In [74]:
from keras.preprocessing.text import Tokenizer

# Build a word -> integer index vocabulary from the cleaned job titles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['job_title'])

word_index = tokenizer.word_index
# One extra slot beyond the number of words — presumably because Keras word indices start at 1; confirm.
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)
# NOTE(review): Tokenizer() appears to lowercase text by default, while the later GloVe
# lookups use sentence.split() without lowercasing — confirm that the casing matches.
word_index.keys()

In [75]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Each line has the form "<word> <v1> <v2> ... <vN>", separated by whitespace.
embeddings_index = {}

# `with` guarantees the file is closed even if parsing fails.  The encoding is given
# explicitly because the platform default is not always UTF-8.
with open(GLOVE_EMB, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' %len(embeddings_index))

In [76]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index is i;
# rows for words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [77]:
# Set of words known to the Keras tokenizer, used for membership tests when averaging GloVe vectors.
# NOTE(review): this overwrites the `indextokey_set` built earlier from the word2vec vocabulary.
indextokey_set = set(word_index.keys())
print(indextokey_set)


In [78]:
def avg_feature_vector_glove(sentence, embedding_matrix, num_features, index2word_set, word_to_row=None):
    """Return the mean embedding-matrix row of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into words.
    embedding_matrix : array-like
        Indexable by integer row; each row is a vector of length ``num_features``.
    num_features : int
        Dimensionality of the word vectors.
    index2word_set : collection of str
        Words that have a row in ``embedding_matrix``; other words are ignored.
    word_to_row : mapping of str to int, optional
        Maps a word to its row in ``embedding_matrix``.  When omitted, the
        notebook-level ``word_index`` is used, which keeps existing calls working.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``.  It is all zeros when no word of
        the sentence is in ``index2word_set``.
    """
    if word_to_row is None:
        word_to_row = word_index
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in sentence.split():
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, embedding_matrix[word_to_row[word]])
    # Divide exactly once, after all vectors are summed.  Dividing inside the
    # loop would rescale the running sum on every iteration and the result
    # would no longer be the arithmetic mean.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [79]:
# Mean GloVe embedding of every cleaned job title, in row order.
score_1 = [
    avg_feature_vector_glove(title, embedding_matrix, num_features=300, index2word_set=indextokey_set)
    for title in data['job_title']
]
print(len(score_1))
    

In [80]:
# Mean GloVe embedding of the search phrase, one entry per row of `data`.
score_2 = [
    avg_feature_vector_glove(query, embedding_matrix, num_features=300, index2word_set=indextokey_set)
    for query in data['input']
]


In [81]:
# Cosine similarity between each job-title vector and the search-phrase vector (GloVe embeddings).
# The two GloVe lists are paired with each other directly, so the iteration no longer
# depends on the length of the unrelated word2vec list `score1`.
sim_glove = [
    1 - spatial.distance.cosine(title_vec, query_vec)
    for title_vec, query_vec in zip(score_1, score_2)
]
print(sim_glove)

# **Cosine Similarity**

In [82]:
# Final score per candidate: the plain average of the word2vec, TF-IDF and GloVe cosine similarities.
result_similarity = [
    (w2v_sim + tfidf_sim + glove_sim) / 3
    for w2v_sim, tfidf_sim, glove_sim in zip(sim_word2vec, sim_tfidf, sim_glove)
]
print(result_similarity)

In [83]:
# Attach the averaged similarity score to the DataFrame, one value per candidate row.
data['result_similarity'] = result_similarity

In [84]:
# Bucket the connection count into four ordinal levels (1-4); a higher level means more connections.
# The raw column is presumably text such as "500+", so keep only the part before the "+"
# and cast it to an integer.
connection_count = data['connection'].str.split('+').str[0].astype(int)
# No helper list named `list` is created here: that name would shadow the built-in
# `list` for every later cell.
# include_lowest keeps a count of exactly 0 in the first bucket instead of turning it
# into NaN, which would make the integer cast below fail.
data['connection'] = pd.cut(connection_count, [0, 40, 200, 499, 600],
                            labels=[1, 2, 3, 4], include_lowest=True)
data['connection'] = data['connection'].astype('int')

In [85]:
# One-hot encode the location column and replace the original column with the indicator columns.
dum_key = pd.get_dummies(data['location'])
# Use `columns=` rather than a positional axis: since pandas 2.0 every argument of
# DataFrame.drop other than `labels` is keyword-only, so drop('location', 1) raises TypeError.
data = data.drop(columns='location')
data = pd.concat([data,dum_key],axis=1)
data.head(5)

In [86]:
# data[data['result_similarity'].astype(float)>0.5] ='1'
# data[data['result_similarity'].astype(float)<0.5] ='0'

In [87]:

# Target: the averaged similarity score.
# Features: every remaining column except the identifier, the empty `fit` label, the text columns and the target itself.
y = data['result_similarity']
non_feature_columns = ['id','fit','job_title','input','result_similarity']
x = data.drop(columns=non_feature_columns)


In [93]:
# Display the target series (the averaged similarity score per candidate).
y

In [94]:
# Disabled code kept as a bare string literal, so nothing here executes:
# a train/test split that would hold out 70% of the rows for testing.
"""
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split( x, y,test_size=0.7, random_state=26)
"""

In [95]:
# Disabled code kept as a bare string literal, so nothing here executes:
# a 10-tree random forest fitted on the split from the previous (also disabled) cell.
# NOTE(review): `y` is a continuous similarity score at this point; a classifier would
# presumably need discrete labels first — confirm before re-enabling.
"""
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(x_test)
"""

In [98]:
import random
def Starcandidate(n,similarity_score):
    """Randomly star ``n`` candidates and label the remaining ones by similarity.

    The real starring criterion is unknown, so ``n`` candidates are picked at
    random to stand in for a reviewer's stars.  Starred candidates count as
    label 1 by definition and are left out of the result.

    Parameters
    ----------
    n : int
        Number of candidates to star.
    similarity_score : object with ``.tolist()``
        Similarity score per candidate (e.g. a pandas Series or NumPy array).

    Returns
    -------
    list of int
        One label per non-starred candidate, in the original order: 1 when the
        score is >= 0.5, otherwise 0 (a NaN score is labelled 0).  The list
        always has ``len(similarity_score) - n`` entries.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``n`` is negative or larger than the
        number of candidates.
    """
    candidate_list = similarity_score.tolist()
    # Sample positions rather than score values.  Removing starred candidates by
    # value would also drop every other candidate that happens to have the same
    # score as a starred one.
    starred_positions = set(random.sample(range(len(candidate_list)), n))
    return [
        1 if score >= 0.5 else 0
        for position, score in enumerate(candidate_list)
        if position not in starred_positions
    ]
             
        
        
    
        

In [100]:
# Star 5 randomly chosen candidates and display the 0/1 labels of the remaining ones.
Starcandidate(5,y)