In [None]:
!pip install -U gensim

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [None]:
import pandas as pd
import numpy as np
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk

import itertools
from gensim.similarities import MatrixSimilarity
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# Fetch the WordNet corpus; the WordNetLemmatizer used in the preprocessing
# cell below looks its lemmas up in this corpus.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Load the job-description dataset from the working directory. Per the rendered
# output the frame carries 'id', 'category' and 'jd' columns.
# NOTE(review): the extra 'Unnamed: 0' column looks like a row index written out
# by an earlier to_csv call — consider read_csv(..., index_col=0); confirm first.
data = pd.read_csv("jd.csv")
data

Unnamed: 0,id,category,jd
0,1,cloud engineer,Company is seeking a Data Architect. The indiv...
1,2,cloud engineer,"Cloud Architect Lowell, MA 6+ months Resear..."
2,3,cloud engineer,"Cloud Architect - AWSJob Type: Contract, Contr..."
3,4,cloud engineer,"Senior Software Architect (Perm)Charlestown, M..."
4,5,cloud engineer,"Our client, a well known and highly profitable..."
...,...,...,...
501,502,java developer,Synechron is a recognized leader and expert in...
502,503,java developer,Note: For further information please contact S...
503,504,java developer,"Synechron on behalf of our client, a global le..."
504,505,java developer,Please contactPatrick TrainorCanyon Associates...


In [None]:
def clean(text):
    """Normalise a raw job-description string before tokenisation.

    Steps, in order: drop URLs, drop standalone ``RT`` / ``cc`` tokens, drop
    hashtags and @-mentions, replace a fixed set of punctuation characters and
    all non-ASCII characters with spaces, then collapse whitespace runs.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. the float NaN pandas produces for
        an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty document is the
        # sensible stand-in for a missing description.
        return ''
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    # Word boundaries matter here: a bare 'RT|cc' also ate the "cc" inside
    # ordinary words ("account" -> "a ount", "success" -> "su ess").
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)  # remove standalone RT and cc
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'@\S+', '  ', text)  # remove mentions
    text = re.sub('[%s]' % re.escape(r"""!"$%&'()*,-/:;<=>?@[\]^_`{|}~"""), ' ', text)  # remove punctuations
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # replace non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    return text

In [None]:
# Build the 'cleaned_jd' column: regex cleaning -> stop-word removal ->
# tokenisation -> Porter stemming -> WordNet lemmatisation.

# Clean the raw text, then strip stop words
normalized_text = data.jd.apply(clean).apply(remove_stopwords)

# Tokenize each description (lower-cased, accents removed)
token_lists = [simple_preprocess(line, deacc=True) for line in normalized_text]

# Stem every token, then lemmatize the resulting stem
# NOTE(review): lemmatizing an already-stemmed token yields artefacts such as
# "plea" for "Please" (visible in the rendered frame) — confirm this is intended.
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
data['cleaned_jd'] = [
    [lemmatizer.lemmatize(porter_stemmer.stem(word)) for word in tokens]
    for tokens in token_lists
]

In [None]:
# Inspect the frame after preprocessing; the token lists are in 'cleaned_jd'.
data

Unnamed: 0,id,category,jd,cleaned_jd
0,1,cloud engineer,Company is seeking a Data Architect. The indiv...,"[compani, seek, data, architect, the, individu..."
1,2,cloud engineer,"Cloud Architect Lowell, MA 6+ months Resear...","[cloud, architect, lowel, ma, month, research,..."
2,3,cloud engineer,"Cloud Architect - AWSJob Type: Contract, Contr...","[cloud, architect, awsjob, type, contract, con..."
3,4,cloud engineer,"Senior Software Architect (Perm)Charlestown, M...","[senior, softwar, architect, perm, charlestown..."
4,5,cloud engineer,"Our client, a well known and highly profitable...","[our, client, known, highli, profit, softwar, ..."
...,...,...,...,...
501,502,java developer,Synechron is a recognized leader and expert in...,"[synechron, recogn, leader, expert, build, bus..."
502,503,java developer,Note: For further information please contact S...,"[note, for, inform, contact, sonal, hatila, so..."
503,504,java developer,"Synechron on behalf of our client, a global le...","[synechron, behalf, client, global, leader, di..."
504,505,java developer,Please contactPatrick TrainorCanyon Associates...,"[plea, contactpatrick, trainorcanyon, associ, ..."


In [None]:
# Doc2Vec hyper-parameter search space. Each list is one axis of the grid;
# in gensim, dm=1 selects PV-DM and dm=0 selects PV-DBOW.
dm = [0, 1]
vector_size = [200, 300, 400, 500, 600, 700]
window = [3, 5, 10]
epochs = [10, 20, 30]

# One dict per grid point (full Cartesian product, 'dm' varying slowest and
# 'epochs' fastest). Key order fixes the column order of the results table.
_grid_axes = ('dm', 'vector_size', 'window', 'epochs')
paramsList = [
    dict(zip(_grid_axes, combo))
    for combo in itertools.product(dm, vector_size, window, epochs)
]

In [None]:
def evaluation_doc2vec(params, processed_corpus, top=10, labels=None):
  """Grid-search Doc2Vec settings by how well nearest neighbours share a category.

  For every parameter set a Doc2Vec model is trained on ``processed_corpus``.
  Each document is then re-inferred, its ``top`` most similar training
  documents are retrieved, and every neighbour with the same label as the
  query adds ``top - rank`` to the score (rank 0 = most similar).

  Parameters
  ----------
  params : iterable of dict
      Each dict needs the keys 'dm', 'vector_size', 'window' and 'epochs'.
      The dicts are not modified.
  processed_corpus : sequence of list of str
      Tokenised documents; position ``i`` becomes the document tag ``i``.
  top : int, default 10
      Number of neighbours retrieved per document and the weight given to
      the closest one.
  labels : sequence, optional
      Category of each document, aligned with ``processed_corpus``. When
      omitted, falls back to the notebook-level ``data['category']`` column.

  Returns
  -------
  list of dict
      One copy of each successfully evaluated parameter dict with an added
      'score' key. Parameter sets whose training/evaluation raised are
      reported on stdout and skipped.

  Raises
  ------
  ValueError
      If ``labels`` and ``processed_corpus`` differ in length.
  """
  if labels is None:
    # Backward-compatible default: the original read this global directly.
    labels = data['category']
  labels = list(labels)
  corpus = list(processed_corpus)
  if len(labels) != len(corpus):
    raise ValueError(
        f'labels has {len(labels)} entries but processed_corpus has {len(corpus)}')

  tagged_corpus = [TaggedDocument(d, [i]) for i, d in enumerate(corpus)]
  scoreList = []
  for param in params:
    try:
      model = Doc2Vec(tagged_corpus,
                      dm=param['dm'],
                      vector_size=param['vector_size'],
                      window=param['window'],
                      min_count=1,
                      epochs=param['epochs'],
                      hs=1)
      # gensim 4 renamed `docvecs` to `dv`; keep the old name as a fallback.
      doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
      score = 0
      # NOTE(review): queries are the training documents themselves, so each
      # document can appear among its own neighbours — confirm this is intended.
      for i, doc in enumerate(corpus):
        inferred = model.infer_vector(doc)
        # Ask for exactly `top` neighbours so the `top` argument is honoured.
        sims = doc_vectors.most_similar(positive=[inferred], topn=top)
        for rank, (neighbour_tag, _similarity) in enumerate(sims):
          if labels[i] == labels[neighbour_tag]:
            score += top - rank
      # Return a copy so the caller's parameter grid stays untouched.
      result = {**param, 'score': score}
      print(result)
      scoreList.append(result)
    except Exception as error:
      # Deliberate best-effort: one failing configuration must not abort the sweep.
      print(f'Cannot evaluate model with parameters {param} because of error: {error}')
      continue
  return scoreList

In [None]:
# Evaluate every grid point, then rank the configurations best-first.
tuning_results = evaluation_doc2vec(paramsList, data['cleaned_jd'].values)
scoreList = pd.DataFrame(tuning_results).sort_values(by=['score'], ascending=False)
print(scoreList)



{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 10, 'score': 18617}
{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 20, 'score': 17522}
{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 30, 'score': 16796}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 10, 'score': 18575}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 20, 'score': 17574}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 30, 'score': 16836}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 10, 'score': 18580}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 20, 'score': 17522}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 30, 'score': 16718}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 10, 'score': 18533}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 20, 'score': 17716}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 30, 'score': 16908}
{'dm': 0, 'vector_size': 300, 'window': 5, 'epochs': 10, 'score': 18547}
{'dm': 0, 'vector_size': 300, 'window': 5, 'epoc

In [None]:
# Persist the ranked tuning table to the current working directory. With
# pandas' default settings the DataFrame index is written as the first column.
scoreList.to_csv('./jd_tuning.csv')