In [None]:
!pip install -U gensim



In [None]:
import pandas as pd
import numpy as np
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk

import itertools
from gensim.similarities import MatrixSimilarity
from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import preprocess_string

In [None]:
# Download the WordNet corpus needed by the WordNetLemmatizer used in the
# preprocessing cell below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the resume dataset. The relative path is resolved against the notebook's
# working directory. Later cells read the `category` and `resume` columns.
resumeDf = pd.read_csv("Resume.csv")
# Bare expression: renders the frame as the cell output.
resumeDf

Unnamed: 0,id,category,resume
0,1,software developer,SOFTWARE DEVELOPER INTERN ...
1,2,software developer,SOFTWARE DEVELOPER Professional...
2,3,software developer,GIS ANALYST/SOFTWARE DEVELOPER ...
3,4,software developer,SENIOR SOFTWARE DEVELOPER Caree...
4,5,software developer,SENIOR SOFTWARE DEVELOPER Summa...
...,...,...,...
436,545,java developer,GRADUATE RESEARCH ASSISTANT Pro...
437,546,java developer,GRADUATE RESEARCH ASSISTANT ...
438,547,java developer,STUDENT ASSISTANT Qualificati...
439,548,java developer,YW TEAM LEAD Summary ...


In [None]:
def clean(text):
    """Strip web/social-media noise and punctuation from a raw text string.

    Steps, in order: remove URLs, remove the stand-alone tokens ``RT`` and
    ``cc``, remove hashtags, remove @-mentions, replace the listed punctuation
    characters with spaces, replace non-ASCII characters with spaces, and
    collapse every run of whitespace into a single space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to at
        most one space, not stripped.
    """
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    # Word boundaries are required here: without them "cc" is cut out of
    # words such as "success"/"account" and "RT" out of "SUPPORT"/"REPORT".
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)  # remove RT and cc
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'@\S+', '  ', text)  # remove mentions
    text = re.sub('[%s]' % re.escape(r"""!"$%&'()*,-/:;<=>?@[\]^_`{|}~"""), ' ', text)  # remove punctuations
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # replace non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    return text

In [None]:
# Build the tokenised, normalised `cleaned_resume` column from the raw `resume`
# text. Every stage writes to its own intermediate variable, so re-running this
# cell always starts again from the untouched `resume` column.

# cleaning data
# Lower-case right after cleaning: the stop-word filter below compares words
# as-is, so capitalised stop words ("To", "The", ...) previously slipped through
# (the earlier output still contained the token "to"). The tokenizer lower-cases
# anyway, so this does not change any other token.
cleaned_text = resumeDf['resume'].apply(clean).str.lower()

# Removing the stop words
text_without_stopwords = cleaned_text.apply(remove_stopwords)

# Tokenize the text column
tokenized_resumes = [simple_preprocess(line, deacc=True) for line in text_without_stopwords]

# Get the stemmed tokens
porter_stemmer = PorterStemmer()
stemmed_resumes = [[porter_stemmer.stem(word) for word in tokens] for tokens in tokenized_resumes]

# Get the lemmatized tokens (applied on top of the stems, as before)
lemmatizer = WordNetLemmatizer()
resumeDf['cleaned_resume'] = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in stemmed_resumes]

In [None]:
# Display the frame again to inspect the newly added `cleaned_resume` column.
resumeDf

Unnamed: 0,id,category,resume,cleaned_resume
0,1,software developer,SOFTWARE DEVELOPER INTERN ...,"[softwar, develop, intern, summari, skill, pro..."
1,2,software developer,SOFTWARE DEVELOPER Professional...,"[softwar, develop, profession, summari, to, ob..."
2,3,software developer,GIS ANALYST/SOFTWARE DEVELOPER ...,"[gi, analyst, softwar, develop, summari, to, c..."
3,4,software developer,SENIOR SOFTWARE DEVELOPER Caree...,"[senior, softwar, develop, career, focu, to, w..."
4,5,software developer,SENIOR SOFTWARE DEVELOPER Summa...,"[senior, softwar, develop, summari, oracl, cer..."
...,...,...,...,...
436,545,java developer,GRADUATE RESEARCH ASSISTANT Pro...,"[graduat, research, assist, profession, profil..."
437,546,java developer,GRADUATE RESEARCH ASSISTANT ...,"[graduat, research, assist, summari, to, obtai..."
438,547,java developer,STUDENT ASSISTANT Qualificati...,"[student, assist, qualif, algorithm, java, sof..."
439,548,java developer,YW TEAM LEAD Summary ...,"[yw, team, lead, summari, motiv, team, lead, p..."


In [None]:
# Hyper-parameter search space for Doc2Vec: one candidate list per setting.
dm = [0, 1]
vector_size = [200, 300, 400, 500, 600, 700]
window = [3, 5, 10]
epochs = [10, 20, 30]

# Full cartesian grid, one dict per combination. The key order matches the
# order in which the candidate lists are passed to itertools.product.
grid_keys = ('dm', 'vector_size', 'window', 'epochs')
paramsList = [
    dict(zip(grid_keys, combination))
    for combination in itertools.product(dm, vector_size, window, epochs)
]

In [None]:
def evaluation_doc2vec(params, processed_corpus, top=10, categories=None):
  """Score Doc2Vec parameter sets by category agreement among nearest neighbours.

  For every parameter set a Doc2Vec model is trained on `processed_corpus`.
  Each document is then re-inferred with that model and its `top` most similar
  training documents are looked up. A neighbour at 0-based rank `r` whose
  category equals the query document's category adds `top - r` to the score.

  Args:
    params: iterable of dicts holding the keys 'dm', 'vector_size', 'window'
      and 'epochs'. The dicts are copied, not modified.
    processed_corpus: sequence of token lists, one per document.
    top: number of nearest neighbours inspected per document; also the weight
      of the closest neighbour.
    categories: category labels aligned position-by-position with
      `processed_corpus`. Defaults to the notebook-level
      `resumeDf['category']`.

  Returns:
    A list with one dict per successfully evaluated parameter set: a copy of
    the input dict plus a 'score' key. Parameter sets whose evaluation raised
    are reported on stdout and left out.

  Raises:
    ValueError: if `categories` and `processed_corpus` differ in length.
  """
  documents = list(processed_corpus)
  if categories is None:
    categories = resumeDf['category']
  # Materialise as a plain list so neighbour tags index by position.
  labels = list(categories)
  if len(labels) != len(documents):
    raise ValueError(
        f'categories has {len(labels)} entries but processed_corpus has {len(documents)}')

  # Tag every document with its position so a neighbour tag maps back to `labels`.
  tagged_corpus = [TaggedDocument(d, [i]) for i, d in enumerate(documents)]
  scoreList = []
  for param in params:
    # Work on a copy so the caller's parameter dicts are not mutated.
    result = dict(param)
    result['score'] = 0
    try:
      model = Doc2Vec(tagged_corpus,
                      dm=param['dm'],
                      vector_size=param['vector_size'],
                      window=param['window'],
                      min_count=1,
                      epochs=param['epochs'],
                      hs=1)
      # gensim >= 4.0 names the document vectors `dv`; `docvecs` is the older name.
      doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
      # NOTE(review): every query document is also part of the training corpus,
      # so it will presumably often retrieve itself as a neighbour - confirm
      # whether that self-match is meant to count towards the score.
      for doc_index, tokens in enumerate(documents):
        inferred_vector = model.infer_vector(tokens)
        neighbours = doc_vectors.most_similar(positive=[inferred_vector], topn=top)
        for rank, (neighbour_tag, _similarity) in enumerate(neighbours[:top]):
          if labels[doc_index] == labels[neighbour_tag]:
            result['score'] += top - rank
      print(result)
      scoreList.append(result)
    except Exception as error:
      # Best effort: report the failing configuration and move on to the next.
      print(f'Cannot evaluate model with parameters {result} because of error: {error}')
      continue
  return scoreList

In [None]:
# Evaluate every configuration in the grid, then rank the results best-first.
raw_scores = evaluation_doc2vec(paramsList, resumeDf['cleaned_resume'].values)
scoreList = pd.DataFrame(raw_scores).sort_values(by=['score'], ascending=False)
print(scoreList)



{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 10, 'score': 13687}
{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 20, 'score': 12864}
{'dm': 0, 'vector_size': 200, 'window': 3, 'epochs': 30, 'score': 12528}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 10, 'score': 13625}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 20, 'score': 12870}
{'dm': 0, 'vector_size': 200, 'window': 5, 'epochs': 30, 'score': 12545}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 10, 'score': 13540}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 20, 'score': 12851}
{'dm': 0, 'vector_size': 200, 'window': 10, 'epochs': 30, 'score': 12662}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 10, 'score': 13647}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 20, 'score': 12707}
{'dm': 0, 'vector_size': 300, 'window': 3, 'epochs': 30, 'score': 12743}
{'dm': 0, 'vector_size': 300, 'window': 5, 'epochs': 10, 'score': 13592}
{'dm': 0, 'vector_size': 300, 'window': 5, 'epoc

In [None]:
# Persist the ranked tuning results next to the notebook. With to_csv's default
# settings the DataFrame index is written as the first, unnamed CSV column.
scoreList.to_csv('./resume_tuning.csv')