In [1]:
import numpy as np
import pandas as pd
import re
import spacy
import pymysql

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
# MySQL database connection settings.
#
# Credentials are read from the environment instead of being written into the
# notebook, so the password is never stored in the saved file:
#   MYSQL_USER      - user name (defaults to 'root')
#   MYSQL_PASSWORD  - password (prompted for interactively when unset/empty)
#   MYSQL_DATABASE  - database name (defaults to 'hispeedhr')

import os
from getpass import getpass

from sqlalchemy import create_engine

# Connection parameters used to build the engine URL
user = os.environ.get('MYSQL_USER', 'root') # user name
pw = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ') # password
db = os.environ.get('MYSQL_DATABASE', 'hispeedhr') # database

In [3]:
# Create the SQLAlchemy engine for the MySQL database (pymysql driver).
# The user name and password are percent-encoded first so that characters such
# as '@', ':' or '/' inside them cannot be misread as URL delimiters.
from urllib.parse import quote_plus

engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(pw)}@localhost/{db}"
)

In [4]:
# SQL statements that select every column of the two source tables
job_description_sql, resume_sql = (
    'select * from job_description_table',
    'select * from resume_table',
)

In [5]:
# Read both tables into DataFrames through the SQLAlchemy engine
job_description_df = pd.read_sql_query(job_description_sql, con=engine)
resume_df = pd.read_sql_query(resume_sql, con=engine)

# Select only the required columns. The explicit .copy() makes the result an
# independent frame, so adding or overwriting columns on it later does not
# trigger pandas' SettingWithCopyWarning.
job_description_df = job_description_df[["Category", "Job_desc_raw"]].copy()

In [6]:
# Print a summary of resume_df as loaded: columns, non-null counts and dtypes
resume_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  33 non-null     object
 1   Resume    33 non-null     object
dtypes: object(2)
memory usage: 656.0+ bytes


In [7]:
# Print a summary of job_description_df after column selection:
# columns, non-null counts and dtypes
job_description_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Category      84 non-null     object
 1   Job_desc_raw  84 non-null     object
dtypes: object(2)
memory usage: 1.4+ KB


In [8]:
# Define functions for data processing

def cleanRawText(rawText):
    """Normalise a raw resume / job-description string.

    The input is converted with ``str()`` and then processed in this order:

    1. URLs (``http...`` up to the next whitespace) become a single space.
    2. The standalone tokens ``RT`` and ``cc`` become a single space.
    3. ``#tag`` tokens are removed; ``@mention`` tokens become spaces.
    4. ASCII punctuation and all non-ASCII characters become spaces.
    5. The heading "Job Description" is removed.
    6. Every run of whitespace is collapsed to one space.

    Args:
        rawText: Value to clean. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The cleaned text. A single leading or trailing space may remain;
        the result is not stripped.
    """
    rawText = str(rawText)
    rawText = re.sub(r'http\S+\s*', ' ', rawText)
    # Word boundaries are required here: the unanchored pattern 'RT|cc' also
    # matched inside ordinary words and turned e.g. "account" into "a ount".
    rawText = re.sub(r'\b(?:RT|cc)\b', ' ', rawText)
    rawText = re.sub(r'#\S+', '', rawText)
    rawText = re.sub(r'@\S+', '  ', rawText)
    rawText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', rawText)
    rawText = re.sub(r'[^\x00-\x7f]', ' ', rawText)
    # Remove the heading before the whitespace collapse so that no double
    # space is left behind where it used to be.
    rawText = re.sub(r'Job\s+Description', '', rawText)
    rawText = re.sub(r'\s+', ' ', rawText)
    return rawText

def remove_stop_words(text):
    """Return ``text`` without the tokens spaCy flags as stop words.

    The string is tokenised with the module-level ``nlp`` pipeline and the
    texts of the remaining tokens are joined with single spaces. Any input
    that is not a ``str`` produces an empty string.
    """
    if not isinstance(text, str):
        return ''
    kept_tokens = (tok.text for tok in nlp(text) if not tok.is_stop)
    return ' '.join(kept_tokens)

def extract_entities(text):
    """Return the distinct named-entity strings spaCy finds in ``text``.

    Args:
        text: String to analyse with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: Unique entity texts, in order of first appearance.
    """
    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping first-seen order. The former
    # list(set(...)) returned the entities in an order that can change from
    # one interpreter run to the next (string hash randomisation), which made
    # any regex alternation built from this list differ between runs.
    named_entities = list(dict.fromkeys(ent.text for ent in doc.ents))
    return named_entities

def remove_words(text, words):
    """Delete every whole-word occurrence of the given words from ``text``.

    Args:
        text: String to edit.
        words: Iterable of literal words or phrases. Each one is regex-escaped,
            so it is matched verbatim. Duplicates and empty strings are
            ignored.

    Returns:
        str: ``text`` with each match replaced by an empty string. Whitespace
        around a removed match is left in place. If there is nothing to
        remove, ``text`` is returned unchanged.
    """
    # Longest candidates first: regex alternation takes the first alternative
    # that matches, so "New" listed before "New York" would strip only "New"
    # and leave " York" behind. The secondary sort key keeps the pattern
    # deterministic for equal lengths.
    candidates = sorted({w for w in words if w}, key=lambda w: (-len(w), w))
    if not candidates:
        return text
    pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, candidates)))
    return re.sub(pattern, '', text)

def _clean_text_column(df, column):
    """Strip ``column`` and derive ``cleaned_text`` from it, on a copy of ``df``."""
    # Work on a copy so the caller's frame (which may itself be a slice of
    # another frame) is never modified in place.
    out = df.copy()
    out[column] = out[column].apply(lambda value: value.strip())
    out["cleaned_text"] = out[column].apply(cleanRawText)
    return out


def cleanResumeData(df):
    """Return a cleaned copy of the resume frame.

    Args:
        df: DataFrame with a string column ``Resume``.

    Returns:
        DataFrame: Copy of ``df`` whose ``Resume`` values are stripped of
        surrounding whitespace, plus a ``cleaned_text`` column holding
        ``cleanRawText`` applied to each stripped value. The input frame is
        left untouched.
    """
    return _clean_text_column(df, "Resume")


def cleanJDData(df):
    """Return a cleaned copy of the job-description frame.

    Args:
        df: DataFrame with a string column ``Job_desc_raw``.

    Returns:
        DataFrame: Copy of ``df`` whose ``Job_desc_raw`` values are stripped
        of surrounding whitespace, plus a ``cleaned_text`` column holding
        ``cleanRawText`` applied to each stripped value. The input frame is
        left untouched.
    """
    return _clean_text_column(df, "Job_desc_raw")

In [9]:
# Load the English language model in spaCy.
# `nlp` is the module-level pipeline object that remove_stop_words() and
# extract_entities() call, so this cell must run before they are used.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Clean the resume and job description data.
# Each helper strips its raw-text column and adds a `cleaned_text` column
# produced by cleanRawText(); the returned frame is bound back to the same name.
resume_df = cleanResumeData(resume_df)
job_description_df = cleanJDData(job_description_df)

In [11]:
# Summarise resume_df again to confirm the cleaned_text column was added
resume_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Category      33 non-null     object
 1   Resume        33 non-null     object
 2   cleaned_text  33 non-null     object
dtypes: object(3)
memory usage: 920.0+ bytes


In [12]:
# Summarise job_description_df again to confirm the cleaned_text column was added
job_description_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Category      84 non-null     object
 1   Job_desc_raw  84 non-null     object
 2   cleaned_text  84 non-null     object
dtypes: object(3)
memory usage: 2.1+ KB


In [13]:
# Derive the same three columns on the job-description frame and the resume frame
for frame in (job_description_df, resume_df):
    # Cleaned text with spaCy stop words dropped
    frame['cleaned_text_no_stopwords'] = frame['cleaned_text'].apply(remove_stop_words)

    # Distinct named entities found in the cleaned text
    frame['named_entities'] = frame['cleaned_text'].apply(extract_entities)

    # Cleaned text with that row's named entities removed
    frame['cleaned_text_no_ne'] = frame.apply(
        lambda row: remove_words(row['cleaned_text'], row['named_entities']),
        axis=1,
    )

# Stack resumes on top of job descriptions, keeping only the category and the
# stop-word-free text, under a fresh 0..n-1 index
category_text_combined = pd.concat(
    [
        resume_df[["Category", "cleaned_text_no_stopwords"]],
        job_description_df[["Category", "cleaned_text_no_stopwords"]],
    ],
    ignore_index=True,
)

In [14]:
# Summarise the combined frame: row count, columns, non-null counts and dtypes
category_text_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 2 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Category                   117 non-null    object
 1   cleaned_text_no_stopwords  117 non-null    object
dtypes: object(2)
memory usage: 2.0+ KB


In [15]:
# Tokenize and tag the text: one TaggedDocument per row, tagged with its row
# position. split() with no argument splits on any run of whitespace and never
# yields empty strings; split(' ') produced '' tokens wherever the text had a
# leading, trailing or doubled space, and with min_count=1 those empty tokens
# became vocabulary entries.
full_docs = [TaggedDocument(doc.split(), [i])
             for i, doc in enumerate(category_text_combined.cleaned_text_no_stopwords)]

# Instantiate the Doc2Vec model
# NOTE(review): training uses 8 worker threads; gensim documents that fully
# reproducible runs need a single worker - confirm whether that matters here.
model = Doc2Vec(vector_size=32, window=2, min_count=1, workers=8, epochs=40)

# Build the vocabulary and train the model
model.build_vocab(full_docs)
model.train(full_docs, total_examples=model.corpus_count, epochs=model.epochs)

# Save the trained model using pickle.
# NOTE: unpickling executes arbitrary code, so only load this file back from a
# trusted location.
with open('doc2vec_model.pkl', 'wb') as file:
    pickle.dump(model, file)