In [1]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re
from bs4 import BeautifulSoup
import contractions

[nltk_data] Downloading package wordnet to /home/sujay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data

In [2]:
data = {"resume": []}
df = pd.DataFrame(data)

In [6]:
# Iterate through a folder of text files and extract the text
import os
import codecs

# Define the directory
dir_path = "./resumes_corpus"

# Get a list of all files in the directory
files = os.listdir(dir_path)

# Iterate over each file
for file in files:
    if file.endswith('.txt'):
        # Construct full file path
        file_path = os.path.join(dir_path, file)
        # Open the file
        with codecs.open(file_path, "r", encoding="utf8", errors='ignore') as f:
            # Read the file's contents
            content = f.read()

            new_row_df = pd.DataFrame(
                {"resume": [content]}
            )

            # Add the new row to the DataFrame
            df = pd.concat([df, new_row_df], ignore_index=True)

In [8]:
def cleaning(row):
    resume = row["resume"]
    # Lower case
    resume = resume.lower()
    # Remove html tags
    resume = re.sub(r"<.*?>", "", resume)
    # Remove urls
    resume = re.sub(r"http[s]?://\S+", "", resume)
    # Remove non alphanumeric characters
    resume = re.sub(r"[^a-zA-Z\s]", "", resume)
    # Remove extra whitespaces
    resume = re.sub(r"[\s]+", " ", resume)
    # Remove contractions
    resume = contractions.fix(resume)

    return resume

cleaned_resume_df = df.copy(deep=True)
cleaned_resume_df["resume"] = cleaned_resume_df.apply(cleaning, axis=1)

# Basic Embeddings

In [10]:
import gensim
import gensim.downloader as api
from gensim import utils, matutils

In [11]:
def similarity_score(w1, w2):
    return np.dot(matutils.unitvec(w1), matutils.unitvec(w2))

In [12]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        for r in cleaned_resume_df["resume"]:
            # assume there's one document per line, tokens separated by whitespace
            yield r.split()


resumes = MyCorpus()
resume_wv_model = gensim.models.Word2Vec(
    sentences=resumes, min_count=10, vector_size=300, window=11
)
resume_wv = resume_wv_model.wv
google_wv = api.load("word2vec-google-news-300")

In [14]:
print(google_wv.most_similar(positive=["python"], topn=5))
print(resume_wv.most_similar(positive=["python"], topn=5))

[('pythons', 0.66883784532547), ('Burmese_python', 0.6680365800857544), ('snake', 0.6606292724609375), ('crocodile', 0.6591362953186035), ('boa_constrictor', 0.6443520188331604)]
[('pythondjango', 0.49119943380355835), ('pandas', 0.48099473118782043), ('usingpython', 0.43581053614616394), ('pythons', 0.43535345792770386), ('jinja', 0.418903112411499)]


In [26]:
print(similarity_score(google_wv["javascript"], google_wv["java"]))
print(similarity_score(resume_wv["javascript"], resume_wv["java"]))

0.33437246
0.32137614
