## **Building Functions**

In [1]:
def remove_repeated_letters(text):
    """Replace every run of two or more consecutive ``A`` characters with one space.

    A single ``A`` is left untouched and the match is case-sensitive, so
    lowercase ``a`` runs are not affected.

    Args:
        text: The string to clean.

    Returns:
        A new string in which each ``AA...`` run has become a single space.
    """
    repeated_a = re.compile(r'(A)\1+')
    return repeated_a.sub(' ', text)


def sum_tuples(lists):
    """Add up the values of ``(key, value)`` pairs that share a key.

    Args:
        lists: An iterable of iterables, each yielding ``(key, value)`` pairs.

    Returns:
        A dict mapping every key seen to the sum of all its values. Keys
        appear in the order in which they were first encountered.
    """
    totals = {}

    # Walk all pairs of all inner lists in their original order.
    for key, value in (pair for pairs in lists for pair in pairs):
        totals[key] = totals.get(key, 0) + value

    return totals


def multiply_dictionary_values(dictionary, multiplier):
    """Return a copy of ``dictionary`` with every value multiplied by ``multiplier``.

    Args:
        dictionary: Mapping whose values support ``*`` with ``multiplier``.
        multiplier: The single factor applied to each value.

    Returns:
        A new dict with the same keys (same order); the input is not modified.
    """
    return {key: value * multiplier for key, value in dictionary.items()}

def divide_dictionary_values(dictionary, divider):
    """Return a copy of ``dictionary`` with every value divided by ``divider``.

    Args:
        dictionary: Mapping whose values support ``/`` with ``divider``.
        divider: The single divisor applied to each value.

    Returns:
        A new dict with the same keys (same order); the input is not modified.

    Raises:
        ZeroDivisionError: If ``divider`` is zero and ``dictionary`` holds
            plain numeric values.
    """
    return {key: value / divider for key, value in dictionary.items()}


def sum_dicts(dict1, dict2):
    """Add the values of two dictionaries key by key.

    Only keys present in BOTH dictionaries are kept; a key that exists in
    just one of them is dropped from the result.

    Args:
        dict1: First mapping; its key order determines the result's order.
        dict2: Second mapping.

    Returns:
        A new dict ``{key: dict1[key] + dict2[key]}`` for every shared key.
    """
    return {key: dict1[key] + dict2[key] for key in dict1 if key in dict2}

def rank_profiles(profiles, requirement,weight_value):
    """Rank profiles by TF-IDF cosine similarity to a single requirement text.

    Args:
        profiles: List of profile texts; the TF-IDF vocabulary is fitted on these.
        requirement: The text every profile is compared against.
        weight_value: Factor (converted with ``float``) applied to each similarity.

    Returns:
        A list of ``(profile_position, weighted_similarity)`` tuples ordered
        from the most to the least similar profile, where ``profile_position``
        is the index of the profile inside ``profiles``.
    """
    # Fit the vocabulary on the profiles only, then project the requirement
    # into that same vector space.
    tfidf = TfidfVectorizer()
    profile_matrix = tfidf.fit_transform(profiles)
    requirement_row = tfidf.transform([requirement])

    # One query row against all profiles -> take the single row of the result.
    similarities = cosine_similarity(requirement_row, profile_matrix)[0]

    # Order by raw similarity first; the weight is applied afterwards.
    ordered = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    weight = float(weight_value)
    return [(position, score * weight) for position, score in ordered]

def weight_features(description_weight,requirement_weight,description_ranking,requirements_rankings,requirements):
    """Blend the description score and the requirement score of each profile.

    The summed requirement scores are first divided by the number of
    requirements so that they sit on the scale of the single description
    score; both parts are then multiplied by their weight and added up.

    Args:
        description_weight: Factor applied to the description scores.
        requirement_weight: Factor applied to the averaged requirement scores.
        description_ranking: Dict of profile key -> description score.
        requirements_rankings: Dict of profile key -> summed requirement score.
        requirements: The requirements; only their count is used here.

    Returns:
        Dict of profile key -> combined score, containing only keys present
        in both rankings and following the key order of ``requirements_rankings``.
    """
    requirement_count = len(requirements)

    # Bring the sum over all requirements down to a per-requirement average.
    averaged_requirement_scores = divide_dictionary_values(requirements_rankings, requirement_count)

    # Apply the two weights independently.
    requirement_part = multiply_dictionary_values(averaged_requirement_scores, requirement_weight)
    description_part = multiply_dictionary_values(description_ranking, description_weight)

    return sum_dicts(requirement_part, description_part)

## **Running Application**

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dir_location = r"./data"

# Load the resumes into a DataFrame.
resumes = pd.read_csv(f"{dir_location}/resumes.csv")

# Rewrite the Dutch heading 'Belangrijkste vaardigheden' as 'Top Skills'
# so that every resume uses the same marker.
translated_text = resumes[" resume_text"].replace('Belangrijkste vaardigheden', 'Top Skills', regex=True)
resumes[" resume_text"] = translated_text.tolist()

# Cleaned resume texts: runs of repeated A's (censored information) are removed
# and any whitespace run is collapsed into a single space.
profiles = [
    re.sub(r"\s+", " ", remove_repeated_letters(raw_text))
    for raw_text in resumes[" resume_text"].values
]

# Keep the cleaned text next to the raw text.
resumes["processed_text"] = profiles

# Read the job description from the JSON file.
description_df = pd.read_json(f"{dir_location}/job_description_response.json")

# Collect each requirement title together with its weight label in ONE pass
# over the frame, so that requirements[i] and requirement_weights[i] always
# describe the same requirement. (Previously the weights were gathered in a
# second loop that reused the `row_values` variable left over from the last
# row only, which misaligned the two lists for multi-row input and raised a
# NameError for an empty frame.)
requirements = []
requirement_weight_labels = []
for _, row in description_df.iterrows():
    for cell in row:
        # Only list-valued cells hold requirement entries; skip everything else.
        if not isinstance(cell, list):
            continue
        for element in cell:
            requirements.append(element["title"])
            # "weigth" (sic) is the key this code has always read; presumably
            # it is the spelling used in the JSON file - confirm.
            requirement_weight_labels.append(element["weigth"])

# Map the textual weight labels to integers; an unknown label raises KeyError.
label_mapping = {'Nice to have': 1, 'Should have': 2, 'Must have': 3}
requirement_weights = [label_mapping[label] for label in requirement_weight_labels]

# Clean the job description text (removes runs of repeated A's).
description = remove_repeated_letters(description_df["description"].values[0])

# Rank all profiles once per requirement, each ranking scaled by the weight
# of that requirement.
per_requirement_rankings = [
    rank_profiles(profiles, requirement, requirement_weights[position])
    for position, requirement in enumerate(requirements)
]

# Sum the weighted similarities per profile, then order the profiles from
# the highest to the lowest total.
requirement_totals = sum_tuples(per_requirement_rankings)
requirements_rankings = dict(
    sorted(requirement_totals.items(), key=lambda entry: entry[1], reverse=True)
)

# Rank the profiles against the full description text (weight 1) and turn
# the ranking into a profile -> score dictionary.
description_ranking = sum_tuples([rank_profiles(profiles, description, 1)])

# Combine both signals: 0.2 for the description, 0.8 for the requirements.
combined_scores = weight_features(0.2, 0.8, description_ranking, requirements_rankings, requirements)

# Final ranking, best profile first.
main_rankings = dict(
    sorted(combined_scores.items(), key=lambda entry: entry[1], reverse=True)
)

# Look up the applicant id for every ranked profile position, keeping the
# ranked order.
new_keys = [resumes["id"].values[position] for position in main_rankings]

# Re-key the ranking: profile position -> applicant id.
main_rankings = dict(zip(new_keys, main_rankings.values()))

# Write the final ranking to a CSV file.
ranking_table = pd.DataFrame(main_rankings.items(), columns=["applicant","value"])
ranking_table.to_csv('result_ranking.csv')

## **Analysis**

In [3]:
# Resumes that contain the keyword ' sap ' (surrounded by spaces).
# `case=False` already makes the match case-insensitive, so no extra regex
# flag is needed; `na=False` counts a missing resume text as "no match" so
# the mask is purely boolean and safe to index with.
mask2 = resumes[" resume_text"].str.contains(" sap ", case=False, regex=True, na=False)
# Only the LAST expression of a cell is rendered automatically, so this
# intermediate result has to be displayed explicitly.
display(resumes[mask2])

# Pattern analysis: the job title was observed to follow the 'Top Skills'
# keyword, so list the resumes that contain it.
mask = resumes[" resume_text"].str.contains("Top Skills", case=False, na=False)
resumes[mask]

Unnamed: 0,id,resume_text,processed_text
0,socrates,Contact AAAAAAAAAAAAAA 46 AAAAAAAAAAAAAA AAAA...,Contact 46 Mobile LinkedIn Top Skills Busines...
1,pythagoras,Contact AAAAAAAAAAAAAAA Mobile AAAAAAAAAAAAAA...,Contact Mobile julliescom Company Top Skills ...
2,heraclitus,Contactgegevens AAAAAAAAAAAAAAAAAAAA wwwlinke...,Contactgegevens wwwlinkedincomin LinkedIn Top...
3,homer,Contact wwwlinkedincomin AAAAAAAAAAAAAAAAA AA...,Contact wwwlinkedincomin Top Skills Business ...
5,hesiod,Contact wwwlinkedincomin AAAAAAAA AAAAAAAAAAA...,Contact wwwlinkedincomin Company Top Skills F...
6,theodorus,Contact wwwlinkedincomin AAAAAAAAA AAAAAAAA T...,Contact wwwlinkedincomin Top Skills UML Busin...
9,elea,Contact AAAAAAAAAAA Home AAAAAAAAA wwwlinkedi...,Contact Home wwwlinkedincomin LinkedIn Top Sk...
10,protagoras,Contact AAAAAAAAAAAAAAAAAAAA wwwlinkedincomin...,Contact wwwlinkedincomin Top Skills Project M...
11,thomas_baldwin,Contact AAAAAAAAAAA Home AAAAAAAAAAAAAAAAAAA ...,Contact Home wwwlinkedincomin Top Skills Serv...
12,alexander_bain,Contact AAAAAAAAAAAAAAAAAAAAAA wwwlinkedincom...,Contact wwwlinkedincomin LinkedIn Personal Co...
