In [2]:
import pandas as pd

resume_df = pd.read_csv(r'Resume and job_description/Resume.csv')
job_desc_df = pd.read_csv(r'Resume and job_description/training_data.csv')


In [3]:
print(resume_df.head())    # Check first 5 rows of Resume.csv
print(job_desc_df.head())  # Check first 5 rows of training_data.csv


         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  
  company_name                                    job_description  \
0       Google  minimum qualifications\nbachelors degree or eq...   
1        Apple  description\nas an asc you will be highly infl...   
2      Netfl

In [4]:
import re
import string

# Function to clean the text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters (punctuation)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces

# Apply the cleaning function to both the resumes and job descriptions
resume_df['cleaned_resume'] = resume_df['Resume_str'].apply(clean_text)
job_desc_df['cleaned_job_desc'] = job_desc_df['job_description'].apply(clean_text)

# Check the first few rows of the cleaned text
print(resume_df[['Resume_str', 'cleaned_resume']].head())
print(job_desc_df[['job_description', 'cleaned_job_desc']].head())


                                          Resume_str  \
0           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1           HR SPECIALIST, US HR OPERATIONS      ...   
2           HR DIRECTOR       Summary      Over 2...   
3           HR SPECIALIST       Summary    Dedica...   
4           HR MANAGER         Skill Highlights  ...   

                                      cleaned_resume  
0  hr administratormarketing associate hr adminis...  
1  hr specialist us hr operations summary versati...  
2  hr director summary over years experience in r...  
3  hr specialist summary dedicated driven and dyn...  
4  hr manager skill highlights hr skills hr depar...  
                                     job_description  \
0  minimum qualifications\nbachelors degree or eq...   
1  description\nas an asc you will be highly infl...   
2  its an amazing time to be joining netflix as w...   
3  description\n\nweb designers looking to expand...   
4  at trackfive weve got big goals were on a miss... 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the resumes and job descriptions
resume_tfidf = vectorizer.fit_transform(resume_df['cleaned_resume'])
job_desc_tfidf = vectorizer.transform(job_desc_df['cleaned_job_desc'])

# Check the dimensions (how many features were extracted)
print(resume_tfidf.shape)
print(job_desc_tfidf.shape)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between job descriptions and resumes
cosine_sim = cosine_similarity(job_desc_tfidf, resume_tfidf)

# Let's check the similarity between the first job description and all resumes
print(cosine_sim[0])  # Similarity scores for the first job description with all resumes


[0.03840634 0.05253762 0.03410584 ... 0.01971533 0.02443052 0.03486078]


In [None]:
import numpy as np

# Get the top 5 resumes for the first job description
top_resumes = np.argsort(cosine_sim[0])[-5:][::-1]  # Top 5 resumes for the first job description

# Print out the top 5 resumes and their similarity scores
for idx in top_resumes:
    print(f"Resume: {resume_df['Resume_str'].iloc[idx]}")
    print(f"Similarity Score: {cosine_sim[0][idx]}")
    print('-' * 50)


Resume:          PA MEDIA GROUP       Summary    Be in a position involving and utilizing my marketing and management skills and knowledge gained throughout my education and on the job experience.      Highlights        Deep understanding of Google Analytics; analyzing website traffic and trends to help make business decisions; experienced in the tracking and optimize advertising campaigns; Heavy experience in digital marketing (search, target ads, email, social, display, mobile); sold and managed numerous multi platform digital marketing plans; reading and interpreting digital reports; extensive print experience;  effective when leading a team;  organized; take direction well; open minded; customer service experience; sufficient in computer based skills; work well under pressure; event planning experience
*reference available upon request            Experience      Pa Media Group   04/2015   to   Current     Company Name      I am responsible for maintaining and growing the Real Estat

In [None]:
def get_top_matches(job_idx, top_n=5):
    """
    Returns a DataFrame of the top_n matching resumes for the job at index job_idx.
    """
    # Get similarity scores for this job
    scores = cosine_sim[job_idx]
    # Find top N resume indices
    top_idx = np.argsort(scores)[-top_n:][::-1]
    # Build result DataFrame
    results = pd.DataFrame({
        'Resume_ID': resume_df['ID'].iloc[top_idx].values,
        'Category': resume_df['Category'].iloc[top_idx].values,
        'Similarity': scores[top_idx]
    })
    results['Resume_Snippet'] = resume_df['cleaned_resume'].iloc[top_idx].str[:200].values
    return results

# Example: top 5 matches for the 2nd job description
print(get_top_matches(job_idx=1, top_n=5))


   Resume_ID              Category  Similarity  \
0   27139412  BUSINESS-DEVELOPMENT    0.137367   
1   13199813  BUSINESS-DEVELOPMENT    0.131715   
2   38896303            CONSULTANT    0.127800   
3   17189156               BANKING    0.114142   
4   48533663            CONSULTANT    0.110609   

                                      Resume_Snippet  
0  business development executive summary an achi...  
1  director of business development executive pro...  
2  consultant career overview macintosh expert wi...  
3  business banking specialist summary topperform...  
4  consultant executive profile visionary operati...  


In [None]:
all_matches = []

for idx in range(len(job_desc_df)):
    top5 = get_top_matches(job_idx=idx, top_n=5)
    top5.insert(0, 'Job_Index', idx)
    top5.insert(1, 'Job_Title', job_desc_df['position_title'].iat[idx])
    all_matches.append(top5)

# Concatenate into one DataFrame
matches_df = pd.concat(all_matches, ignore_index=True)

# Show the first few rows
matches_df.head(10)


Unnamed: 0,Job_Index,Job_Title,Resume_ID,Category,Similarity,Resume_Snippet
0,0,Sales Specialist,85421438,HEALTHCARE,0.204557,pa media group summary be in a position involv...
1,0,Sales Specialist,80503242,ADVOCATE,0.17152,manager digital marketing and communications o...
2,0,Sales Specialist,15535920,CONSULTANT,0.167245,business consultant professional summary it bu...
3,0,Sales Specialist,28679359,DIGITAL-MEDIA,0.161122,digital marketing manager summary years of exp...
4,0,Sales Specialist,41152404,BPO,0.146347,test analystinterncontractor profile years of ...
5,1,Apple Solutions Consultant,27139412,BUSINESS-DEVELOPMENT,0.137367,business development executive summary an achi...
6,1,Apple Solutions Consultant,13199813,BUSINESS-DEVELOPMENT,0.131715,director of business development executive pro...
7,1,Apple Solutions Consultant,38896303,CONSULTANT,0.1278,consultant career overview macintosh expert wi...
8,1,Apple Solutions Consultant,17189156,BANKING,0.114142,business banking specialist summary topperform...
9,1,Apple Solutions Consultant,48533663,CONSULTANT,0.110609,consultant executive profile visionary operati...


In [None]:
matches_df['Recruiter_Score'] = np.nan
matches_df['Recruiter_Notes'] = ''


In [None]:
matches_df.to_excel('matches_for_feedback.xlsx', index=False)


In [None]:
feedback_df = pd.read_excel('matches_for_feedback.xlsx')

# Update the recruiter scores
matches_df['Recruiter_Score'] = feedback_df['Recruiter_Score']
