## Match an input job description against clustered resumes using cosine similarity. Each document is assigned to a cluster, and the document with the highest similarity to the job description is considered the closest match; the other documents in its cluster are shown as close matches.

### Import the Python libraries required for processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# Silence every warning for the rest of the session. This keeps the cell
# outputs clean, but it also hides deprecation/behaviour-change notices
# emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
def bucket_name_from_url(url: str) -> str:
    """Return the last non-empty path segment of ``url``.

    Trailing slashes are stripped before splitting, so a URL that ends in
    "/" still yields the final segment instead of an empty string.

    Parameters
    ----------
    url : str
        Slash-separated URL or path.

    Returns
    -------
    str
        The text after the last "/" once trailing slashes are removed
        (the whole string when it contains no "/").
    """
    return url.rstrip("/").rsplit("/", 1)[-1]


# Google Cloud console URL; its last path segment is used as the bucket name.
inputPath = "https://console.cloud.google.com/storage/browser/hackathon1415"
bucket_name = bucket_name_from_url(inputPath)
bucket_name

'hackathon1415'

In [3]:
# Load the extracted resume data from the workbook named after the bucket,
# discard every row that has a missing value, and preview the first rows.
excel_file = f"{bucket_name}_resume_data.xlsx"
df = pd.read_excel(excel_file).dropna()
df.head()

Unnamed: 0,name,skills
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi..."
1,RESUME/data/10265057.pdf,"Requests, Programming, Python, Reports, Data c..."
2,RESUME/data/10399912.pdf,"Schedules, Correspondence, Reports, Customer s..."
3,RESUME/data/10549585.pdf,"Analysis, Retention, Windows, Analytical, Sche..."
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database..."


### Job Description - will start with simple text

In [4]:
# Job description given as a comma-separated list of skills, in the same
# format as the `skills` column of the resume data.
job_description = (
    'Analysis, Programming, Windows, Quality assurance, Telecom, '
    'Security, Servers, Database, Design, Troubleshooting, '
    'Erp, System, Sap, Solidworks, Communication, '
    'Lan, Performance metrics, Sql, Microsoft sql, Hardware, '
    'Visio, Compliance, Contracts, Policies, Budgeting'
)

### Data cleansing (removal of stop words) and TF-IDF vectorization

In [5]:
# Learn the TF-IDF vocabulary from the resume skills (English stop words are
# dropped) and encode every resume as one row of the matrix X.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw_documents=df['skills'])

### Apply K-means clustering on the resumes - currently on sample texts

In [6]:
# Partition the TF-IDF vectors into `num_clusters` K-means clusters and store
# each resume's cluster label next to it. The fixed random_state seeds the
# estimator so repeated runs use the same random initialisation.
num_clusters = 30
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
df['Cluster'] = kmeans.labels_
df

Unnamed: 0,name,skills,Cluster
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2
1,RESUME/data/10265057.pdf,"Requests, Programming, Python, Reports, Data c...",7
2,RESUME/data/10399912.pdf,"Schedules, Correspondence, Reports, Customer s...",1
3,RESUME/data/10549585.pdf,"Analysis, Retention, Windows, Analytical, Sche...",0
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2
...,...,...,...
195,RESUME/data/13477922.pdf,"Analysis, Windows, Biology, Teaching, Security...",2
196,RESUME/data/13491889.pdf,"Analysis, Microsoft office suite, English, Ana...",6
197,RESUME/data/13503650.pdf,"Analysis, Requests, Correspondence, Reports, C...",27
198,RESUME/data/13518263.pdf,"Construction, Presentations, Schedules, Proces...",3


### TF-IDF vectorization for the input text

In [7]:
# Encode the job description with the vocabulary already fitted on the
# resumes; it is wrapped in a list because the vectorizer expects a
# collection of documents.
input_vector = vectorizer.transform(raw_documents=[job_description])

### Calculate cosine similarity between the job description and every document

In [8]:
# Score every resume against the job description in one vectorised call.
# X already holds the TF-IDF row of each resume, in the same row order as df
# (it was built from df['skills']), so re-transforming each skills string one
# at a time inside .apply is unnecessary work.
# cosine_similarity returns an (n_resumes, 1) array; ravel() flattens it to
# one score per row, which is assigned positionally.
df['Similarity'] = cosine_similarity(X, input_vector).ravel()
df

Unnamed: 0,name,skills,Cluster,Similarity
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811
1,RESUME/data/10265057.pdf,"Requests, Programming, Python, Reports, Data c...",7,0.160637
2,RESUME/data/10399912.pdf,"Schedules, Correspondence, Reports, Customer s...",1,0.032828
3,RESUME/data/10549585.pdf,"Analysis, Retention, Windows, Analytical, Sche...",0,0.196745
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408
...,...,...,...,...
195,RESUME/data/13477922.pdf,"Analysis, Windows, Biology, Teaching, Security...",2,0.161145
196,RESUME/data/13491889.pdf,"Analysis, Microsoft office suite, English, Ana...",6,0.064340
197,RESUME/data/13503650.pdf,"Analysis, Requests, Correspondence, Reports, C...",27,0.062434
198,RESUME/data/13518263.pdf,"Construction, Presentations, Schedules, Proces...",3,0.028068


### Display close matches: documents in the same cluster as the best-scoring document

In [9]:
# Find the resume with the highest similarity, look up its cluster, and keep
# every resume that was assigned to that same cluster.
best_row_label = df['Similarity'].idxmax()
best_cluster = df.at[best_row_label, 'Cluster']
cluster_matches = df.loc[df['Cluster'].eq(best_cluster)]
cluster_matches

Unnamed: 0,name,skills,Cluster,Similarity
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408
35,RESUME/data/11580408.pdf,"Windows, Reports, Security, Servers, Database,...",2,0.349903
101,RESUME/data/12374933.pdf,"Analysis, Scripting, Windows, Requests, Progra...",2,0.271212
105,RESUME/data/12415691.pdf,"Visual, Scripting, Windows, Python, Reports, S...",2,0.211617
193,RESUME/data/13418452.pdf,"Analysis, Retention, Mis, Windows, Scripting, ...",2,0.161289
195,RESUME/data/13477922.pdf,"Analysis, Windows, Biology, Teaching, Security...",2,0.161145


In [10]:
print("Close matches in the cluster:")
# Keep only resumes with a strictly positive similarity and list them from
# the most to the least similar.
cluster_matches = (
    cluster_matches
    .loc[lambda matches: matches['Similarity'] > 0]
    .sort_values(by='Similarity', ascending=False)
)
cluster_matches

Close matches in the cluster:


Unnamed: 0,name,skills,Cluster,Similarity
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811
35,RESUME/data/11580408.pdf,"Windows, Reports, Security, Servers, Database,...",2,0.349903
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408
101,RESUME/data/12374933.pdf,"Analysis, Scripting, Windows, Requests, Progra...",2,0.271212
105,RESUME/data/12415691.pdf,"Visual, Scripting, Windows, Python, Reports, S...",2,0.211617
193,RESUME/data/13418452.pdf,"Analysis, Retention, Mis, Windows, Scripting, ...",2,0.161289
195,RESUME/data/13477922.pdf,"Analysis, Windows, Biology, Teaching, Security...",2,0.161145


### Normalize Similarity Scores

In [15]:

# Min-max scale the similarity scores to the 0-1 range.
similarity_min = df['Similarity'].min()
similarity_range = df['Similarity'].max() - similarity_min
if similarity_range == 0:
    # Every resume has the same similarity, so (x - min) / range would be
    # 0 / 0 = NaN. Report 0.0 for all rows instead of NaN.
    normalized_scores = pd.Series(0.0, index=df.index, name='Similarity')
else:
    normalized_scores = (df['Similarity'] - similarity_min) / similarity_range
normalized_scores

0      1.000000
1      0.220409
2      0.045044
3      0.269953
4      0.413562
         ...   
195    0.221107
196    0.088281
197    0.085666
198    0.038513
199    0.032450
Name: Similarity, Length: 199, dtype: float64

In [25]:
# Step 4: Thresholding
# Minimum confidence (min-max normalised similarity) a resume must reach to
# be shortlisted; the comparison made further down is inclusive (>=).
threshold = 0.4

In [22]:
# Step 5: Confidence Score Calculation
# The confidence of a resume is its normalised similarity score. The values
# are copied into a plain 1-D array, so they are assigned by position rather
# than by index label.
df['confidence'] = normalized_scores.to_numpy(copy=True)
df

Unnamed: 0,name,skills,Cluster,Similarity,confidence
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811,1.000000
1,RESUME/data/10265057.pdf,"Requests, Programming, Python, Reports, Data c...",7,0.160637,0.220409
2,RESUME/data/10399912.pdf,"Schedules, Correspondence, Reports, Customer s...",1,0.032828,0.045044
3,RESUME/data/10549585.pdf,"Analysis, Retention, Windows, Analytical, Sche...",0,0.196745,0.269953
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408,0.413562
...,...,...,...,...,...
195,RESUME/data/13477922.pdf,"Analysis, Windows, Biology, Teaching, Security...",2,0.161145,0.221107
196,RESUME/data/13491889.pdf,"Analysis, Microsoft office suite, English, Ana...",6,0.064340,0.088281
197,RESUME/data/13503650.pdf,"Analysis, Requests, Correspondence, Reports, C...",27,0.062434,0.085666
198,RESUME/data/13518263.pdf,"Construction, Presentations, Schedules, Proces...",3,0.028068,0.038513


In [26]:
# Step 6: Apply the threshold
# Shortlist the resumes whose confidence is at least `threshold`.
df_with_confidence = df.loc[df['confidence'].ge(threshold)]
df_with_confidence

Unnamed: 0,name,skills,Cluster,Similarity,confidence
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811,1.0
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408,0.413562
35,RESUME/data/11580408.pdf,"Windows, Reports, Security, Servers, Database,...",2,0.349903,0.480101


In [27]:
# Step 7: Ranking
# Display the shortlisted resumes from highest to lowest confidence. The
# sorted frame is only shown here; it is not stored in a variable.
df_with_confidence.sort_values('confidence', ascending=False)

Unnamed: 0,name,skills,Cluster,Similarity,confidence
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi...",2,0.728811,1.0
35,RESUME/data/11580408.pdf,"Windows, Reports, Security, Servers, Database,...",2,0.349903,0.480101
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database...",2,0.301408,0.413562
