## We are doing similarity matching against an input job description with each document classified in a cluster using cosine similarity. The document with the highest similarity is considered the closest match in the cluster

### Import python libraries required for processing

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pd.set_option('display.max_colwidth', 100)

### Job Description - will start with simple text

In [3]:
job_description = "Software engineer with skills like java, spring"

### Resumes - will start with simple texts

In [4]:
resumes = ["This software engineer resume contains java, spring, hibernate",
             "This senior software engineer resume contains java, spring, spring boot",
             "This is a business analyst resume. jira, ppt, excel",
             "This is a business analyst. skill like trading, wealth management",
          "This software engineer resume contains python, pandas, data analysis",
             "This senior software engineer resume contains python, seaborn, NLP"]

In [5]:
df = pd.DataFrame({'Text': resumes})

### Data cleansing - removal of stop words# TF-IDF vectorization

In [6]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Text'])

### Apply K-means clustering on the resumes - currently on sample texts

In [7]:
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)
df

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,Text,Cluster
0,"This software engineer resume contains java, spring, hibernate",2
1,"This senior software engineer resume contains java, spring, spring boot",2
2,"This is a business analyst resume. jira, ppt, excel",0
3,"This is a business analyst. skill like trading, wealth management",0
4,"This software engineer resume contains python, pandas, data analysis",1
5,"This senior software engineer resume contains python, seaborn, NLP",1


### TF-IDF vectorization for the input text

In [8]:
input_vector = vectorizer.transform([job_description])

### Calculate cosine similarity with each document in the cluster

In [9]:
df['Similarity'] = df['Text'].apply(lambda x: cosine_similarity(vectorizer.transform([x]), input_vector).item())
df

Unnamed: 0,Text,Cluster,Similarity
0,"This software engineer resume contains java, spring, hibernate",2,0.613055
1,"This senior software engineer resume contains java, spring, spring boot",2,0.618316
2,"This is a business analyst resume. jira, ppt, excel",0,0.0
3,"This is a business analyst. skill like trading, wealth management",0,0.227367
4,"This software engineer resume contains python, pandas, data analysis",1,0.180457
5,"This senior software engineer resume contains python, seaborn, NLP",1,0.186688


### Display close matches in the same cluster

In [10]:
cluster_matches = df[df['Cluster'] == df.loc[df['Similarity'].idxmax(), 'Cluster']]
print("Close matches in the cluster:")
print(cluster_matches[['Text', 'Similarity']])

Close matches in the cluster:
                                                                      Text  \
0           This software engineer resume contains java, spring, hibernate   
1  This senior software engineer resume contains java, spring, spring boot   

   Similarity  
0    0.613055  
1    0.618316  
