In [1]:
#import library and module
import pandas as pd
from resume_screening import resparser, match



In [2]:
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set for fast membership tests.
stopw = {*stopwords.words('english')}

In [3]:
# Load the scraped Indeed job vacancies and store a lightly cleaned copy of
# every description in the 'test' column: tokens of one or two characters and
# English stop words are dropped, everything else keeps its original spelling.
job = pd.read_csv('indeed_data.csv')

# NLTK's English stop-word list is all lowercase, so the membership test is
# done on the lowercased token; a case-sensitive comparison lets capitalised
# stop words ("The", "And", "With", ...) slip through the filter.
job['test'] = job['description'].apply(
    lambda text: ' '.join(
        word
        for word in str(text).split()
        if len(word) > 2 and word.lower() not in stopw
    )
)

In [4]:
# Keep the first row for each distinct value of the 'test' column and
# renumber the surviving rows from zero.
df = job[~job.duplicated(subset='test')].reset_index(drop=True)
df.head()

Unnamed: 0,title,company,salary,description,link,test
0,Data Scientist (Analytics),Grab,,"Job Description: Life at Grab At Grab, every G...",https://id.indeed.com/rc/clk?jk=acd6479660d3fc...,"Job Description: Life Grab Grab, every Grabber..."
1,Data Scientist,Sayurbox,,Job Description Be a part of Sayurbox data tea...,https://id.indeed.com/rc/clk?jk=40482050a5e4d8...,Job Description part Sayurbox data team target...
2,Senior Data Scientist - GoPay,GO-JEK,,Location Jakarta Work Type Permanent Applicati...,https://id.indeed.com/rc/clk?jk=60463a42bb19a6...,Location Jakarta Work Type Permanent Applicati...
3,Data Scientist,Allianz Indonesia,,Bachelor Degree with field of study mathematic...,https://id.indeed.com/rc/clk?jk=835f2c980867fe...,"Bachelor Degree field study mathematic, inform..."
4,Senior Data Scientist,Sayurbox,,Job Description: Build Sayurbox data team acco...,https://id.indeed.com/rc/clk?jk=da6afcc2ee471d...,Job Description: Build Sayurbox data team acco...


In [5]:
# The 'test' column as a NumPy array of unicode strings.
test = df['test'].to_numpy().astype('U')

In [6]:
# Report how many rows remain in the de-duplicated vacancy table.
print("There are {} job vacancy list scrapped from id.indeed.com".format(len(df)))

There are 567 job vacancy list scrapped from id.indeed.com


# Input CV & Preprocessing

In [186]:
# Two independent, initially empty accumulators: resume file names and the
# percentage values recorded for them.
resumeList, similarityPercentage = [], []

In [295]:
# Ask for the resume/CV file name (looked up under instance/resume_files/).
# Surrounding whitespace from the prompt is stripped so it cannot end up in
# the path.
filled = input().strip()

# Parse first and record the name only once parsing has succeeded: if
# resparser.skill raises, the name must not stay in resumeList without a
# matching entry in similarityPercentage, because the two lists are paired by
# position later on.
skills = resparser.skill("instance/resume_files/{}".format(filled))
resumeList.append(filled)

In [296]:
# Swap the first entry of `skills` for the output of match.preprocessing;
# the processed entry is placed at the end of the same list object.
skills[:] = skills[1:] + [match.preprocessing(skills[0])]

print("SKILLS LIST:\n",skills)

SKILLS LIST:
 ['figma instagram organization investors feeds usability testing higher education banners mastery moc teaching social media user interface design space linkedin export google interfaces app wireframing internships ux online training mobile interview building materials ux design prototype design media design machine learning options promotional manuals ug content design dsc usability web ued mobile applications user experience design learning visual design java experience design illustration gsm web design developers sketching user experience digital graphic design testing']


# Model KNN - Minkowski Metric

In [297]:
# Run the KNN matcher on the resume skills against the job descriptions
# (Minkowski metric, per the section heading; the metric itself is set
# inside match.knearestNeighbors) and wrap its output in a one-column frame.
matches = pd.DataFrame(
    match.knearestNeighbors(skills, test),
    columns=['Match confidence'],
)

In [298]:
# Job vacancy recommendation: attach the KNN value to each vacancy (aligned on
# the row index), order ascending by it and keep the first 20 rows.
# NOTE(review): ascending order presumes a smaller 'match' means a closer
# vacancy, i.e. that the value is a distance; confirm in match.knearestNeighbors.
result_minkowski = (
    df[['title', 'company', 'salary']]
    .assign(match=matches['Match confidence'])
    .sort_values('match')
    .reset_index(drop=True)
    .head(20)
)
result_minkowski

Unnamed: 0,title,company,salary,match
0,UX Designer,PT HZN Teknologi Indonesia (Horizon),,0.54
1,UX/UI Designer (Ref. No. 2156),Global Landscapes Forum (CIFOR),,0.56
2,UX / UI Designer,Startups,,0.57
3,Senior UI/UX Designer,Tokenomy,,0.57
4,UI Designer - Internship,PT HZN Teknologi Indonesia (Horizon),,0.57
5,UI/ UX Designer,Tinkerlust,,0.57
6,Senior UI/UX Designer,Cermati.com,,0.6
7,UI / UX Designer,PT Mobile Sarana Sentosa,,0.61
8,Senior UI/UX Designer,Whello Indonesia,,0.62
9,UI/UX Designer (Senior),Pensieve,,0.62


# Model Cosine Similarity

In [299]:
# Add a 'clean' column holding match.preprocessing applied to every entry of
# 'test', and expose that column as a NumPy array of unicode strings.
df['clean'] = df['test'].map(match.preprocessing)
jobdesc = df['clean'].to_numpy().astype('U')

In [300]:
# Vectorise the processed resume skills together with the job descriptions,
# score them with match.coSim and wrap the scores in a one-column frame.
count_matrix = match.vectorizing(skills[0], jobdesc)
matchPercentage = pd.DataFrame(
    match.coSim(count_matrix),
    columns=['Skills Match'],
)

In [301]:
# Job vacancy recommendations: join the 'Skills Match' score onto each vacancy
# by row index, order from highest to lowest score and keep the first 20 rows.
result_cosine = (
    df[['title', 'company', 'salary']]
    .join(matchPercentage)
    .sort_values('Skills Match', ascending=False)
    .reset_index(drop=True)
    .head(20)
)
result_cosine

Unnamed: 0,title,company,salary,Skills Match
0,UI Designer - Internship,PT HZN Teknologi Indonesia (Horizon),,68.5
1,UX Designer,PT HZN Teknologi Indonesia (Horizon),,68.12
2,Senior UI/UX Designer,Whello Indonesia,,56.29
3,Senior UI/UX Designer,"DOKU, PT NUSA SATU INTI ARTHA",,55.31
4,UI / UX Designer,PT Mobile Sarana Sentosa,,54.49
5,UI/UX DESIGNER,Dipstrategy,,54.38
6,UI/ UX Designer,Tinkerlust,,52.51
7,Senior UI/UX Designer,PT Nusa Satu Inti Artha,,52.49
8,UI/UX Designer,Dekoruma,,50.85
9,UI/UX Designer,Nomura Research Institute Indonesia,,48.13


# Comparing Minkowski Metric vs Cosine Similarity

In [302]:
# Place the title/company columns of the two rankings side by side under a
# two-level column header, one top-level key per model.
headers = ['Minkowski', 'Cosine Similarity']
data = [ranking[['title', 'company']] for ranking in (result_minkowski, result_cosine)]
df_compare = pd.concat(data, axis='columns', keys=headers)
df_compare

Unnamed: 0_level_0,Minkowski,Minkowski,Cosine Similarity,Cosine Similarity
Unnamed: 0_level_1,title,company,title,company
0,UX Designer,PT HZN Teknologi Indonesia (Horizon),UI Designer - Internship,PT HZN Teknologi Indonesia (Horizon)
1,UX/UI Designer (Ref. No. 2156),Global Landscapes Forum (CIFOR),UX Designer,PT HZN Teknologi Indonesia (Horizon)
2,UX / UI Designer,Startups,Senior UI/UX Designer,Whello Indonesia
3,Senior UI/UX Designer,Tokenomy,Senior UI/UX Designer,"DOKU, PT NUSA SATU INTI ARTHA"
4,UI Designer - Internship,PT HZN Teknologi Indonesia (Horizon),UI / UX Designer,PT Mobile Sarana Sentosa
5,UI/ UX Designer,Tinkerlust,UI/UX DESIGNER,Dipstrategy
6,Senior UI/UX Designer,Cermati.com,UI/ UX Designer,Tinkerlust
7,UI / UX Designer,PT Mobile Sarana Sentosa,Senior UI/UX Designer,PT Nusa Satu Inti Artha
8,Senior UI/UX Designer,Whello Indonesia,UI/UX Designer,Dekoruma
9,UI/UX Designer (Senior),Pensieve,UI/UX Designer,Nomura Research Institute Indonesia


In [303]:
# Vacancies that appear in both rankings, matched on (title, company).
#
# Each side is de-duplicated before merging. An inner merge emits one row per
# left/right combination of equal keys, so a (title, company) pair that occurs
# more than once in a ranking would be counted several times and inflate the
# overlap (and any percentage derived from job_compare's row count).
# After de-duplication every shared pair contributes exactly one row.
job_compare = pd.merge(
    df_compare['Minkowski'].drop_duplicates(),
    df_compare['Cosine Similarity'].drop_duplicates(),
    how='inner',
    on=['title', 'company'],
)
job_compare

Unnamed: 0,title,company
0,UX Designer,PT HZN Teknologi Indonesia (Horizon)
1,UX / UI Designer,Startups
2,Senior UI/UX Designer,Tokenomy
3,UI Designer - Internship,PT HZN Teknologi Indonesia (Horizon)
4,UI/ UX Designer,Tinkerlust
5,UI / UX Designer,PT Mobile Sarana Sentosa
6,Senior UI/UX Designer,Whello Indonesia
7,UI/UX Designer,Mekari (PT. Mid Solusi Nusantara)
8,Senior UI/UX Designer,PT Nusa Satu Inti Artha
9,UI/UX Designer,1001malam.com


In [304]:
# Record the overlap between the two rankings as a percentage of the rows in
# the comparison table.
# The multiplication is done before the division so that integer row counts
# give clean values (100 * 11 / 20 == 55.0, whereas (11 / 20) * 100 ==
# 55.00000000000001), and an empty comparison table records 0.0 instead of
# raising ZeroDivisionError.
similarityPercentage.append(
    100 * job_compare.shape[0] / df_compare.shape[0] if df_compare.shape[0] else 0.0
)

In [305]:
# Show both accumulators, one per line.
print(resumeList, similarityPercentage, sep="\n")

['Arbi Dwi.pdf', 'Dwiky Aprian.pdf', 'Fadjrian Gibran.pdf', 'Faisal_Rizki.pdf', 'Ferdian_Maulana.pdf', 'Hanif Khoirul.pdf', 'Ilyas Adiyasa.pdf', 'Rangga Syahrial.pdf', 'Triardy_S.pdf', 'uiux.pdf']
[30.0, 15.0, 40.0, 25.0, 65.0, 55.00000000000001, 10.0, 45.0, 40.0, 70.0]


In [306]:
# Build a table with one row per resume name and its recorded percentage.
#
# The two lists are paired by position. zip() silently stops at the shorter
# list, so if they ever differ in length the table would be truncated and
# names could be shown next to percentages that belong to other resumes.
# Fail loudly instead of producing a mislabelled table.
if len(resumeList) != len(similarityPercentage):
    raise ValueError(
        "resumeList has {} entries but similarityPercentage has {}".format(
            len(resumeList), len(similarityPercentage)
        )
    )
modelSimilarity = pd.DataFrame(
    list(zip(resumeList, similarityPercentage)),
    columns=['Resume Name', 'Similarity'],
)
print(modelSimilarity)

           Resume Name  Similarity
0         Arbi Dwi.pdf        30.0
1     Dwiky Aprian.pdf        15.0
2  Fadjrian Gibran.pdf        40.0
3     Faisal_Rizki.pdf        25.0
4  Ferdian_Maulana.pdf        65.0
5    Hanif Khoirul.pdf        55.0
6    Ilyas Adiyasa.pdf        10.0
7  Rangga Syahrial.pdf        45.0
8        Triardy_S.pdf        40.0
9             uiux.pdf        70.0


In [307]:
# Print the mean of the 'Similarity' column (the message is Indonesian for
# "The average similarity between the 2 metrics above is:").
print("Rata-rata similarity antara 2 metric diatas adalah: ", modelSimilarity.loc[:, "Similarity"].mean())

Rata-rata similarity antara 2 metric diatas adalah:  39.5
