In [1]:
#import library and module
import pandas as pd
from resume_screening import resparser, match



In [2]:
from nltk.corpus import stopwords
stopw  = set(stopwords.words('english'))

In [3]:
#read indeed job vacancy data
job = pd.read_csv('indeed_data.csv')
job['test'] = job['description'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>2 and word not in (stopw)]))

In [4]:
df = job.drop_duplicates(subset='test').reset_index(drop=True)
df.head()

Unnamed: 0,title,company,salary,description,link,test
0,Data Scientist (Analytics),Grab,,"Job Description: Life at Grab At Grab, every G...",https://id.indeed.com/rc/clk?jk=acd6479660d3fc...,"Job Description: Life Grab Grab, every Grabber..."
1,Data Scientist,Sayurbox,,Job Description Be a part of Sayurbox data tea...,https://id.indeed.com/rc/clk?jk=40482050a5e4d8...,Job Description part Sayurbox data team target...
2,Senior Data Scientist - GoPay,GO-JEK,,Location Jakarta Work Type Permanent Applicati...,https://id.indeed.com/rc/clk?jk=60463a42bb19a6...,Location Jakarta Work Type Permanent Applicati...
3,Data Scientist,Allianz Indonesia,,Bachelor Degree with field of study mathematic...,https://id.indeed.com/rc/clk?jk=835f2c980867fe...,"Bachelor Degree field study mathematic, inform..."
4,Senior Data Scientist,Sayurbox,,Job Description: Build Sayurbox data team acco...,https://id.indeed.com/rc/clk?jk=da6afcc2ee471d...,Job Description: Build Sayurbox data team acco...


In [5]:
test = (df['test'].values.astype('U'))

In [6]:
print("There are {} job vacancy list scrapped from id.indeed.com".format(df.shape[0]))

There are 567 job vacancy list scrapped from id.indeed.com


In [7]:
#input the resume or CV file
filled = input()
skills = resparser.skill("resume_files/{}".format(filled))

In [8]:
skills.append(match.preprocessing(skills[0]))
del skills[0]

print("SKILLS LIST:\n",skills)

SKILLS LIST:
 ['online machine learning mobile applications prototype figma user experience design ued building banners user experience interview ug dsc content design interfaces wireframing investors design organization social media mastery web design internships gsm java ux design mobile google learning export higher education teaching experience design promotional feeds instagram options testing sketching media design training graphic design visual design space app linkedin moc digital usability testing developers materials user interface design ux illustration usability web manuals']


# Model KNN - Minkowski Metric

In [11]:
#Train the KNN model using Minkowski Metric
matches = match.knearestNeighbors(skills, test)
matches = pd.DataFrame(matches, columns=['Match confidence'])

In [17]:
#Job Vacancy Recommendation
result_minkowski = df[['title','company','salary']].copy()
result_minkowski['match'] = matches['Match confidence'].copy()
result_minkowski = result_minkowski.sort_values('match').reset_index(drop=True).head(10)
result_minkowski

Unnamed: 0,title,company,salary,match
0,Oracle DBA Consultant,NTT Ltd,,0.72
1,DATABASE ADMINISTRATOR – Entertainment Platfor...,Parrish & Co.,,0.73
2,Database Administrator,Mekari (PT. Mid Solusi Nusantara),,0.74
3,Database Administrator Officer - Finance Services,RGF HR Agent,,0.75
4,Database Administrator (for AMAAN Project),DKatalis,,0.75
5,Database Administrator Officer,Reeracoen Indonesia,,0.76
6,Oracle Database Administrator,PT. Berca Hardayaperkasa,,0.77
7,Business Intelligence (Junior/Senior/Lead),Cermati.com,,0.77
8,"Data Engineer, Commercial & Operation Team",ZALORA SOUTH EAST ASIA PTE LTD,,0.77
9,System Administrator,Walden Global Services - WGS,,0.77


# Model Cosine Similarity

In [13]:
df['clean'] = df['test'].apply(match.preprocessing)
jobdesc = (df['clean'].values.astype('U'))

In [14]:
count_matrix = match.vectorizing(skills[0], jobdesc)
matchPercentage = match.coSim(count_matrix)
matchPercentage = pd.DataFrame(matchPercentage, columns=['Skills Match'])

In [18]:
#Job Vacancy Recommendations
result_cosine = df[['title','company','salary']]
result_cosine = result_cosine.join(matchPercentage)
result_cosine = result_cosine.sort_values('Skills Match', ascending=False).reset_index(drop=True).head(10)
result_cosine

Unnamed: 0,title,company,salary,Skills Match
0,IT Database Administrator,PT Anabatic Digital Raya,,32.57
1,CISO12 - Data Protection Officer,PT Bank Mandiri (Persero) Tbk.,,31.25
2,Senior Database Administrator,Syscon Justice System,,31.25
3,DBA Junior Level,Krakatau Information Technology,,30.71
4,Database Administrator,PT. PRIMA VISTA SOLUSI,,30.62
5,Oracle Database Administrator,NTT INDONESIA TECHNOLOGY,,30.17
6,Business Intelligence (Tableau),NTT INDONESIA TECHNOLOGY,,28.49
7,Oracle Database Administrator,PT. Berca Hardayaperkasa,,28.18
8,CISO16 - Data Encryption & Key Management Officer,PT Bank Mandiri (Persero) Tbk.,,28.17
9,DATABASE ADMINISTRATOR – Entertainment Platfor...,Parrish & Co.,,27.54


# Comparing between Minkowski Metric vs Cosine Similarity

In [19]:
data = [result_minkowski[['title','company']], result_cosine[['title','company']]]
headers = ['Minkowski','Cosine Similarity']
df_compare = pd.concat(data, axis = 1, keys = headers)
df_compare

Unnamed: 0_level_0,Minkowski,Minkowski,Cosine Similarity,Cosine Similarity
Unnamed: 0_level_1,title,company,title,company
0,Oracle DBA Consultant,NTT Ltd,IT Database Administrator,PT Anabatic Digital Raya
1,DATABASE ADMINISTRATOR – Entertainment Platfor...,Parrish & Co.,CISO12 - Data Protection Officer,PT Bank Mandiri (Persero) Tbk.
2,Database Administrator,Mekari (PT. Mid Solusi Nusantara),Senior Database Administrator,Syscon Justice System
3,Database Administrator Officer - Finance Services,RGF HR Agent,DBA Junior Level,Krakatau Information Technology
4,Database Administrator (for AMAAN Project),DKatalis,Database Administrator,PT. PRIMA VISTA SOLUSI
5,Database Administrator Officer,Reeracoen Indonesia,Oracle Database Administrator,NTT INDONESIA TECHNOLOGY
6,Oracle Database Administrator,PT. Berca Hardayaperkasa,Business Intelligence (Tableau),NTT INDONESIA TECHNOLOGY
7,Business Intelligence (Junior/Senior/Lead),Cermati.com,Oracle Database Administrator,PT. Berca Hardayaperkasa
8,"Data Engineer, Commercial & Operation Team",ZALORA SOUTH EAST ASIA PTE LTD,CISO16 - Data Encryption & Key Management Officer,PT Bank Mandiri (Persero) Tbk.
9,System Administrator,Walden Global Services - WGS,DATABASE ADMINISTRATOR – Entertainment Platfor...,Parrish & Co.


In [20]:
job_compare = pd.merge(df_compare['Minkowski'], df_compare['Cosine Similarity'], how = 'inner')
job_compare

Unnamed: 0,title,company
0,DATABASE ADMINISTRATOR – Entertainment Platfor...,Parrish & Co.
1,Oracle Database Administrator,PT. Berca Hardayaperkasa


In [21]:
similarityPercentage = (job_compare.shape[0]/df_compare.shape[0])*100
print("Similarity between Minkowski Model and Cosine Similarity Model is {} %".format(similarityPercentage))

Similarity between Minkowski Model and Cosine Similarity Model is 20.0 %
