## KY Dots Coding Challenge

### Part A : Import Required Libraries and Write User-Defined Functions

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from fuzzywuzzy import fuzz, process

In [3]:
def tokenize(text):
    '''There are many programming languages which end with a symbol. 
       Something like C++ or C#. 
       Our tokenizer function should be able to handle any of these'''
    words = re.findall(r'[^,]+', text)
    return(words)

In [4]:
def cosine_similarity(a, b):
    '''Finding the cosine similarity between matrix a and b'''
    similarity = pd.DataFrame(np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b)))
    return(similarity)

### Part B : Import Required Datasets

In [5]:
jobs = pd.read_excel('../data/raw/dataset_sample.xls', sheetname = 'Job_Pool', na_values = 'No Limit')
candidates = pd.read_excel('../data/raw/dataset_sample.xls', sheetname = 'CV_Pool', na_values = 'No Limit')
mapping = pd.read_excel('../data/raw/dataset_sample.xls', sheetname = 'Job_CV_Mapping', na_values = 'No Limit')

Lets see how jobs looks like : 

In [6]:
jobs

Unnamed: 0,Job_Id,Designation,Required Skills,Min Exp (in months),Max Exp (in months)
0,1,SDE Testing,"Java, J2EE, Sql, Javascript",12.0,36.0
1,2,SDE II Testing,"J2EE, Multithreading, Javascript, Regression T...",24.0,
2,3,AVP,"Team Leading, MVC, Design, Feasability Analysi...",60.0,
3,4,SDE,"C#, Sql Server, Javascript",0.0,
4,5,SDE II,"C#, Asp.Net, WPF, Framework 4",36.0,
5,6,Team Lead,"ASP.NET, Team Leading, Blueprint Creation, Res...",84.0,
6,7,Program Manager,"Presentation Skills, Leadership qualities, Tea...",120.0,
7,8,CTO,"Leadership Qualities, Entrepreneurial Spirit",,
8,9,Data Scientist,"Machine Learning, Java, Python",0.0,
9,10,Lead Data Scientist,"Machine Learning, Deep Learning, NLP, RNN, LST...",36.0,


Lets see how candidates loooks like : 

In [7]:
candidates

Unnamed: 0,CV_Id,Skills,Designation,Industry,Experience (in months),Institute,Degree
0,1,"Java, J2EE, Multithreading, Sql, Javascript, T...",SDE 2,Testing,36,FGH,B. Tech
1,2,"Java, J2EE, Multithreading, Sql, Regression Te...",SDE,Testing,10,NIT,MCA
2,3,"Machine Learning, Java, Weka",Team Lead,Engineering,65,XYZ,B. Tech
3,4,"Machine Learning, NLP, ML, Stanford NLP, CNN, ...",AVP,Engineering,55,ABC,B. Tech
4,5,"Machine Learning, AI, NLP, Java, Javascript, T...",Co-founder and CTO,Engineering,20,IIT-B,B. Tech
5,6,"Machine Learning, NLP, ML, Stanford NLP, Pytho...",SDE 2,Engineering,28,IIT-D,B. Tech
6,7,"Java, J2EE, Multithreading, Sql, Javascript",Software Developer,Testing,24,DEF,MCA
7,8,"Sql, C#, Javascript, MVC",SDE,Engineering,36,XYZ,B. Tech
8,9,"Sql, C#, Javascript",SDE,Engineering,28,IIT-D,B. Tech
9,10,"Sql, C#, Javascript, WPF",SDE,Engineering,50,NIT-K,B. Tech


Lets see how the result should look like : 

In [8]:
mapping

Unnamed: 0,Job_Id,Relevent_CV_Ids
0,1,"7, 2"
1,2,"1, 7"
2,3,"4, 3"
3,4,"9, 8"
4,5,"10, 9"
5,6,11
6,7,"14, 12"
7,8,"5, 4"
8,9,"15, 3, 4, 6, 5"
9,10,"15, 3, 4, 6"


### Part C : Data Modification

Let us make the necessary data modifications and imputations for our job recommender engine.

#### Subpart 1 : Fill missing values with zero

Here there are only two columns which have missing values, which are the experience fields. Hence lets directly fill them with 0.

In [9]:
jobs = jobs.fillna(0)

Let us see what jobs now looks like : 

In [10]:
jobs

Unnamed: 0,Job_Id,Designation,Required Skills,Min Exp (in months),Max Exp (in months)
0,1,SDE Testing,"Java, J2EE, Sql, Javascript",12.0,36.0
1,2,SDE II Testing,"J2EE, Multithreading, Javascript, Regression T...",24.0,0.0
2,3,AVP,"Team Leading, MVC, Design, Feasability Analysi...",60.0,0.0
3,4,SDE,"C#, Sql Server, Javascript",0.0,0.0
4,5,SDE II,"C#, Asp.Net, WPF, Framework 4",36.0,0.0
5,6,Team Lead,"ASP.NET, Team Leading, Blueprint Creation, Res...",84.0,0.0
6,7,Program Manager,"Presentation Skills, Leadership qualities, Tea...",120.0,0.0
7,8,CTO,"Leadership Qualities, Entrepreneurial Spirit",0.0,0.0
8,9,Data Scientist,"Machine Learning, Java, Python",0.0,0.0
9,10,Lead Data Scientist,"Machine Learning, Deep Learning, NLP, RNN, LST...",36.0,0.0


#### Subpart 2 : Modifying the dataframes to lower case for better string matching

In [11]:
jobs_categorical_cols = list(jobs.select_dtypes(exclude = ['int64', 'float64']).columns)
candidates_categorical_cols = list(candidates.select_dtypes(exclude = ['int64', 'float64']).columns)

In [12]:
jobs[jobs_categorical_cols] = jobs[jobs_categorical_cols].apply(lambda x : x.str.lower())
candidates[candidates_categorical_cols] = candidates[candidates_categorical_cols].apply(lambda x : x.str.lower())

Let us now see what these dataframes look like : 

In [13]:
jobs[jobs_categorical_cols] = jobs[jobs_categorical_cols].apply(lambda x : x.str.replace(' ','')) 
candidates[candidates_categorical_cols] = candidates[candidates_categorical_cols].apply(lambda x : x.str.replace(' ',''))

In [14]:
jobs

Unnamed: 0,Job_Id,Designation,Required Skills,Min Exp (in months),Max Exp (in months)
0,1,sdetesting,"java,j2ee,sql,javascript",12.0,36.0
1,2,sdeiitesting,"j2ee,multithreading,javascript,regressiontesting",24.0,0.0
2,3,avp,"teamleading,mvc,design,feasabilityanalysis,jav...",60.0,0.0
3,4,sde,"c#,sqlserver,javascript",0.0,0.0
4,5,sdeii,"c#,asp.net,wpf,framework4",36.0,0.0
5,6,teamlead,"asp.net,teamleading,blueprintcreation,resource...",84.0,0.0
6,7,programmanager,"presentationskills,leadershipqualities,teamlea...",120.0,0.0
7,8,cto,"leadershipqualities,entrepreneurialspirit",0.0,0.0
8,9,datascientist,"machinelearning,java,python",0.0,0.0
9,10,leaddatascientist,"machinelearning,deeplearning,nlp,rnn,lstm,python",36.0,0.0


In [15]:
candidates

Unnamed: 0,CV_Id,Skills,Designation,Industry,Experience (in months),Institute,Degree
0,1,"java,j2ee,multithreading,sql,javascript,testca...",sde2,testing,36,fgh,b.tech
1,2,"java,j2ee,multithreading,sql,regressiontesting",sde,testing,10,nit,mca
2,3,"machinelearning,java,weka",teamlead,engineering,65,xyz,b.tech
3,4,"machinelearning,nlp,ml,stanfordnlp,cnn,rnn,lst...",avp,engineering,55,abc,b.tech
4,5,"machinelearning,ai,nlp,java,javascript,teamlea...",co-founderandcto,engineering,20,iit-b,b.tech
5,6,"machinelearning,nlp,ml,stanfordnlp,python,tens...",sde2,engineering,28,iit-d,b.tech
6,7,"java,j2ee,multithreading,sql,javascript",softwaredeveloper,testing,24,def,mca
7,8,"sql,c#,javascript,mvc",sde,engineering,36,xyz,b.tech
8,9,"sql,c#,javascript",sde,engineering,28,iit-d,b.tech
9,10,"sql,c#,javascript,wpf",sde,engineering,50,nit-k,b.tech


#### Subpart 3 : Making a vocabulary of skills

In [16]:
skills_vocab = jobs['Required Skills'].tolist()
skills_vocab = ','.join(skills_vocab)
skills_vocab = skills_vocab.split(',')
skills_vocab = [i.strip() for i in skills_vocab]
skills_vocab = [j.replace(' ','') for j in skills_vocab]
skills_vocab = list(set(skills_vocab))

Let us see what this vocabulary looks like : 

In [17]:
skills_vocab

['leadershipqualities',
 'c#',
 'wpf',
 'sqlserver',
 'opennlp',
 'stanfordnlp',
 'entrepreneurialspirit',
 'cnn',
 'javascript',
 'blueprintcreation',
 'design',
 'presentationskills',
 'resourceplanning',
 'rnn',
 'feasabilityanalysis',
 'mvc',
 'regressiontesting',
 'multithreading',
 'sql',
 'java',
 'lstm',
 'machinelearning',
 'python',
 'j2ee',
 'nlp',
 'asp.net',
 'teamleading',
 'deeplearning',
 'ai',
 'framework4']

Now that we have got our vocabulary, let us use our tokenizer : 

In [18]:
countvec = CountVectorizer(vocabulary = skills_vocab, tokenizer = tokenize)

#### Subpart 4 : Making the user-item and item-item matrix

Let us now make our job and candidate matrix according to the skills vocabulary : 

In [19]:
jobs_matrix = pd.DataFrame(countvec.fit_transform(jobs['Required Skills']).toarray(), index = jobs['Job_Id'], columns = countvec.get_feature_names()).reset_index()
candidates_matrix = pd.DataFrame(countvec.fit_transform(candidates['Skills']).toarray(), index = candidates['CV_Id'], columns = countvec.get_feature_names()).reset_index()

Merge this with the main dataframes : 

In [20]:
item_item_matrix = pd.merge(jobs, jobs_matrix, on = 'Job_Id')
user_item_matrix = pd.merge(candidates, candidates_matrix, on = 'CV_Id')

Let us see how the new matrices look like : 

In [21]:
item_item_matrix

Unnamed: 0,Job_Id,Designation,Required Skills,Min Exp (in months),Max Exp (in months),leadershipqualities,c#,wpf,sqlserver,opennlp,...,lstm,machinelearning,python,j2ee,nlp,asp.net,teamleading,deeplearning,ai,framework4
0,1,sdetesting,"java,j2ee,sql,javascript",12.0,36.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,sdeiitesting,"j2ee,multithreading,javascript,regressiontesting",24.0,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,avp,"teamleading,mvc,design,feasabilityanalysis,jav...",60.0,0.0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,4,sde,"c#,sqlserver,javascript",0.0,0.0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,sdeii,"c#,asp.net,wpf,framework4",36.0,0.0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,1
5,6,teamlead,"asp.net,teamleading,blueprintcreation,resource...",84.0,0.0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
6,7,programmanager,"presentationskills,leadershipqualities,teamlea...",120.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,cto,"leadershipqualities,entrepreneurialspirit",0.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,datascientist,"machinelearning,java,python",0.0,0.0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
9,10,leaddatascientist,"machinelearning,deeplearning,nlp,rnn,lstm,python",36.0,0.0,0,0,0,0,0,...,1,1,1,0,1,0,0,1,0,0


In [22]:
user_item_matrix

Unnamed: 0,CV_Id,Skills,Designation,Industry,Experience (in months),Institute,Degree,leadershipqualities,c#,wpf,...,lstm,machinelearning,python,j2ee,nlp,asp.net,teamleading,deeplearning,ai,framework4
0,1,"java,j2ee,multithreading,sql,javascript,testca...",sde2,testing,36,fgh,b.tech,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,"java,j2ee,multithreading,sql,regressiontesting",sde,testing,10,nit,mca,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,"machinelearning,java,weka",teamlead,engineering,65,xyz,b.tech,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,4,"machinelearning,nlp,ml,stanfordnlp,cnn,rnn,lst...",avp,engineering,55,abc,b.tech,0,0,0,...,1,1,1,0,1,0,0,0,0,0
4,5,"machinelearning,ai,nlp,java,javascript,teamlea...",co-founderandcto,engineering,20,iit-b,b.tech,0,0,0,...,0,1,0,0,1,0,1,0,1,0
5,6,"machinelearning,nlp,ml,stanfordnlp,python,tens...",sde2,engineering,28,iit-d,b.tech,0,0,0,...,0,1,1,0,1,0,0,0,0,0
6,7,"java,j2ee,multithreading,sql,javascript",softwaredeveloper,testing,24,def,mca,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,8,"sql,c#,javascript,mvc",sde,engineering,36,xyz,b.tech,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,9,"sql,c#,javascript",sde,engineering,28,iit-d,b.tech,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,10,"sql,c#,javascript,wpf",sde,engineering,50,nit-k,b.tech,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [23]:
jobs_matrix

Unnamed: 0,Job_Id,leadershipqualities,c#,wpf,sqlserver,opennlp,stanfordnlp,entrepreneurialspirit,cnn,javascript,...,lstm,machinelearning,python,j2ee,nlp,asp.net,teamleading,deeplearning,ai,framework4
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,4,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
5,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
6,7,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
9,10,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,0,1,0,0


In [24]:
candidates_matrix

Unnamed: 0,CV_Id,leadershipqualities,c#,wpf,sqlserver,opennlp,stanfordnlp,entrepreneurialspirit,cnn,javascript,...,lstm,machinelearning,python,j2ee,nlp,asp.net,teamleading,deeplearning,ai,framework4
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,1,0,1,0,...,1,1,1,0,1,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,1,0,1,0
5,6,0,0,0,0,0,1,0,0,0,...,0,1,1,0,1,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
7,8,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,10,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Subpart 5 : Making a feature from Designation and Industry

We can add more features like designation. For this reason we shall be concatenating designation and industry from the candidate frame and do a fuzzy string match with designation from jobs dataframe.

In [25]:
master_rows = jobs['Designation'].tolist()
master_columns = candidates['Designation'] + candidates['Industry'].tolist()
string_dist = np.zeros((len(master_rows), len(master_columns)))

In [26]:
for i in range(0, len(master_rows)):
    a = master_rows[i]
    for j in range(0, len(master_columns)):
        b = master_columns[j]
        string_dist[i,j] = fuzz.ratio(a,b)

In [27]:
result = pd.DataFrame(string_dist)  
result.index = master_rows
result.columns = master_columns
result

Designation,sde2testing,sdetesting,teamleadengineering,avpengineering,co-founderandctoengineering,sde2engineering,softwaredevelopertesting,sdeengineering,sdeengineering.1,sdeengineering.2,teamleadengineering.1,managermanagement,asst.managermanagement,asst.managermanagement.1,datascientistengineering,softwaredeveloperengineering
sdetesting,95.0,100.0,41.0,42.0,38.0,56.0,59.0,58.0,58.0,58.0,41.0,22.0,31.0,31.0,47.0,37.0
sdeiitesting,87.0,91.0,45.0,46.0,36.0,59.0,56.0,62.0,62.0,62.0,45.0,21.0,29.0,29.0,44.0,40.0
avp,0.0,0.0,9.0,35.0,7.0,0.0,22.0,0.0,0.0,0.0,9.0,10.0,8.0,8.0,7.0,19.0
sde,43.0,46.0,18.0,12.0,13.0,33.0,22.0,35.0,35.0,35.0,18.0,10.0,16.0,16.0,15.0,19.0
sdeii,50.0,53.0,33.0,32.0,25.0,50.0,28.0,53.0,53.0,53.0,33.0,9.0,15.0,15.0,28.0,30.0
teamlead,21.0,22.0,59.0,18.0,17.0,17.0,25.0,18.0,18.0,18.0,59.0,32.0,33.0,33.0,19.0,22.0
programmanager,16.0,17.0,42.0,36.0,34.0,28.0,26.0,29.0,29.0,29.0,42.0,52.0,44.0,44.0,32.0,33.0
cto,14.0,15.0,9.0,0.0,20.0,0.0,15.0,0.0,0.0,0.0,9.0,10.0,8.0,8.0,15.0,13.0
datascientist,42.0,43.0,31.0,30.0,30.0,29.0,32.0,30.0,30.0,30.0,31.0,33.0,34.0,34.0,70.0,24.0
leaddatascientist,36.0,37.0,39.0,26.0,32.0,25.0,29.0,26.0,26.0,26.0,39.0,35.0,31.0,31.0,63.0,22.0


Now that we have the fuzzy score for each term, lets binarize the max score : 

In [28]:
result = result.apply(lambda x : x == x.max(), axis = 'columns').astype(int)

In [29]:
result

Designation,sde2testing,sdetesting,teamleadengineering,avpengineering,co-founderandctoengineering,sde2engineering,softwaredevelopertesting,sdeengineering,sdeengineering.1,sdeengineering.2,teamleadengineering.1,managermanagement,asst.managermanagement,asst.managermanagement.1,datascientistengineering,softwaredeveloperengineering
sdetesting,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
sdeiitesting,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
avp,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
sde,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
sdeii,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0
teamlead,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
programmanager,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
cto,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
datascientist,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
leaddatascientist,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


Please note that to be less restrictive, we can use the option of binarizing the top two terms rather than just the topmost scoring terms. To remove confusion, lets rename the columns to the respective cv ids so we know which keywords we will be mapping to which cvs.

In [30]:
result.columns = candidates['CV_Id']
result = result.T
result = result.reset_index()
result

Unnamed: 0,CV_Id,sdetesting,sdeiitesting,avp,sde,sdeii,teamlead,programmanager,cto,datascientist,leaddatascientist,principaldatascientist,sdeii.1
0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,1,1,0,0,0,0,0,0,1
2,3,0,0,0,0,0,1,0,0,0,0,0,0
3,4,0,0,1,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,1,0,0,0,0
5,6,0,0,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,0,0,0,0
7,8,0,0,0,0,1,0,0,0,0,0,0,1
8,9,0,0,0,0,1,0,0,0,0,0,0,1
9,10,0,0,0,0,1,0,0,0,0,0,0,1


Now lets merge this back with the candidate matrix :

In [31]:
candidates_matrix = pd.merge(candidates_matrix, result, on = 'CV_Id')
candidates_matrix

Unnamed: 0,CV_Id,leadershipqualities,c#,wpf,sqlserver,opennlp,stanfordnlp,entrepreneurialspirit,cnn,javascript,...,avp,sde,sdeii,teamlead,programmanager,cto,datascientist,leaddatascientist,principaldatascientist,sdeii.1
0,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5,6,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,8,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
8,9,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
9,10,0,1,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1


Likewise lets modify the job matrix with the keywords we derived.

In [32]:
jobs_vocab = list(set(list(jobs['Designation'])))

In [33]:
countvec = CountVectorizer(vocabulary = jobs_vocab)

In [34]:
result = pd.DataFrame(countvec.fit_transform(jobs['Designation']).toarray(), index = jobs['Job_Id'], columns = countvec.get_feature_names()).reset_index()

In [35]:
jobs_matrix = pd.merge(jobs_matrix, result, on = 'Job_Id')
jobs_matrix = jobs_matrix.reset_index()

### Part D : Cosine Similarity and Ranking

#### Subpart 1 : Finding the cosine similarity

In [38]:
similarity = cosine_similarity(jobs_matrix.drop(['Job_Id'], axis = 1), candidates_matrix.drop(['CV_Id'], axis = 1).T)
similarity.columns = ['CV_Id' + string for string in list(map(str,list(range(1,(len(candidates['CV_Id'])+1)))))]
similarity.index = ['Job_Id' + string for string in list(map(str,list(range(1,(len(jobs['Job_Id'])+1)))))]
similarity

Unnamed: 0,CV_Id1,CV_Id2,CV_Id3,CV_Id4,CV_Id5,CV_Id6,CV_Id7,CV_Id8,CV_Id9,CV_Id10,CV_Id11,CV_Id12,CV_Id13,CV_Id14,CV_Id15,CV_Id16
Job_Id1,0.004535,0.004535,0.004535,0.00907,0.00907,0.004535,0.004535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00907,0.004535
Job_Id2,0.00907,0.00907,0.0,0.004535,0.004535,0.004535,0.00907,0.004535,0.004535,0.004535,0.004535,0.004535,0.004535,0.004535,0.004535,0.00907
Job_Id3,0.0,0.004535,0.0,0.00907,0.0,0.004535,0.0,0.004535,0.0,0.0,0.004535,0.018141,0.013605,0.013605,0.0,0.0
Job_Id4,0.0,0.004535,0.0,0.0,0.0,0.0,0.0,0.004535,0.004535,0.00907,0.004535,0.013605,0.013605,0.013605,0.0,0.0
Job_Id5,0.0,0.004535,0.0,0.004535,0.004535,0.0,0.0,0.0,0.0,0.004535,0.004535,0.022676,0.018141,0.018141,0.0,0.0
Job_Id6,0.0,0.0,0.004535,0.004535,0.004535,0.0,0.0,0.0,0.0,0.0,0.004535,0.027211,0.022676,0.022676,0.0,0.0
Job_Id7,0.0,0.0,0.0,0.004535,0.004535,0.0,0.0,0.004535,0.004535,0.004535,0.004535,0.031746,0.027211,0.031746,0.0,0.0
Job_Id8,0.0,0.0,0.0,0.004535,0.0,0.0,0.0,0.004535,0.004535,0.004535,0.004535,0.031746,0.031746,0.031746,0.004535,0.0
Job_Id9,0.004535,0.00907,0.0,0.00907,0.0,0.004535,0.004535,0.004535,0.004535,0.004535,0.0,0.036281,0.036281,0.036281,0.0,0.004535
Job_Id10,0.004535,0.00907,0.004535,0.00907,0.00907,0.00907,0.004535,0.0,0.0,0.0,0.0,0.040816,0.040816,0.040816,0.004535,0.004535


#### Subpart 2 : Ranking candidates according to job requiredments

In [39]:
ranking = pd.DataFrame(similarity.columns[np.argsort(-similarity.values, axis = 1)[:,:len(candidates['CV_Id'])]], index=similarity.index)
ranking = ranking.rename(columns = lambda x: 'Top_{}'.format(x + 1))

In [40]:
ranking

Unnamed: 0,Top_1,Top_2,Top_3,Top_4,Top_5,Top_6,Top_7,Top_8,Top_9,Top_10,Top_11,Top_12,Top_13,Top_14,Top_15,Top_16
Job_Id1,CV_Id4,CV_Id5,CV_Id15,CV_Id1,CV_Id2,CV_Id3,CV_Id6,CV_Id7,CV_Id16,CV_Id8,CV_Id9,CV_Id10,CV_Id11,CV_Id12,CV_Id13,CV_Id14
Job_Id2,CV_Id1,CV_Id2,CV_Id7,CV_Id16,CV_Id4,CV_Id5,CV_Id6,CV_Id8,CV_Id9,CV_Id10,CV_Id11,CV_Id12,CV_Id13,CV_Id14,CV_Id15,CV_Id3
Job_Id3,CV_Id12,CV_Id13,CV_Id14,CV_Id4,CV_Id2,CV_Id6,CV_Id8,CV_Id11,CV_Id1,CV_Id3,CV_Id5,CV_Id7,CV_Id9,CV_Id10,CV_Id15,CV_Id16
Job_Id4,CV_Id12,CV_Id13,CV_Id14,CV_Id10,CV_Id2,CV_Id8,CV_Id9,CV_Id11,CV_Id1,CV_Id3,CV_Id4,CV_Id5,CV_Id6,CV_Id7,CV_Id15,CV_Id16
Job_Id5,CV_Id12,CV_Id13,CV_Id14,CV_Id2,CV_Id4,CV_Id5,CV_Id10,CV_Id11,CV_Id1,CV_Id3,CV_Id6,CV_Id7,CV_Id8,CV_Id9,CV_Id15,CV_Id16
Job_Id6,CV_Id12,CV_Id13,CV_Id14,CV_Id3,CV_Id4,CV_Id5,CV_Id11,CV_Id1,CV_Id2,CV_Id6,CV_Id7,CV_Id8,CV_Id9,CV_Id10,CV_Id15,CV_Id16
Job_Id7,CV_Id12,CV_Id14,CV_Id13,CV_Id4,CV_Id5,CV_Id8,CV_Id9,CV_Id10,CV_Id11,CV_Id1,CV_Id2,CV_Id3,CV_Id6,CV_Id7,CV_Id15,CV_Id16
Job_Id8,CV_Id12,CV_Id13,CV_Id14,CV_Id4,CV_Id8,CV_Id9,CV_Id10,CV_Id11,CV_Id15,CV_Id1,CV_Id2,CV_Id3,CV_Id5,CV_Id6,CV_Id7,CV_Id16
Job_Id9,CV_Id12,CV_Id13,CV_Id14,CV_Id2,CV_Id4,CV_Id1,CV_Id6,CV_Id7,CV_Id8,CV_Id9,CV_Id10,CV_Id16,CV_Id3,CV_Id5,CV_Id11,CV_Id15
Job_Id10,CV_Id12,CV_Id13,CV_Id14,CV_Id2,CV_Id4,CV_Id5,CV_Id6,CV_Id1,CV_Id3,CV_Id7,CV_Id15,CV_Id16,CV_Id8,CV_Id9,CV_Id10,CV_Id11
