In [None]:
def make_skills_pair(esco, onet):
    """Return every (ESCO skill, O*NET skill) combination as a list of tuples.

    The result is the Cartesian product of the two inputs: for each item of
    ``esco`` (outer loop, in order) it is paired with each item of ``onet``
    (inner loop, in order).

    esco = iterable of skills belonging to the ESCO occupation
    onet = iterable of skills belonging to the O*NET occupation
    """
    combos = []
    for esco_skill in esco:
        for onet_skill in onet:
            combos.append((esco_skill, onet_skill))
    return combos

# Example : Occupation A (ESCO)  has skills [1,2,3]
#           Occupation B (O*NET) has skills [1,4,5]
#           pairs_list = [(1,1),(1,4),(1,5),(2,1),(2,4),(2,5),(3,1),(3,4),(3,5)]

In [None]:
def topfives(matrix, threshold=0.6, top_n=5):
    """
    for each row in the given matrix, identify the `top_n` highest values it
    contains (5 by default), keep those that are strictly above `threshold`,
    and collect their column indices.

    return dictionary with row indices as keys and the matching column indices
    as values, ordered from highest to lowest value (i.e. {1: [2, 5, 8, 11]}).
    Equal values are ordered by ascending column index. A row with no value
    above the threshold maps to an empty list.

    The input matrix is NOT modified.

    matrix = 2-D array-like of numeric scores (rows are indexed with matrix[i])
    threshold = minimum score (exclusive) a value must exceed to be kept
    top_n = maximum number of column indices returned per row
    """
    limit = float(threshold)
    selected = {}
    for i in range(len(matrix)):
        # Work on a float view/copy of the row so the caller's data is never
        # overwritten while ranking.
        row = np.asarray(matrix[i], dtype=float)
        # A stable sort on the negated row yields descending order while
        # keeping the lowest column index first among equal values.
        ranked = np.argsort(-row, kind="stable")[:top_n]
        selected[i] = [int(c) for c in ranked if row[c] > limit]
    return selected


def index2label(top5s, esco_labels, onet_labels):
    """
    translate an index-based mapping into a label-based mapping

    top5s = dictionary of {row index: [column index, ...]}
    esco_labels = sequence looked up with each row index (dictionary key)
    onet_labels = sequence looked up with each column index (cast to int)

    return dictionary with esco labels as keys and the list of corresponding
    onet labels as values
    """
    return {
        esco_labels[row_idx]: [onet_labels[int(col_idx)] for col_idx in col_indices]
        for row_idx, col_indices in top5s.items()
    }

In [None]:
def max_sim_with_skills(esco, onet, d_weights, l_weights, s_weights, al_weights):
    """
    compare every esco occupation to every onet occupation and return a
    matrix of combined scores with one row per esco row and one column per
    onet row (both addressed by position, not by DataFrame index label).

    For each pair four component scores are computed:
      * description score  - get_sts_score on the 'description_embeddings'
      * label score        - get_sts_score on the 'label_embeddings'
      * alt. label score   - get_sts_score on the 'alt_label_embeddings'
      * skills score       - see below
    and combined as
      (d_score*d_weights + l_score*l_weights
       + sskill_score*s_weights + l2_score*al_weights) / 10
    NOTE(review): the fixed divisor suggests the four weights are expected to
    sum to 10 - confirm with the caller.

    Skills score: the skills pairs set is prepared with make_skills_pair, and
    the number of distinct pairs that are also present in the look-up table
    is divided by the number of distinct esco skills.
    Example: Occupation A (ESCO)  has skills [1,2]
             Occupation B (O*NET) has skills [1,4]
             pairs_list  = [(1,1),(1,4),(2,1),(2,4)]
             all_skills_pairs (look-up table) = [(1,1),(2,3)....]
             since (1,1) is the only pair which exists in all_skills_pairs,
             the number of matching pairs = 1. Number of skills in A = 2
             Score = 1/2
    An esco occupation without any skills gets a skills score of 0.0.

    esco = dataframe with label embeddings, description embeddings, alternate
           label embeddings and skills list ('required_skills') of esco
    onet = dataframe with the same columns for onet
    d_weights = weights for description
    l_weights = weights for label
    s_weights = weights for skills
    al_weights = weights for alternate labels

    The look-up table is NOT a parameter: it is read from the module-level
    name `all_skills_pairs`, which must be defined before calling.
    """
    matches = np.zeros((esco.shape[0], onet.shape[0]))

    # Loop invariants: build the look-up set once and materialise the onet
    # rows once instead of re-running iterrows() for every esco row.
    known_pairs = set(all_skills_pairs)
    onet_rows = [row for _, row in onet.iterrows()]

    # enumerate() gives positional indices, so the result is filled correctly
    # even when the DataFrames do not carry a default 0..n-1 index.
    for i, (_, esco_row) in enumerate(tqdm(esco.iterrows())):
        esco_skills = esco_row['required_skills']
        n_esco_skills = len(set(esco_skills))

        for j, onet_row in enumerate(onet_rows):
            d_score = get_sts_score(esco_row['description_embeddings'], onet_row['description_embeddings'])
            l_score = get_sts_score(esco_row['label_embeddings'], onet_row['label_embeddings'])
            l2_score = get_sts_score(esco_row['alt_label_embeddings'], onet_row['alt_label_embeddings'])

            candidate_pairs = make_skills_pair(esco_skills, onet_row['required_skills'])
            if n_esco_skills:
                sskill_score = len(known_pairs.intersection(candidate_pairs)) / n_esco_skills
            else:
                # No esco skills to match: avoid dividing by zero.
                sskill_score = 0.0

            matches[i][j] = (
                d_score * d_weights
                + l_score * l_weights
                + sskill_score * s_weights
                + l2_score * al_weights
            ) / 10
    return matches