In [77]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q, connections
import pandas as pd
import numpy as np
import requests
from io import StringIO
from requests.auth import HTTPBasicAuth
from python_graphql_client import GraphqlClient
import json

def match_certifications(title):
    '''
    Given a title, match it with certifications using Elastic Search

    The query is a bool/should of two match clauses: one against the
    ``Title`` field (with ``minimum_should_match`` "2<-25%") and one against
    the ``Acronym`` field, so either a full name or an acronym can hit.

    :param title: <str> the certification name or acronym to search for
    :return: <DataFrame> the matched certifications with columns
             id, Title, Description, Escoe, Institution; empty when
             nothing matched
    '''

    # Registers the default connection that the bare Search() below picks up.
    # NOTE(review): this runs on every call; consider creating the connection
    # once in a setup cell if the function is called in a loop.
    connections.create_connection(hosts=['localhost'], timeout=20)

    query = Q('bool', should=[
        Q('match', Title={'query': title, "minimum_should_match": "2<-25%"}),
        Q('match', Acronym={'query': title, '_name': 'implicit', 'boost': 1}),
    ])

    # Alternative ("explicit") query variant kept for reference:
    #query = Q('bool',should=[
    #                Q('match', Title={'query':title, '_name':'explicit', "minimum_should_match": "2<-25%", 'boost':2.5}),
    #                Q('match', Title={'query':title, '_name':'implicit', 'boost':1})])

    response = Search().query(query).execute()

    # One record per hit. `hit.index` is read as a document field here
    # (presumably the certification's own id — confirm against the mapping).
    records = [
        {
            'id': hit.index,
            'Title': hit.Title,
            'Description': hit.Description,
            'Escoe': hit.escoe,
            'Institution': hit.Institution,
        }
        for hit in response
    ]

    # Passing `columns` keeps the frame's schema stable even with zero hits.
    return pd.DataFrame(
        records,
        columns=['id', 'Title', 'Description', 'Escoe', 'Institution'],
    )

# Quick check of the matcher with a single acronym.
# (Leftover fragment from an earlier query string: R&S Connecting Networks)
df = match_certifications(title='CCSPA')
df



Unnamed: 0,id,Title,Description,Escoe,Institution
0,457,Check Point Certified Security Principles Asso...,,2529,"Check Point Software Technologies, Inc."


In [61]:
# Load the Sovren resume responses and flatten them so that each row holds
# one license/certification entry.
CERT_COLUMN = 'sovren_response.Resume.StructuredXMLResume.LicensesAndCertifications.LicenseOrCertification'

# The context manager closes the file handle as soon as the JSON is parsed.
with open('data/sovren_resume_responses.json') as f:
    resumes_raw = json.load(f)

# explode() turns each list of certification entries into one row per entry.
js = pd.json_normalize(resumes_raw).explode(CERT_COLUMN)

# pop() removes the exploded column; apply(pd.Series) spreads each entry's
# keys into columns, which are then placed next to the remaining columns.
cert_fields = js.pop(CERT_COLUMN).apply(pd.Series)
js = pd.concat([js, cert_fields], axis=1)

# Keep the first occurrence of every certification name and drop rows with none.
js = js.drop_duplicates(subset='Name').dropna(subset=['Name'])

# na=False treats missing/non-string descriptions as "no match".
js = js[js['Description'].str.contains('matched to list', na=False)]
js

Unnamed: 0,_id.$oid,Name,Id,Description,EffectiveDate,0
0,5f68a67538ed50f189a0b119,Cisco Certified,1,certification; matched to list,,
1,5f68a95c3bd110f6c6a1af79,MCSA,1,certification; matched to list,,
2,5f6deccc92813fec7c2cd3e6,MCP,1,certification; matched to list,,
2,5f6deccc92813fec7c2cd3e6,PMP,2,certification; matched to list,,
2,5f6deccc92813fec7c2cd3e6,CCNA,3,certification; matched to list,,
...,...,...,...,...,...,...
2959,6253f413ce5da821c875deb2,LPIC1,6,certification; matched to list,,
2992,62602ee1aba28e8fa9e7fc76,CISSP Certification,1,"CISSP Certification\tAlison Technical\tDublin,...",{'FirstIssuedDate': {'Year': '2016'}},
3034,6273a841642b62250588f692,MCPD,1,certification; matched to list,,
3044,6278cfdabd677b1a3547fd1f,TOEFL certificate,1,certification; matched to list,,


In [None]:
# Keep only the rows whose Description contains 'variation'; rows where the
# check yields a missing value compare unequal to True and are dropped.
variation_mask = js['Description'].str.contains('variation').eq(True)
js = js.loc[variation_mask]
js

In [None]:
# Run every certification name through the matcher and record the ones
# that come back with no hits at all.
non_matched = []

for title in js['Name']:
    df = match_certifications(title)
    if df.empty:
        non_matched.append(title)

non_matched

In [75]:
# Run labelled "explicit matching": number of certification names without
# any hit, followed by the names themselves.
unmatched_total = len(non_matched)
print(unmatched_total)
non_matched

163


['MCP',
 'ITIL V3',
 'MCSE',
 'MCITP',
 'ACLs',
 'Microsoft Certified System Engineer',
 'Microsoft Certified System',
 'Certified System Engineer',
 'CISSP',
 'ITIL Foundation certificate',
 'Microsoft Certified Systems',
 'CMP',
 'CCSA',
 'ITIL v3',
 'Association of Chartered Certified Accountants',
 'Chartered Certified Accountants',
 'RHCE',
 'Sun Certified Programmer',
 'Sun Certified',
 'WMS',
 'SCJP',
 'CISA',
 'FIC',
 'CCDA',
 'CCSP',
 'CITP',
 'MCSE Certification',
 'CPR',
 'Chartered Management Institute',
 'Certificate in Training',
 'PRINCE2 Certified',
 'ITIL FOUNDATION CERTIFICATE',
 'C.C.D.A',
 'ISEB Certified',
 'AFPA',
 'Compaq Accredited Systems Engineer',
 'Project Management Certificate',
 'Certificate in Computer',
 'CCDP',
 'Certificate of Secondary Education',
 'ITIL Foundation Certificate',
 'CIEH',
 'Chartered Institute of Environmental Health',
 'Guilds certificate',
 'CCEA',
 'Citrix Certified Administrator',
 'MCDBA',
 'MCTS',
 'Blackberry Certified',
 'CWNA