In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# path = "/content/drive/MyDrive/CSCI544/project/"

In [None]:
# Install pinned versions of the Hugging Face transformers and datasets libraries.
!pip install -q "transformers==4.9.2"
!pip install -q "datasets==1.11.0"

In [4]:
from ast import literal_eval
import json

In [5]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [6]:
# Checkpoint used for extractive question answering.
_QA_MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'
# Upper bound on the encoded (question + context) length. BERT checkpoints
# ship with 512 position embeddings, so longer inputs must be truncated.
_QA_MAX_LENGTH = 512
# Holds the loaded model/tokenizer so the weights are read once per session
# instead of once per question.
_QA_COMPONENTS = {}


def _load_qa_components():
    """Return the (model, tokenizer) pair, loading both on first use only."""
    if not _QA_COMPONENTS:
        # Load into locals first so a failure cannot leave a half-filled cache.
        model = BertForQuestionAnswering.from_pretrained(_QA_MODEL_NAME)
        tokenizer = BertTokenizer.from_pretrained(_QA_MODEL_NAME)
        _QA_COMPONENTS['model'] = model
        _QA_COMPONENTS['tokenizer'] = tokenizer
    return _QA_COMPONENTS['model'], _QA_COMPONENTS['tokenizer']


def bert_model(question, context):
    """Extract the answer to `question` from `context` with a BERT QA model.

    The question and context are encoded as a sentence pair, the model scores
    every token as a possible answer start and end, and the tokens between the
    highest-scoring start and end are stitched back into text.

    Args:
        question: The question text.
        context: The passage to search. If the encoded pair would exceed
            `_QA_MAX_LENGTH` tokens, only the context is truncated.

    Returns:
        The answer as a string. WordPiece continuation tokens (`##xyz`) are
        glued onto the previous token; every other token is preceded by a
        single space, so a non-empty answer normally starts with a space.
        Returns '' when the predicted end precedes the predicted start.
    """
    model, tokenizer = _load_qa_components()

    encoding = tokenizer.encode_plus(
        text=question,
        text_pair=context,
        truncation='only_second',  # never cut the question, only the context
        max_length=_QA_MAX_LENGTH,
    )
    inputs = encoding['input_ids']                    # token ids
    sentence_embedding = encoding['token_type_ids']   # question/context segment ids
    tokens = tokenizer.convert_ids_to_tokens(inputs)  # readable tokens

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(input_ids=torch.tensor([inputs]),
                       token_type_ids=torch.tensor([sentence_embedding]))

    start_index = int(torch.argmax(output.start_logits))
    end_index = int(torch.argmax(output.end_logits))

    corrected_answer = ''
    for word in tokens[start_index:end_index + 1]:
        if word[0:2] == '##':
            # Sub-word piece: attach to the preceding token without a space.
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word

    return corrected_answer

In [15]:
def extract_edu(cont_file, out_file="output/education.txt"):
    """Extract college, major and degree from each resume's education section.

    Each non-blank line of `cont_file` is parsed with `literal_eval` into a
    sequence of section dicts. The resume id is read from the first section's
    'id' key. For every section whose 'title' contains "education"
    (case-insensitive), the section's 'content' items are joined with spaces
    and `bert_model` is asked for the college, major and degree.

    Args:
        cont_file: Path of the resume-content file to read.
        out_file: Path of the text file that receives one
            "id, college, major, degree, [context]" line per education section.

    Returns:
        dict mapping resume id to the string "college, major, degree". If a
        resume has several education sections, the last one wins.
    """
    ques_college = "what is your college?"
    ques_major = "what is your major?"
    ques_degree = "what is your degree"

    edu_info = {}
    # `with` guarantees the report file is flushed and closed even when
    # parsing or the QA model raises part-way through.
    with open(out_file, 'w+') as edu_out, open(cont_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            resu = list(literal_eval(line))
            if not resu:
                continue  # no sections, so there is no id to report

            resume_id = resu[0]['id']

            for sec in resu:
                if "education" not in sec['title'].lower():
                    continue

                context = ' '.join(sec['content'])
                college = bert_model(ques_college, context)
                major = bert_model(ques_major, context)
                degree = bert_model(ques_degree, context)

                edu_out.write('{}, {}, {}, {}, [{}]\n'.format(resume_id, college, major, degree, context))
                edu_info[resume_id] = '{}, {}, {}'.format(college, major, degree)

    return edu_info

In [21]:
def _describe_experience(exp):
    """Build the sentence(s) for one experience record; '' if it has no dates."""
    jobtitle = exp['jobtitle']
    startsdate = exp['startsdate']
    endsdate = exp['endsdate']

    if endsdate:
        sentence = "As a " + jobtitle + ", I worked from " + startsdate + " to " + endsdate + "."
    elif startsdate:
        sentence = "As a " + jobtitle + ", I worked from " + startsdate + " to now."
    else:
        return ""

    # Use only this record's duties. The duty text must never leak in from a
    # previously processed experience, and may legitimately be absent.
    jobduty = '. '.join(exp['jobduty']) if exp['jobduty'] else ''
    if jobduty:
        if not jobduty.endswith(('.', '!', '?')):
            jobduty += '.'  # terminate so the next experience starts a new sentence
        sentence += " And my major duty is " + jobduty
    return sentence


def extract_expr(expr_file, out_file="output/exp_summary.txt"):
    """Summarise each resume's work experiences as first-person text.

    Each non-blank line of `expr_file` is a JSON object with an 'id' and an
    'experiences' list. Every experience provides 'jobtitle', 'startsdate',
    'endsdate' and 'jobduty' (a list of strings, possibly empty). Experiences
    without any date are counted but not described.

    Args:
        expr_file: Path of the JSON-lines experiences file to read.
        out_file: Path of the text file that receives one "id|summary" line
            per resume.

    Returns:
        dict mapping resume id to its summary string.
    """
    expr_info = {}
    # `with` guarantees the summary file is flushed and closed on any exit path.
    with open(out_file, 'w+') as exp_out, open(expr_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            exps = json.loads(line)
            resume_id = exps['id']
            experiences = exps['experiences']

            sentences = []
            for exp in experiences:
                sentence = _describe_experience(exp)
                if sentence:
                    sentences.append(sentence)

            wkex = "I have " + str(len(experiences)) + " work experiences. " + " ".join(sentences)
            exp_out.write("{}|{}\n".format(resume_id, wkex))
            expr_info[resume_id] = wkex

    return expr_info

In [17]:
def extract_skill(skil_file):
    """Load the skill list of every resume from `skil_file`.

    Each non-blank line has the form "<id>:<python literal>", where the
    literal is an iterable of skills (e.g. a list of strings).

    Args:
        skil_file: Path of the skills file to read.

    Returns:
        dict mapping the id text before the first colon to a list of skills.

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    skil_info = {}
    with open(skil_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            # Split on the FIRST colon only: the skill literal may itself
            # contain colons (e.g. 'sql: advanced').
            resume_id, skil = line.split(":", 1)
            skil_info[resume_id] = list(literal_eval(skil.strip()))

    return skil_info

In [18]:
def _join_skills(skills):
    """Render one to three skills as an English list ("a", "a and b", "a, b, and c")."""
    if len(skills) == 1:
        return skills[0]
    if len(skills) == 2:
        return skills[0] + " and " + skills[1]
    return ", ".join(skills[:-1]) + ", and " + skills[-1]


def generate_intro(edu_info, expr_info, skil_info):
    """Compose a short self-introduction for every resume in `edu_info`.

    Template:
        I got my <degree> in <major> from <college>.
        <experience summary>
        I'm proficient at <skill>, <skill>, and <skill>

    Args:
        edu_info: dict of id -> "college, major, degree" string.
        expr_info: dict of id -> experience summary text. Ids missing here
            simply get no experience sentence.
        skil_info: dict of id -> list of skill strings; at most the first
            three are used. Ids missing here (or with an empty list) get no
            skills sentence.

    Returns:
        dict mapping each id in `edu_info` to its introduction text.

    Raises:
        ValueError: If an education string has fewer than two commas.
    """
    all_intros = {}

    for key, education in edu_info.items():
        # Split from the right: major and degree are the last two fields, so
        # any extra commas (e.g. "university of x, city") stay in the college.
        # Surrounding whitespace is padding from the stored format.
        college, major, degree = (part.strip() for part in education.rsplit(',', 2))

        intro = "I got my " + degree + " in " + major + " from " + college + ". "

        if key in expr_info:
            intro += expr_info[key] + " "

        top_skills = list(skil_info.get(key) or [])[:3]
        if top_skills:
            intro += "I'm proficient at " + _join_skills(top_skills)

        all_intros[key] = intro

    return all_intros

In [28]:
def save_intro(out_intro_file, intros):
    """Write the generated introductions to a UTF-8 text file.

    One line is written per entry, formatted as "<id>, [<introduction>]".

    Args:
        out_intro_file: Path of the file to create or overwrite.
        intros: dict mapping id to introduction text.
    """
    with open(out_intro_file, "w", encoding='utf-8') as f:
        for key, value in intros.items():
            # Close the bracket that wraps the introduction; the previous
            # format string opened it but never closed it.
            f.write('%s, [%s]\n' % (key, value))

In [25]:
if __name__ == "__main__":
    # Input files, read from the local output/ folder. To run against a
    # mounted Drive instead, prepend the project path to each of these.
    cont_file, expr_file, skil_file = (
        "output/resume_content.txt",
        "output/experiences.txt",
        "output/top_skill.txt",
    )
    # Destination of the generated introductions.
    out_intro_file = "output/intros.txt"

    # Gather the three ingredients of an introduction.
    edu_info = extract_edu(cont_file)
    expr_info = extract_expr(expr_file)
    skil_info = extract_skill(skil_file)

    # Combine them per resume id, then persist the result.
    intros = generate_intro(edu_info, expr_info, skil_info)
    save_intro(out_intro_file, intros)