In [1]:
import os
import re
import fitz
import glob
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def _load_trait_model(code):
    """Load the Keras model saved as ./model/traits/<code>.h5."""
    return tf.keras.models.load_model('./model/traits/' + code + '.h5')


# One saved model per trait code; the codes are grouped by their first letter
# (C*, L*, M*) and loaded in the same order as the files are listed here.
modelCC, modelCD, modelCI, modelCS = map(_load_trait_model, ('CC', 'CD', 'CI', 'CS'))

modelLC, modelLD, modelLI, modelLS = map(_load_trait_model, ('LC', 'LD', 'LI', 'LS'))

modelMC, modelMD, modelMI, modelMS = map(_load_trait_model, ('MC', 'MD', 'MI', 'MS'))

# spaCy pipeline whose entities are used as the job/skill keywords of a CV.
nlp = spacy.load('./model/model_job')

In [3]:
def extract_text(filePath, remove_line=False):
    """Extract the text of a PDF as one alphanumeric, space-separated string.

    Every page is read with ``page.get_text()`` and cleaned independently:
    newlines become spaces, each run of non-word characters collapses to a
    single space, and any remaining character outside ``[A-Za-z0-9]`` (for
    example an underscore) is replaced by a space. The cleaned pages are then
    concatenated in document order without an extra separator.

    Args:
        filePath: Path of the PDF, passed to ``fitz.open``.
        remove_line: When true, every whitespace character still present in
            the combined text is replaced by a plain space.

    Returns:
        The cleaned text of all pages; an empty string when the document
        yields no pages.
    """
    cleaned_pages = []

    with fitz.open(filePath) as doc:
        for page in doc:
            text = page.get_text().replace("\n", " ")
            # Raw strings keep the regex escapes from being read as (invalid)
            # string escapes.
            text = re.sub(r'\W+', ' ', text)
            text = re.sub(r'[^A-Za-z0-9]', ' ', text)
            cleaned_pages.append(text)

    finaltext = "".join(cleaned_pages)

    if remove_line:
        # Normalise the combined text of every page, not only the last page
        # processed by the loop (which would also be undefined for a
        # document without pages).
        finaltext = re.sub(r'\s', " ", finaltext)

    return finaltext

def extract_job(text, nlp):
    """Collect the distinct, lower-cased entity strings that ``nlp`` finds.

    Args:
        text: Text to analyse.
        nlp: Callable applied to ``text``; its result must expose an iterable
            ``ents`` attribute.

    Returns:
        A set holding ``str(entity).lower()`` for every entity in ``ents``.
    """
    return {str(entity).lower() for entity in nlp(text).ents}

def seq_and_pad(sentences, tokenizer):
    """Turn sentences into token-id sequences and pad them.

    Args:
        sentences: Texts handed to ``tokenizer.texts_to_sequences``.
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences`` called on those sequences with
        ``maxlen=30`` and ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=30,
        padding='post',
    )

In [4]:
# Fit the tokenizer on the 'key' column of the training CSV.
# Only the columns from 'Email' through 'hasil_rekomendasi' are kept.
df = pd.read_csv('sample/train.csv').loc[:, 'Email':'hasil_rekomendasi']
df['key'] = df['key'].astype('str')

# One string per row, in row order.
text_token = df['key'].tolist()

tokenizer = Tokenizer(num_words=1000, oov_token='OOV')
tokenizer.fit_on_texts(text_token)

In [5]:
# Trait column -> model, listed in the order the result columns must appear.
trait_models = {
    'MD': modelMD, 'MI': modelMI, 'MS': modelMS, 'MC': modelMC,
    'LD': modelLD, 'LI': modelLI, 'LS': modelLS, 'LC': modelLC,
    'CD': modelCD, 'CI': modelCI, 'CS': modelCS, 'CC': modelCC,
}

# Column-oriented accumulator: one list per output column.
df = {column: [] for column in ('File', 'Key', *trait_models)}

for file in glob.glob('./sample/cv/*.pdf'):
    df['File'].append(file)

    text = extract_text(file)
    job = extract_job(text, nlp)

    # Space-separated entities with one leading and one trailing space;
    # a lone " " when no entity was found.
    key = " ".join(["", *job, ""])
    df['Key'].append(key)

    padded_trait = np.asarray(seq_and_pad([key], tokenizer))

    # Every model scores the same padded input; the first value of the first
    # prediction row is rounded and stored under the trait's column.
    for trait, trait_model in trait_models.items():
        df[trait].append(round(trait_model.predict(padded_trait)[0][0]))






In [6]:
# Turn the per-column lists into a table (one row per CV file) and display it.
data = pd.DataFrame(data=df)
data

Unnamed: 0,File,Key,MD,MI,MS,MC,LD,LI,LS,LC,CD,CI,CS,CC
0,./sample/cv\1. CV - Gerdy Hasni.pdf,firmware designer electrical engineering main...,7,2,2,6,5,4,4,3,1,-3,-1,3
1,./sample/cv\Alya Ataya_CV - Alya Ataya.pdf,business development intern creative intern a...,7,3,1,6,5,6,5,4,2,-2,-5,2
2,./sample/cv\angga-cv - Angga Prasetya W.pdf,technical support administration front end en...,4,3,3,7,5,7,5,3,-1,-5,-2,4
3,./sample/cv\Curriculum Vitae (2) - Meli Siboro...,head of fundraising department profil researc...,6,2,1,6,3,8,5,3,4,-5,-3,3
4,./sample/cv\Curriculum Vitae Zaky (ver2 Oct 20...,human capital staff database administration p...,6,3,4,8,3,4,10,0,3,-1,-6,8
5,./sample/cv\CV - RAYYHAN ARIF WICAKSONO_2022 -...,hr executive officer publication documentatio...,5,2,2,7,4,7,4,3,2,-3,-3,4
6,./sample/cv\cv argha - rahadian argha.pdf,business development coordinator branch manag...,7,1,1,6,5,7,6,2,3,-5,-3,5
7,./sample/cv\CV Dwi Wijaya 07 - Dwi WIjaya.pdf,technical support customer experience managem...,5,1,2,6,4,6,6,1,2,-4,-2,4
8,./sample/cv\CV Fadhillah Zulhilmi Khibran - fa...,human resources,3,0,0,3,3,4,3,1,0,-1,-1,1
9,./sample/cv\CV Gabriel Karisoh2022 - Gabriel K...,management trainee manager trainer business d...,5,1,2,6,4,5,5,2,2,-3,-3,3


In [7]:
# Persist the predictions; with pandas' defaults the row index is written as
# the first, unnamed column.
data.to_csv(path_or_buf='predict.csv')