### Import packages

In [1]:
!pip install sklearn-crfsuite



In [2]:
import sklearn
import sklearn_crfsuite
import nltk
import json
import random
import re
from sklearn_crfsuite import metrics

### Load dataset

In [3]:
def tratamentoStringEntrada(string):

  if isinstance(string, list):
    if len(string) == 0:
      string = ""
    else:
      string = string[0]

  d = '(\.|\!|\?)'
  string = re.split('\s\W+', string)
  text = []
  for frase in string:
    subFrases = re.split(d, frase)
    sub = ""
    for i in range(len(subFrases)):
      sub += subFrases[i]
      if i + 1 < len(subFrases) and len(subFrases[i + 1]) > 0 and subFrases[i + 1][0].isdigit():
        continue
      if subFrases[i] in d:
        #if sub != "":
        text.append(sub)
        sub = ""
    if sub != "":
      text.append(sub)
  text = [re.split('(\W+)', frase) for frase in text]
  retorno = []
  for frase in text:
    while "" in frase:
      frase.remove("")
    while " " in frase:
      frase.remove(" ")
    if len(frase) > 0:
      retorno.append(frase)
  
  return retorno

def loadDataset(path, shuffleStateOrder = False):
  with open(path) as json_file:
    temp = json.load(json_file)
  data = []
  for vagaDic in temp:
    vaga = []
    keys = list(vagaDic.keys())
    if shuffleStateOrder:
      random.shuffle(keys)
    for key in keys:
      if key == 'vagaID':
        continue
      text = tratamentoStringEntrada(vagaDic[key])
      for sentence in text:
        frase = []
        for word in sentence:
          frase.append((key, word))
        vaga.append(frase)
      #vaga.append(frases)
    data.append(vaga)  

  return data

### Define methods

In [4]:
def word2features(sent, i):
    word = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        word1 = sent[i - 1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True

    return features


def frase2features(sent):
  features = []
  for i in range(len(sent)):
    features.append(word2features(sent, i))

  return features


def frase2estados(sent):
  estados = []
  for estado, observacao in sent:
    estados.append(estado)
    
  return estados


def frase2observacao(sent):
  observacoes = []
  for estado, observacao in sent:
    observacoes.append(observacao)
    
  return observacoes

### Extract features

In [9]:
%%time

data = loadDataset('../Dados/vagas.json', True)

Wall time: 29.1 s


In [10]:
%%time

X = []
y = []
for vaga in data:
  for frase in vaga:
    X.append(frase2features(frase))
  for frase in vaga:
    y.append(frase2estados(frase))

trainPerc = 0.7
trainSize = int(len(data) * trainPerc)

X_train = X[:trainSize]
y_train = y[:trainSize]

X_test = X[trainSize:]
y_test = y[trainSize:]

Wall time: 36.9 s


### Train CRF

In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 15.2 s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

### Evaluate results

In [12]:
labels = list(crf.classes_)
print(labels)
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

['local', 'salario', 'descricao', 'empresa', 'beneficios']
              precision    recall  f1-score   support

     salario      0.996     1.000     0.998    133071
  beneficios      0.930     0.941     0.936     39164
   descricao      0.906     0.937     0.921   4344159
     empresa      0.857     0.796     0.826   2022246
       local      0.947     0.817     0.877     46326

    accuracy                          0.894   6584966
   macro avg      0.927     0.898     0.911   6584966
weighted avg      0.893     0.894     0.893   6584966

