In [None]:
import src
import yaml

import sklearn_crfsuite
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [None]:
# Load the experiment configuration. The context manager makes sure the file
# handle is closed as soon as parsing finishes instead of lingering in the kernel.
with open('configs/ml_model.yaml') as config_file:
    configs = yaml.safe_load(config_file)

In [None]:
# Read the three dataset splits (train, validation, test) from the paths named
# in the config, in that order.
train_data, valid_data, test_data = (
    src.utils.read_ner_data(configs[path_key])
    for path_key in ('train_data_path', 'valid_data_path', 'test_data_path')
)

In [None]:
# The second element of each loaded split holds the gold labels.
train_labels = train_data[1]
test_labels = test_data[1]

# The first element holds the lines to featurise; tqdm reports progress while
# src.utils.line_to_features is applied to each one.
train_features = list(map(src.utils.line_to_features, tqdm(train_data[0], dynamic_ncols=True)))
test_features = list(map(src.utils.line_to_features, tqdm(test_data[0], dynamic_ncols=True)))

In [None]:
# Baseline CRF estimator: capped at 100 training iterations, with transition
# features generated for every label pair.
model = sklearn_crfsuite.CRF(all_possible_transitions=True, max_iterations=100)

# Candidate hyperparameter values. 'ap' is listed three times, so uniform
# sampling from the 'algorithm' list would pick it 3/5 of the time.
distributions = dict(
    algorithm=['lbfgs', 'l2sgd', 'ap', 'ap', 'ap'],
    c1=[0.1, 0.15, 0.2],
    c2=[0.05, 0.1, 0.15, 0.2],
)

# Tag inventory the F1 score is computed over (order does not affect the result).
NER_LABELS = [
    'O',
    'B-LOC', 'I-LOC',
    'B-GRP', 'I-GRP',
    'B-PROD', 'I-PROD',
    'B-CORP', 'I-CORP',
    'B-PER', 'I-PER',
    'B-CW', 'I-CW',
]


def scorer(golds, preds, average='macro'):
    """Compute the F1 score of predicted tags against gold tags.

    Both inputs are passed through ``src.utils.flatten`` before scoring
    (presumably turning per-sentence tag lists into one flat list -- confirm
    against ``src.utils.flatten``).

    Args:
        golds: Gold label sequences.
        preds: Predicted label sequences, aligned with ``golds``.
        average: Averaging mode forwarded to ``sklearn.metrics.f1_score``.
            Defaults to ``'macro'``.

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` restricted to
        ``NER_LABELS``.
    """
    flat_golds = src.utils.flatten(golds)
    flat_preds = src.utils.flatten(preds)

    # Forward `average` instead of hard-coding 'macro', so the parameter
    # actually takes effect when a caller overrides it.
    return metrics.f1_score(flat_golds, flat_preds, average=average, labels=NER_LABELS)

In [None]:
# Exhaustive grid search over `distributions`, keeping the best configuration.
#
# Candidates are compared on the *validation* split so the test split stays
# untouched for the final, unbiased evaluation. The validation features are
# built here so this cell does not depend on any other cell having done it.
valid_features = [src.utils.line_to_features(line) for line in tqdm(valid_data[0], dynamic_ncols=True)]
valid_labels = valid_data[1]

# Regularisation parameters each training algorithm accepts, per the
# sklearn-crfsuite CRF documentation (c1: lbfgs only; c2: lbfgs and l2sgd).
# Algorithms not listed here are assumed to accept both.
# NOTE(review): CRFsuite is believed to reject parameters an algorithm does not
# define; filtering them is harmless even if it merely ignores them.
SUPPORTED_REGULARISERS = {
    'lbfgs': ('c1', 'c2'),
    'l2sgd': ('c2',),
    'ap': (),
}

max_score = -1
max_config = {}

# dict.fromkeys drops repeated algorithm names (e.g. the extra 'ap' entries)
# while keeping first-seen order, so no configuration is trained twice.
for algo in dict.fromkeys(distributions['algorithm']):
    supported = SUPPORTED_REGULARISERS.get(algo, ('c1', 'c2'))
    # A parameter the algorithm does not use collapses to a single None value,
    # which sklearn_crfsuite.CRF treats as "not set".
    c1_values = distributions['c1'] if 'c1' in supported else [None]
    c2_values = distributions['c2'] if 'c2' in supported else [None]

    for c1 in c1_values:
        for c2 in c2_values:
            model = sklearn_crfsuite.CRF(
                algorithm=algo,
                c1=c1,
                c2=c2,
                max_iterations=100,
                all_possible_transitions=True
            )

            model.fit(train_features, train_labels)
            preds = model.predict(valid_features)

            score = scorer(valid_labels, preds)
            if score > max_score:
                max_score = score

                # 'score' is the validation score of this configuration;
                # c1 / c2 are None when the algorithm does not use them.
                max_config = {
                    'algorithm': algo,
                    'c1': c1,
                    'c2': c2,
                    'score': score,
                }

                print('Intermediate:', max_config)

print('Final:', max_config)

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` has been loaded.
import scipy.stats

# Fixed seed so the sampled candidates (and therefore the results) are
# reproducible across kernel restarts.
SEARCH_SEED = 42

# Randomised search space: the algorithm is drawn uniformly from the list
# ('ap' therefore 3/5 of the time); c1 and c2 are drawn from exponential
# distributions with the given scales.
# NOTE(review): per the sklearn-crfsuite docs c1 applies to 'lbfgs' only and
# c2 to 'lbfgs'/'l2sgd'; candidates pairing other algorithms with these
# parameters may fail to fit or ignore them -- consider per-algorithm spaces.
distributions = {
    'algorithm': ['lbfgs', 'l2sgd', 'ap', 'ap', 'ap'],
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Fresh, unfitted base estimator so this cell does not depend on whichever
# `model` an earlier cell happened to leave behind.
search_base_model = sklearn_crfsuite.CRF(
    max_iterations=100,
    all_possible_transitions=True
)

searcher = RandomizedSearchCV(
    search_base_model,
    distributions,
    cv=3,
    # scikit-learn calls a `scoring` callable as scoring(estimator, X, y);
    # `scorer` is a (y_true, y_pred) metric, so it must be wrapped.
    scoring=metrics.make_scorer(scorer),
    verbose=1,
    n_jobs=-1,
    n_iter=50,
    random_state=SEARCH_SEED,
)
searcher.fit(train_features, train_labels)