# Fine-tune spaCy NER

In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
from time import time
import warnings

import spacy
from spacy.util import minibatch, compounding

### Load NLP and Data

In [None]:
# Load the spaCy pipeline that this notebook fine-tunes.
nlp = spacy.load("../input/coleridge-ner-chain-v02-c03/coleridge_ner")

# Load the NER training examples. JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
with open(
    "../input/coleridgecuratednerlabels002/train-1234.json", encoding="utf-8"
) as rf:
    train_dict = json.load(rf)

# Training table; supplies the "Id" and "dataset_label" columns.
train_df = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")

### Sampling Rates

In [None]:
# Count how many training rows carry each dataset label.
dsets = (
    train_df["dataset_label"]
    .value_counts()
    .rename_axis("dataset_label")
    .reset_index(name="N")
)

# More frequent labels get a lower sampling rate: 0.99 - ln(N) / 9.5.
dsets["log_N"] = np.log(dsets["N"])
dsets["sample_rate"] = 0.99 - dsets["log_N"] / 9.5

# Attach the label-level rate to every (Id, label) row, average the rates
# per Id, and round the result to two decimals.
train_id = (
    train_df[["Id", "dataset_label"]]
    .merge(dsets[["dataset_label", "sample_rate"]], on="dataset_label")
    .groupby("Id")["sample_rate"]
    .mean()
    .reset_index()
)
train_id["sample_rate"] = train_id["sample_rate"].round(2)

### Tuning Config

In [None]:
# Reproducibility
seed = 121          # passed to np.random.seed before example sampling

# Training loop
N_it = 20           # number of passes over the training set
drop_rate = 0.11    # dropout rate handed to nlp.update

# Batch-size schedule, used as compounding(b0, b1, r)
b0 = 24             # first argument: starting batch size
b1 = 192            # second argument: batch-size ceiling
r = 1.0025          # third argument: per-batch compounding factor

### Data Selection

In [None]:
# Build the training set by sub-sampling, for every distinct sampling rate,
# that fraction of the examples belonging to the Ids with that rate.
np.random.seed(seed)
train_set = []
for sr in np.unique(train_id.sample_rate):

    # Ids sharing this rate. A set gives O(1) membership tests while
    # scanning train_dict; the previous list lookup was O(number of Ids).
    id_set = set(train_id.query("sample_rate == @sr")["Id"].tolist())

    # Collect (text, annotations) pairs in train_dict order, so the seeded
    # draw below selects the same examples as the list-based version did.
    train_data = []
    for k, v in train_dict.items():
        if k in id_set:
            train_data.extend((x['example'][0], x['example'][1]) for x in v)

    if not train_data:
        # Nothing to sample; some NumPy versions raise on an empty population.
        continue

    # Clamp the sample size to [0, len(train_data)]. The rate formula
    # 0.99 - ln(N) / 9.5 turns negative for very frequent labels, which
    # would otherwise yield an invalid (negative) size.
    N_eg = min(len(train_data), max(0, int(sr * len(train_data))))
    train_idx = np.random.choice(len(train_data), size=N_eg, replace=False)
    train_set += [train_data[i] for i in train_idx]

len(train_set)

### Training Prep

In [None]:
# Register every entity label that appears in the training annotations
# with the pipeline's "ner" component.
ner = nlp.get_pipe("ner")
for _, annots in train_set:
    entity_spans = annots.get("entities")
    for span in entity_spans:
        # The third item of each entity entry is passed as the label.
        ner.add_label(span[2])

# Names of all pipeline components other than "ner".
unaffected_pipes = [name for name in nlp.pipe_names if name != "ner"]

### Fine-tuning

In [None]:
# Seed the stdlib RNG too: random.shuffle below does not draw from NumPy's
# generator, so np.random.seed alone leaves the per-pass example order
# different on every run.
random.seed(seed)

# Update with every pipe except "ner" disabled.
with nlp.disable_pipes(*unaffected_pipes):

    # Silence UserWarnings raised from spaCy modules.
    warnings.filterwarnings("ignore", category=UserWarning, module='spacy')

    # Train for N_it passes over the training set.
    for it_num in range(N_it):

        # Reshuffle the examples before every pass.
        t0 = time()
        random.shuffle(train_set)
        losses = {}

        # A fresh compounding schedule is created on each pass, so batch
        # sizes start again from b0.
        for batch in minibatch(train_set, size=compounding(b0, b1, r)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=drop_rate, losses=losses)

        # Report progress. losses stays empty when there were no batches
        # (empty train_set), so fall back to 0.0 instead of raising KeyError.
        print("Round %d loss: %.2f (%.1f seconds)"%(it_num, losses.get('ner', 0.0), time() - t0))

# Save the fine-tuned pipeline to disk.
nlp.to_disk("coleridge_ner")

***
<br><br><br><br><br><br><br><br><br><br><br><br><br>