In [1]:
import json
from pathlib import Path

import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn_crfsuite import CRF
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

data_path = Path("../interim/silver_ner_training.jsonl")
print(data_path.exists())

sequences = []
with data_path.open("r") as f:
    for line in f:
        item = json.loads(line)
        sequences.append((item["tokens"], item["labels"]))

len(sequences)


True


2000

In [2]:
import re

def word_shape(token):
    # Example: "John123" -> "Xxxxddd"
    shape = []
    for ch in token:
        if ch.isupper():
            shape.append("X")
        elif ch.islower():
            shape.append("x")
        elif ch.isdigit():
            shape.append("d")
        else:
            shape.append(ch)
    return "".join(shape)

def token_features(tokens, i):
    """
    Features for token at position i in a sequence.
    """
    token = tokens[i]
    feats = {
        "bias": 1.0,
        "word.lower()": token.lower(),
        "word.isupper()": token.isupper(),
        "word.istitle()": token.istitle(),
        "word.isdigit()": token.isdigit(),
        "word.shape": word_shape(token),
        "suffix3": token[-3:],
        "prefix1": token[:1],
    }

    if i > 0:
        prev = tokens[i-1]
        feats.update({
            "-1:word.lower()": prev.lower(),
            "-1:word.istitle()": prev.istitle(),
            "-1:word.isupper()": prev.isupper(),
        })
    else:
        feats["BOS"] = True  

    if i < len(tokens) - 1:
        nxt = tokens[i+1]
        feats.update({
            "+1:word.lower()": nxt.lower(),
            "+1:word.istitle()": nxt.istitle(),
            "+1:word.isupper()": nxt.isupper(),
        })
    else:
        feats["EOS"] = True   

    return feats

def sent2features(tokens):
    return [token_features(tokens, i) for i in range(len(tokens))]

def sent2labels(labels):
    return labels


In [3]:
X_seq = []
y_seq = []

for tokens, labels in sequences:
    if len(tokens) != len(labels):
        continue  # safety check
    X_seq.append(sent2features(tokens))
    y_seq.append(sent2labels(labels))

len(X_seq), len(y_seq)


(2000, 2000)

In [4]:
n_total = len(X_seq)
n_train = int(0.8 * n_total)

X_train_seq = X_seq[:n_train]
y_train_seq = y_seq[:n_train]
X_test_seq = X_seq[n_train:]
y_test_seq = y_seq[n_train:]

n_train, len(X_test_seq)


(1600, 400)

In [5]:
crf = CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

crf.fit(X_train_seq, y_train_seq)

y_pred_seq = crf.predict(X_test_seq)

In [6]:
from sklearn_crfsuite import metrics

labels = list(crf.classes_)
labels = [l for l in labels if l != "O"]

print("CRF classification report (per token):")
print(metrics.flat_classification_report(
    y_test_seq, y_pred_seq, labels=labels, digits=3
))

CRF classification report (per token):
              precision    recall  f1-score   support

      B-DATE      1.000     1.000     1.000      2000
      B-NAME      1.000     0.996     0.998       826
       B-SSN      1.000     1.000     1.000      2000
   B-ADDRESS      0.944     1.000     0.971        17
   I-ADDRESS      1.000     0.968     0.984        31
        B-ID      1.000     1.000     1.000       664

   micro avg      1.000     0.999     1.000      5538
   macro avg      0.991     0.994     0.992      5538
weighted avg      1.000     0.999     1.000      5538



In [7]:
X_flat_feats = []
y_flat = []

for tokens, labels in sequences:
    if len(tokens) != len(labels):
        continue
    feats_list = sent2features(tokens)
    for f, lab in zip(feats_list, labels):
        X_flat_feats.append(f)
        y_flat.append(lab)

len(X_flat_feats), len(y_flat)


(97397, 97397)

In [8]:
vec = DictVectorizer(sparse=True)
X_flat = vec.fit_transform(X_flat_feats)
X_flat.shape

(97397, 27929)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat
)

X_train.shape, X_test.shape

((77917, 27929), (19480, 27929))

In [10]:
from sklearn.preprocessing import LabelEncoder

# XGBoost needs numeric labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [11]:
svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)

y_pred_svm = svm_clf.predict(X_test)

print("SVM token-level classification report:")
print(classification_report(y_test, y_pred_svm, digits=3))

SVM token-level classification report:
              precision    recall  f1-score   support

   B-ADDRESS      0.000     0.000     0.000        14
      B-DATE      0.996     1.000     0.998      2000
        B-ID      1.000     1.000     1.000       647
      B-NAME      0.998     0.996     0.997       830
       B-SSN      0.997     0.996     0.996      2000
   I-ADDRESS      0.960     0.960     0.960        25
           O      0.999     0.999     0.999     13964

    accuracy                          0.998     19480
   macro avg      0.850     0.850     0.850     19480
weighted avg      0.998     0.998     0.998     19480





In [12]:
rf_clf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42
)

rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Random Forest token-level classification report:")
print(classification_report(y_test, y_pred_rf, digits=3))

Random Forest token-level classification report:
              precision    recall  f1-score   support

   B-ADDRESS      0.000     0.000     0.000        14
      B-DATE      0.991     1.000     0.996      2000
        B-ID      1.000     1.000     1.000       647
      B-NAME      0.978     0.996     0.987       830
       B-SSN      0.998     0.991     0.995      2000
   I-ADDRESS      0.900     0.360     0.514        25
           O      0.999     1.000     0.999     13964

    accuracy                          0.997     19480
   macro avg      0.838     0.764     0.784     19480
weighted avg      0.996     0.997     0.997     19480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    objective="multi:softmax",
    num_class=len(le.classes_)
)

xgb_clf.fit(X_train, y_train_enc)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
y_pred_enc = xgb_clf.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)

In [15]:
from sklearn.metrics import classification_report
print("XGBoost token-level report:")
print(classification_report(y_test, y_pred, digits=3))

XGBoost token-level report:
              precision    recall  f1-score   support

   B-ADDRESS      0.000     0.000     0.000        14
      B-DATE      0.991     1.000     0.996      2000
        B-ID      1.000     1.000     1.000       647
      B-NAME      0.996     0.996     0.996       830
       B-SSN      0.998     0.991     0.994      2000
   I-ADDRESS      1.000     1.000     1.000        25
           O      0.999     0.999     0.999     13964

    accuracy                          0.998     19480
   macro avg      0.855     0.855     0.855     19480
weighted avg      0.997     0.998     0.997     19480



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [16]:
import pickle
from pathlib import Path

model_path = Path("../interim/crf_model.pkl")
with model_path.open("wb") as f:
    pickle.dump(crf, f)

model_path

WindowsPath('../interim/crf_model.pkl')