This notebook combines literal string matching with Named Entity Recognition (NER) using a BERT base model from Hugging Face.

The BERT model was trained in a separate kernel: Pytorch BERT for Named Entity Recognition.

In [None]:
# Upper bound on the number of training rows kept (applied as train[:MAX_SAMPLE]).
# Set a small integer for quick experiments; None keeps every row (production).
MAX_SAMPLE = None

In [None]:
from joblib import dump, load

# Restore the pre-fitted vectorizer from disk; get_features() calls its
# transform() on the POS-tag string of a sentence.
# NOTE(review): presumably a scikit-learn text vectorizer fitted on POS-tag
# strings in another kernel — confirm. joblib.load unpickles the file, so only
# load artifacts from a trusted source.
pos_vectorizer = load("../input/preselection-svc/pos_vectorizer.joblib")

# Install packages

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

random.seed(123)
np.random.seed(456)

# Load data

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)
# Optionally truncate for quick experiments; a None bound keeps every row.
train = train[:MAX_SAMPLE]
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement), which would happen whenever MAX_SAMPLE is set below 100.
test_sample = train.sample(min(100, len(train)))

# Map every training paper Id to its parsed JSON content.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(f'{paper_train_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [None]:
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

# Add the test papers listed in the sample submission to the same `papers`
# mapping (paper Id -> parsed JSON content) used for the training papers.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
for paper_id in sample_submission['Id']:
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(f'{paper_test_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Preselection model

In [None]:
def clean_text(txt):
    """Return *txt* lowercased with non-alphanumeric runs turned into spaces.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Every run of characters outside ``A-Z``, ``a-z`` and ``0-9``
    becomes a single space, and leading/trailing whitespace is stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Return ``clean_text(txt)`` with every run of spaces collapsed to one."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
import re
import nltk
ids = train['Id'].values
# Build one full-text string per row of `train`, in row order.
contents = []
for idx, row in enumerate(train.values):
    print(idx, end='\r')  # in-place progress counter
    # Positional access: relies on the first column of `train` being the paper Id.
    paper_id = row[0]
    paper_cont = []
    # Each paper is iterated as a sequence of elements that carry a 'text' entry.
    for content_elem in papers[paper_id]:
        paper_cont.append(content_elem['text'])

    # The paper's text pieces are joined with single spaces.
    contents.append(" ".join(paper_cont))        
# Work on a copy so `train` itself keeps its original columns.
combined_train = train.copy()
combined_train['contents'] = contents
# Bare expression: displayed as the cell output.
combined_train

In [None]:
# Split every paper into sentences and label each sentence: 1.0 when every
# word of the row's cleaned label occurs in the cleaned sentence, else 0.0.
new_contents = []
labels = []
for idx, (label, contents) in enumerate(combined_train[['cleaned_label', 'contents']].values):
    print(idx, end='\r')  # in-place progress counter
    sentences = list(nltk.sent_tokenize(contents))
    # Clean the label words once per row instead of once per sentence.
    label_words = [clean_text(word) for word in label.split()]
    sentence_labels = []
    for sentence in sentences:
        # Clean the sentence once instead of once per label word.
        cleaned_sentence = clean_text(sentence)
        # Substring containment, not whole-word matching: a label word also
        # matches when it appears inside a longer word of the sentence.
        is_match = all(word in cleaned_sentence for word in label_words)
        sentence_labels.append(1.0 if is_match else 0.0)
    new_contents.extend(sentences)
    labels.extend(sentence_labels)

In [None]:
# One row per sentence: the sentence text and its 0.0 / 1.0 label.
preselection_df = pd.DataFrame({'sentence': new_contents, 'label': labels})

In [None]:
# Draw a 5% subsample in which each label class carries the same total
# sampling weight: every row is weighted by the inverse of its class size.
label_counts = preselection_df.groupby('label')['label'].transform('count')
inverse_class_weights = 1. / label_counts
preselection_short = preselection_df.sample(frac=0.05, weights=inverse_class_weights)

In [None]:
# Number of sampled rows per label (displayed as the cell output).
preselection_short.groupby('label').count()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the subsampled sentences for evaluation; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    preselection_short['sentence'].values, preselection_short['label'].values, test_size=0.25, random_state=99)

In [None]:

def pos_convert_text(sentence):
    """Convert a sentence into a string of part-of-speech tags.

    The sentence is split on whitespace, tagged with ``nltk.pos_tag``, and
    the tags are joined with single spaces. Every "." character is then
    removed from the joined string (for example a "." punctuation tag).

    Args:
        sentence: Text to tag.

    Returns:
        A one-element list containing the tag string, so the result can be
        passed directly to a vectorizer's ``transform``.
    """
    tagged_tokens = nltk.pos_tag(sentence.split())
    tag_string = " ".join(tag for _, tag in tagged_tokens)
    return [tag_string.replace(".", "")]


In [None]:
import gc
# Run a full garbage-collection pass now.
# NOTE(review): presumably meant to free the large intermediates built in the
# cells above before feature extraction — confirm it is still needed.
gc.collect()

In [None]:
def get_features(sentence):
    """Build the feature vector for one sentence.

    Args:
        sentence: Raw sentence text.

    Returns:
        A 1-D numpy array: the flattened POS-tag vectorizer output followed
        by the fraction of characters in ``sentence`` that are uppercase.
    """
    # Use the argument itself. The previous body read the module-level name
    # `row`, so the result depended on whatever the caller's loop variable
    # happened to be called rather than on the value passed in.
    pos_sentence = pos_convert_text(sentence)
    # reshape(-1, 14) requires the vectorizer output size to be a multiple
    # of 14 — presumably its vocabulary has 14 entries; TODO confirm.
    features = pos_vectorizer.transform(pos_sentence).toarray().reshape(-1, 14)
    uppercase_count = sum(1 for c in sentence if c.isupper())
    # An empty sentence has no characters: report 0.0 instead of dividing by zero.
    capitals_frac = uppercase_count / len(sentence) if sentence else 0.0
    # np.append without an axis flattens, so the result is always 1-D.
    return np.append(features, capitals_frac)

In [None]:
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

stop_words = set(stopwords.words('english'))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight = 'balanced', verbose=True), verbose=True)

# Collect feature vectors and labels for the training sentences, keeping only
# sentences whose uppercase-character fraction (last feature) is below 0.25.
train_features = []
train_labels = []
for idx, (row, label) in enumerate(zip(X_train, y_train)):
    print(f"{idx} / {X_train.shape[0]}", end='\r')
    features = get_features(row)
    if not (features[-1] < 0.25):
        continue
    train_features.append(features)
    train_labels.append(label)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Stack the per-sentence feature vectors into a single array for fitting.
train_features = np.array(train_features)

# Rebinds `clf`: the SVC pipeline created in the previous cell is never fitted
# in the visible code and is replaced here by a random-forest pipeline.
# NOTE(review): no random_state is passed to RandomForestClassifier —
# confirm whether reproducible fits are required.
clf = make_pipeline(StandardScaler(), RandomForestClassifier(), verbose=True)
clf.fit(train_features, train_labels)

In [None]:
from joblib import dump, load
import os
# Persist the fitted pipeline to the current working directory.
# NOTE(review): the file name says "svc", but `clf` was last bound to a
# RandomForestClassifier pipeline — confirm the name is intentional.
dump(clf, 'presel_svc.joblib') 

In [None]:
import random
predictions = []
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the size of X_test (this matters
# when MAX_SAMPLE shrinks the data for experimentation).
# `samples` is not used by the loop below, which scores every row of X_test.
samples = random.sample(range(len(X_test)), min(10000, len(X_test)))
for idx, row in enumerate(X_test):
    print(f"{idx} / {X_test.shape[0]}", end='\r')  # in-place progress counter
    features = get_features(row)
    # predict() expects a 2-D batch, hence the one-row list; each appended
    # item is the array predict() returns for that single row.
    predictions.append(clf.predict([features]))

In [None]:
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix
# F-beta with beta=0.5 weights precision more heavily than recall;
# average=None returns one score per class instead of a single aggregate.
print(fbeta_score(y_test, predictions, beta=0.5, average=None))
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, predictions))