In [35]:
from fastai.text import *
import html
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import re
from spacy.lang.en import English
from spacy.symbols import ORTH

In [2]:
# Marker tokens; predict_single_text prefixes every input with these same two literals.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# Base directory for this notebook's files; the classifier weights are read from
# PATH / 'models/clas_2_bs9.h5' further down.
PATH=Path('inspire_data/')

In [3]:
# Working directories, derived from the shared PATH root so the location is
# configured in exactly one place. parents=True lets the cell succeed even when
# the root directory has not been created yet.
CLAS_PATH = PATH/'inspire_clas'
CLAS_PATH.mkdir(parents=True, exist_ok=True)

# The vocabulary pickle is read from LM_PATH/'tmp'/'itos.pkl' in a later cell.
LM_PATH = PATH/'inspire_lm'
LM_PATH.mkdir(parents=True, exist_ok=True)

In [4]:
# Fixed seed for the shuffle below. The shuffled row index is later written into
# the exported CSVs and shown in cell output, so it has to be reproducible.
SHUFFLE_SEED = 42

# NOTE: unpickling can execute arbitrary code - only load a trusted copy of this file.
inspire_data = pd.read_pickle('inspire_data/combined_data.df')

# Shuffle all rows, then rebuild a clean 0..n-1 index; the prediction loop
# addresses rows by that positional index.
inspire_data = inspire_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# The pickle is expected to hold exactly two columns, in (text, labels) order.
inspire_data.columns = ['text', 'labels']
inspire_data = inspire_data[['labels', 'text']]

In [6]:
# itos: index -> token list saved by the language-model step.
# NOTE: pickle.load can execute arbitrary code - only use a trusted itos.pkl.
# The context manager closes the file handle, which the one-liner never did.
with (LM_PATH/'tmp'/'itos.pkl').open('rb') as itos_file:
    itos = pickle.load(itos_file)

# stoi: token -> index; any token missing from the vocabulary maps to index 0.
stoi = collections.defaultdict(lambda: 0, {token: idx for idx, token in enumerate(itos)})
len(itos)

60002

In [36]:
class FastLoadTokenizer(Tokenizer):
    """
    Tokenizer which avoids redundant loading of the spacy language model.

    The FastAI Tokenizer class loads all the pipeline components of the spacy model, which significantly
    increases loading time, especially when doing inference on CPU. This subclass builds only the bare
    `English()` tokenizer instead and otherwise reuses the text processing inherited from `Tokenizer`.
    """
    def __init__(self):
        # The parent constructor is intentionally not called: this method sets
        # up the two attributes itself without loading a full spacy pipeline.
        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
        self.tok = English()
        # Keep the special marker tokens intact instead of letting spacy split them.
        for w in ('<eos>', '<bos>', '<unk>'):
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])

    def proc_all(self, ss):
        """Tokenize every string in `ss` with `proc_text`; returns one result per string."""
        return [self.proc_text(s) for s in ss]

    def proc_all_mp(self, ss, ncpus=None):
        """
        Tokenize batches of strings in parallel worker processes.

        ss:    iterable of batches; each batch is handed whole to `proc_all` in a worker.
        ncpus: number of worker processes; defaults to half the available CPUs, but never
               fewer than one (ProcessPoolExecutor rejects a worker count of 0, which is
               what `num_cpus() // 2` yields on a single-core machine).

        Returns a single flat list with the results of all batches, in input order.
        """
        ncpus = ncpus or max(1, num_cpus() // 2)
        with ProcessPoolExecutor(ncpus) as executor:
            # Flatten batch results in one pass; sum(..., []) re-copies the
            # accumulated list for every batch.
            return [doc for batch in executor.map(self.proc_all, ss) for doc in batch]

### Initialize the classifier

In [7]:
# Architecture numbers handed to get_rnn_classifier below: bptt is its first
# positional argument, em_sz -> emb_sz, nh -> n_hid, nl -> n_layers.
bptt,em_sz,nh,nl = 70,400,1150,3
# Vocabulary size: number of entries in the loaded itos list.
vs = len(itos)
# Adam factory with custom betas.
# NOTE(review): opt_fn is not referenced by any cell visible in this notebook.
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
# NOTE(review): bs is not referenced by any visible cell; the weights file is named
# 'clas_2_bs9.h5', so this is presumably the training batch size - confirm.
bs = 9
# Five dropout rates, each halved. Consumed below as: dps[0] -> dropouti, dps[1] -> wdrop,
# dps[2] -> dropoute, dps[3] -> dropouth, dps[4] -> first entry of `drops`.
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5

In [18]:
# Build the classifier network from the hyperparameters defined above.
# NOTE(review): `20*70` and the bare `3` are positional arguments whose meaning is not
# visible here - presumably the maximum sequence length and the number of classes
# (the `layers` spec also ends in 3 outputs); confirm against get_rnn_classifier's signature.
# layers=[em_sz*3, 50, 3] gives the head sizes; drops supplies one dropout rate per head layer.
model = get_rnn_classifier(bptt, 20*70, 3, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 50, 3], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [19]:
# Restore the trained classifier weights. The identity map_location hands every
# storage back unchanged, so the tensors stay on the CPU.
trained_weights = torch.load(
    PATH / 'models/clas_2_bs9.h5',
    map_location=lambda storage, loc: storage,
)
model.load_state_dict(trained_weights)

# Reset the model and switch it to evaluation mode; eval() is the cell's last
# expression, so the module summary is displayed.
model.reset()
model.eval()

SequentialRNN(
  (0): MultiBatchRNN(
    (encoder): Embedding(60002, 400, padding_idx=1)
    (encoder_with_dropout): EmbeddingDropout(
      (embed): Embedding(60002, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDrop(
        (module): LSTM(400, 1150)
      )
      (1): WeightDrop(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDrop(
        (module): LSTM(1150, 400)
      )
    )
    (dropouti): LockedDropout(
    )
    (dropouths): ModuleList(
      (0): LockedDropout(
      )
      (1): LockedDropout(
      )
      (2): LockedDropout(
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): ModuleList(
      (0): LinearBlock(
        (lin): Linear(in_features=1200, out_features=50, bias=True)
        (drop): Dropout(p=0.2)
        (bn): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True)
      )
      (1): LinearBlock(
        (lin): Linear(in_features=50, out_features=3, bias=True)
        (drop): Dropout(p=0.1)
        (bn): BatchNorm1

In [21]:
def numpy_softmax(x):
    """
    Row-wise softmax of a numpy array.

    A 1-D input is treated as a single row, so the result is always 2-D with one
    probability distribution per row. Each row's maximum is subtracted before
    exponentiating to keep np.exp from overflowing.
    """
    logits = x[np.newaxis, :] if x.ndim == 1 else x
    row_peak = np.max(logits, axis=1).reshape((-1, 1))
    unnormalised = np.exp(logits - row_peak)
    row_total = np.sum(unnormalised, axis=1).reshape((-1, 1))
    return unnormalised / row_total

In [37]:
def predict_single_text(text, tokenizer=None):
    """
    Classify one document and return its class probabilities.

    text:      raw document string; the 'xbos xfld 1 ' marker prefix is added here.
    tokenizer: optional tokenizer exposing `proc_all`. When omitted, a single
               FastLoadTokenizer is created on first use and cached on this function,
               so repeated calls no longer rebuild the spacy tokenizer every time.

    Returns a 1-D numpy array of softmax probabilities, one entry per model output
    class (the prediction loop reads entries 0, 1 and 2).
    """
    if tokenizer is None:
        tokenizer = getattr(predict_single_text, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = predict_single_text._tokenizer = FastLoadTokenizer()

    input_string = 'xbos xfld 1 ' + text
    tokens = tokenizer.proc_all([input_string])
    # Tokens missing from the vocabulary map to index 0 through the stoi default.
    encoded_tokens = [stoi[p] for p in tokens[0]]
    # Column layout: one token id per row, a single column for the one document.
    token_array = np.reshape(np.array(encoded_tokens), (-1, 1))
    token_array = Variable(torch.from_numpy(token_array))
    prediction_scores = model(token_array)
    # The first element of the model output holds the class scores.
    prediction_scores_numpy = prediction_scores[0].data.cpu().numpy()

    return numpy_softmax(prediction_scores_numpy[0])[0]

In [43]:
# Placeholder values that the prediction loop overwrites row by row:
# -1 for the predicted class, -1.0 for each per-class score.
inspire_data['prediction'] = -1
for score_column in ('rejected_score', 'noncore_score', 'core_score'):
    inspire_data[score_column] = -1.0

In [None]:
# Score every document and store the results row by row, so an interrupted run
# keeps whatever has been computed so far. Rows are addressed by the 0..n-1
# index produced by reset_index(drop=True) when the data was loaded.
n_records = len(inspire_data)
score_columns = ('rejected_score', 'noncore_score', 'core_score')

for i, text in enumerate(inspire_data['text']):
    prediction = predict_single_text(text)
    inspire_data.at[i, 'prediction'] = np.argmax(prediction)
    # Class order in the model output: 0 = rejected, 1 = noncore, 2 = core.
    for class_idx, column in enumerate(score_columns):
        inspire_data.at[i, column] = prediction[class_idx]
    # Progress line every 100 documents.
    if not i % 100:
        print('{}/{}'.format(i+1, n_records))

1/135313
101/135313
201/135313
301/135313
401/135313
501/135313
601/135313
701/135313
801/135313
901/135313
1001/135313
1101/135313
1201/135313
1301/135313


In [78]:
# Boolean mask of rows whose predicted class differs from the true label,
# and the corresponding subset of the frame.
mismatches = inspire_data['labels'].ne(inspire_data['prediction'])
mismatched_records = inspire_data.loc[mismatches]

In [105]:
# Accuracy in percent: the share of rows that are not in mismatched_records.
error_rate = len(mismatched_records) / len(inspire_data)
accuracy = (1 - error_rate) * 100
print('Overall accuracy: {} %'.format(accuracy))

Overall accuracy: 91.05481365426826 %


In [90]:
# Off-diagonal cells of the confusion matrix. Class codes: 0 = rejected, 1 = noncore, 2 = core.
is_core, is_noncore, is_rejected = (inspire_data['labels'] == code for code in (2, 1, 0))
said_core, said_noncore, said_rejected = (inspire_data['prediction'] == code for code in (2, 1, 0))

actual_core_predicted_rejected = inspire_data[is_core & said_rejected]
actual_core_predicted_noncore = inspire_data[is_core & said_noncore]
actual_noncore_predicted_rejected = inspire_data[is_noncore & said_rejected]
actual_noncore_predicted_core = inspire_data[is_noncore & said_core]
actual_rejected_predicted_noncore = inspire_data[is_rejected & said_noncore]
actual_rejected_predicted_core = inspire_data[is_rejected & said_core]

In [96]:
# Number of rows in each misclassification subset, one count per line.
for confusion_subset in (
    actual_core_predicted_rejected,
    actual_core_predicted_noncore,
    actual_noncore_predicted_rejected,
    actual_noncore_predicted_core,
    actual_rejected_predicted_noncore,
    actual_rejected_predicted_core,
):
    print(len(confusion_subset))

290
3070
1589
5063
1865
227


In [120]:
# Positional rename of all columns of mismatched_records.
# NOTE(review): seven names are assigned, but the cells visible in this notebook only
# create six columns (labels, text, prediction and the three score columns). This line
# raises a length-mismatch ValueError unless some earlier, no longer visible cell added
# a seventh column - confirm where the column named 'drop' comes from.
mismatched_records.columns = ['labels', 'text', 'prediction', 'rejected_score', 'noncore_score', 'core_score', 'drop']

In [122]:
# Keep only labels, text and prediction by dropping the score columns and the
# column named 'drop'.
unwanted_columns = ['rejected_score', 'noncore_score', 'core_score', 'drop']
mismatched_records = mismatched_records.drop(unwanted_columns, axis=1)

In [125]:
# Independent, slimmed-down copy of the results: the first three columns only,
# named labels / text / prediction.
# The previous version renamed all columns positionally with a seven-name list and
# then dropped the last four. That gives the same three columns when the frame has
# seven columns, but raises a length-mismatch error on the six-column frame built by
# the visible cells. Slicing by position works in both cases.
kept_columns = ['labels', 'text', 'prediction']
copied_inspire_data = inspire_data.iloc[:, :len(kept_columns)].copy()
copied_inspire_data.columns = kept_columns

In [126]:
# Same six misclassification subsets as before, rebuilt from the slimmed-down copy so
# the exported frames carry only labels, text and prediction.
# Class codes: 0 = rejected, 1 = noncore, 2 = core.
true_class = copied_inspire_data['labels']
predicted_class = copied_inspire_data['prediction']

actual_core_predicted_rejected = copied_inspire_data[(true_class == 2) & (predicted_class == 0)]
actual_core_predicted_noncore = copied_inspire_data[(true_class == 2) & (predicted_class == 1)]
actual_noncore_predicted_rejected = copied_inspire_data[(true_class == 1) & (predicted_class == 0)]
actual_noncore_predicted_core = copied_inspire_data[(true_class == 1) & (predicted_class == 2)]
actual_rejected_predicted_noncore = copied_inspire_data[(true_class == 0) & (predicted_class == 1)]
actual_rejected_predicted_core = copied_inspire_data[(true_class == 0) & (predicted_class == 2)]

In [127]:
# Sanity check: the counts must match those computed from the full frame.
for confusion_subset in (
    actual_core_predicted_rejected,
    actual_core_predicted_noncore,
    actual_noncore_predicted_rejected,
    actual_noncore_predicted_core,
    actual_rejected_predicted_noncore,
    actual_rejected_predicted_core,
):
    print(len(confusion_subset))

290
3070
1589
5063
1865
227


In [128]:
# Write each misclassification subset to its own CSV file in the working directory.
# The row index is written as the first column.
for confusion_subset, csv_name in (
    (actual_core_predicted_rejected, 'actual_core_predicted_rejected.csv'),
    (actual_core_predicted_noncore, 'actual_core_predicted_noncore.csv'),
    (actual_noncore_predicted_rejected, 'actual_noncore_predicted_rejected.csv'),
    (actual_noncore_predicted_core, 'actual_noncore_predicted_core.csv'),
    (actual_rejected_predicted_noncore, 'actual_rejected_predicted_noncore.csv'),
    (actual_rejected_predicted_core, 'actual_rejected_predicted_core.csv'),
):
    confusion_subset.to_csv(csv_name)

In [129]:
# Preview only the first rows; the full subset has thousands of records and is
# already saved to actual_core_predicted_noncore.csv.
actual_core_predicted_noncore.head(10)

Unnamed: 0,labels,text,prediction
15,2,Antiproton-nucleus quasi-bound states within t...,1
20,2,Integrability and duality in spin chains We co...,1
66,2,Fast and accurate inference on gravitational w...,1
81,2,Uncertainty principle on 3-dimensional manifol...,1
109,2,Dynamical screening of α-α resonant scattering...,1
268,2,$\mathcal{PT}$ -symmetric rational Calogero mo...,1
319,2,Action-angle variables for geodesic motions in...,1
352,2,Momentum space treatment of inclusive neutrino...,1
359,2,On the end stage of spherical gravitational co...,1
367,2,Recent developments in the tidal deformability...,1
