In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import pickle
import pdb

In [2]:
data = pickle.load(open('./data/restructured_dataset.pkl', 'rb'))

In [3]:
def process_label(label):
    if label[0] == '/':
        return label.split('/')[1]
    else:
        return label

In [4]:
# another prototype
features = []
labels = []
ids = []

test_nodes = []
minimum_number_of_test_nodes_wanted = 100
treat_as_test_node = False
for citing_id in data['edges'].keys():
    cited_ids = data['edges'][citing_id]
    if len(test_nodes) < minimum_number_of_test_nodes_wanted:
        treat_as_test_node = True
    if citing_id in data['features_for_id'].keys() and citing_id in data['label_for_id'].keys():
        for i in range(len(cited_ids)):
            if cited_ids[i] in data['features_for_id'].keys() and cited_ids[i] in data['label_for_id'].keys():
                ids.append([citing_id, cited_ids[i]])
                the_elements = [
                    data['features_for_id'][citing_id], data['features_for_id'][cited_ids[i]]
                ]
                the_labels = [
                    process_label(data['label_for_id'][citing_id]), process_label(data['label_for_id'][cited_ids[i]])
                ]
                if the_labels[0] != "NOLABEL" and the_labels[1] != "NOLABEL":
                    features.append(the_elements)
                    labels.append(the_labels)
                    if treat_as_test_node:
                        test_nodes += [citing_id, cited_ids[i]]
                        test_nodes = list(set(test_nodes))
    treat_as_test_node = False

In [5]:
len(data['edges'].keys())

35788

In [6]:
def divide_to_test_and_train(features, labels, test_nodes=None, ids=None):
    if test_nodes is None:
        last_train_index = 80000
        features_train = features[:last_train_index]
        labels_train = labels[:last_train_index]
        features_test = features[last_train_index:]
        labels_test = labels[last_train_index:]
    else:
        assert(ids is not None)
        features_train = []
        labels_train = []
        features_test = []
        labels_test = []
        
        for i in range(len(features)):
            if (ids[i][0] in test_nodes) or (ids[i][1] in test_nodes):
                features_test.append(features[i])
                labels_test.append(labels[i])
            else:
                features_train.append(features[i])
                labels_train.append(labels[i])
    return features_train, labels_train, features_test, labels_test

In [7]:
features_train, labels_train, features_test, labels_test = divide_to_test_and_train(features, labels, test_nodes, ids)

In [8]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(features_train, labels_train):
    try:
        trainer.append(xseq, yseq)
    except:
        pdb.set_trace()

CPU times: user 1.08 s, sys: 20.3 ms, total: 1.1 s
Wall time: 1.1 s


In [9]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [10]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [11]:
%%time
trainer.train('cora.crfsuite')

CPU times: user 3.96 s, sys: 82.7 ms, total: 4.04 s
Wall time: 4.06 s


In [12]:
!ls -lh ./cora.crfsuite

-rw-r--r--  1 davina  staff   1.2M Mar 18 17:21 ./cora.crfsuite


In [13]:
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 17764.00572,
 'feature_norm': 141.38742,
 'error_norm': 143.218606,
 'active_features': 18353,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.066}

In [14]:
tagger = pycrfsuite.Tagger()
tagger.open('cora.crfsuite')

<contextlib.closing at 0x1a2a1777f0>

In [15]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
Artificial_Intelligence -> Artificial_Intelligence 4.801341
Programming -> Programming 2.313652
Hardware_and_Architecture -> Hardware_and_Architecture 2.099289
Encryption_and_Compression -> Encryption_and_Compression 2.080741
Networking -> Networking 2.076263
Data_Structures__Algorithms_and_Theory -> Data_Structures__Algorithms_and_Theory 1.884011
Information_Retrieval -> Information_Retrieval 1.868383
Human_Computer_Interaction -> Human_Computer_Interaction 1.846793
Databases -> Databases 1.758569
Information_Retrieval -> Artificial_Intelligence 1.340177
Data_Structures__Algorithms_and_Theory -> Artificial_Intelligence 1.300631
Artificial_Intelligence -> Information_Retrieval 1.240355
Artificial_Intelligence -> Data_Structures__Algorithms_and_Theory 1.141417
Artificial_Intelligence -> Programming 1.060729
Programming -> Artificial_Intelligence 1.057742

Top unlikely transitions:
Networking -> Databases -1.138574
Databases -> Networking -1.171651
Encryption_and_

In [16]:
def full_classification_report(y_true, y_pred):
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [17]:
%%time
y_pred = [tagger.tag(xseq) for xseq in features_test]

CPU times: user 32.2 ms, sys: 5.33 ms, total: 37.6 ms
Wall time: 36.2 ms


In [18]:
print(full_classification_report(labels_test, y_pred))

                                        precision    recall  f1-score   support

               Artificial_Intelligence       0.94      0.98      0.96      1040
Data_Structures__Algorithms_and_Theory       0.96      0.93      0.94       281
                             Databases       0.97      0.96      0.97       133
            Encryption_and_Compression       0.94      0.85      0.89       138
             Hardware_and_Architecture       0.98      0.81      0.89        78
            Human_Computer_Interaction       0.88      0.90      0.89       166
                 Information_Retrieval       0.93      0.80      0.86        35
                            Networking       0.90      0.96      0.93       139
                     Operating_Systems       0.97      0.92      0.95       442
                           Programming       0.96      0.96      0.96       454

                             micro avg       0.94      0.94      0.94      2906
                             macro avg