In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import pickle
import pdb

In [2]:
# Load the pre-built dataset. Later cells read the keys 'edges',
# 'features_for_id' and 'label_for_id' from it.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('./data/restructured_dataset.pkl', 'rb') as dataset_file:
    # The context manager closes the file handle even if unpickling fails.
    data = pickle.load(dataset_file)

In [3]:
def process_label(label):
    """Reduce a slash-prefixed hierarchical label to its top-level component.

    Args:
        label: label string, e.g. '/Top/Sub/' or a plain marker such as
            'NOLABEL'.

    Returns:
        The text between the first and second '/' when ``label`` starts with
        '/'; otherwise ``label`` unchanged (including the empty string).
    """
    # Slice instead of indexing so an empty label does not raise IndexError.
    if label[:1] == '/':
        return label.split('/')[1]
    return label

In [4]:
# Build one two-element sequence per (citing, cited) edge for which both ends
# have features and a usable label:
#   features[k] -> [features of citing paper, features of cited paper]
#   labels[k]   -> [label of citing paper,    label of cited paper]
#   ids[k]      -> [citing id,                cited id]
# While fewer than `minimum_number_of_test_nodes_wanted` test nodes have been
# collected, both ends of every kept edge of the current citing paper are
# marked as test nodes.
features = []
labels = []
ids = []

minimum_number_of_test_nodes_wanted = 100
treat_as_test_node = False

# Accumulate test nodes in a set: de-duplication is free and we avoid
# rebuilding a list for every appended pair.
test_node_set = set()

edges = data['edges']
features_for_id = data['features_for_id']
label_for_id = data['label_for_id']

for citing_id, cited_ids in edges.items():
    # Decided once per citing paper, so all of its edges are treated alike
    # even if the quota is reached part-way through them.
    treat_as_test_node = len(test_node_set) < minimum_number_of_test_nodes_wanted
    if citing_id in features_for_id and citing_id in label_for_id:
        citing_features = features_for_id[citing_id]
        for cited_id in cited_ids:
            if cited_id not in features_for_id or cited_id not in label_for_id:
                continue

            the_labels = [
                process_label(label_for_id[citing_id]),
                process_label(label_for_id[cited_id]),
            ]
            # Skip edges where either paper carries the "NOLABEL" marker.
            if the_labels[0] == "NOLABEL" or the_labels[1] == "NOLABEL":
                continue

            ids.append([citing_id, cited_id])
            features.append([citing_features, features_for_id[cited_id]])
            labels.append(the_labels)
            if treat_as_test_node:
                test_node_set.update((citing_id, cited_id))

# Leave the flag cleared once the scan is over.
treat_as_test_node = False
# Downstream cells receive a list of unique node ids (order is arbitrary).
test_nodes = list(test_node_set)

In [5]:
# Number of citing ids that have an entry in the edge mapping.
len(data['edges'])

35788

In [6]:
def divide_to_test_and_train(features, labels, test_nodes=None, ids=None,
                             last_train_index=80000):
    """Split parallel feature/label sequences into train and test parts.

    Two modes:

    * ``test_nodes is None``: positional split -- the first
      ``last_train_index`` items are the training set, the rest the test set.
    * otherwise: item ``i`` goes to the test set when either id in ``ids[i]``
      is contained in ``test_nodes``; all other items go to the training set.

    Args:
        features: sequence of feature items.
        labels: sequence of label items, parallel to ``features``.
        test_nodes: optional collection of node ids reserved for testing.
        ids: sequence parallel to ``features`` whose items hold two ids
            (``ids[i][0]`` and ``ids[i][1]``); required with ``test_nodes``.
        last_train_index: cut position for the positional split
            (defaults to the previously hard-coded 80000).

    Returns:
        Tuple ``(features_train, labels_train, features_test, labels_test)``.

    Raises:
        AssertionError: if ``test_nodes`` is given without ``ids``.
    """
    if test_nodes is None:
        return (
            features[:last_train_index],
            labels[:last_train_index],
            features[last_train_index:],
            labels[last_train_index:],
        )

    assert ids is not None, "ids must be provided when test_nodes is given"

    # Membership is tested twice per item; a set makes each lookup O(1)
    # instead of scanning the whole list of test nodes.
    held_out = test_nodes
    if isinstance(test_nodes, (list, tuple)):
        try:
            held_out = set(test_nodes)
        except TypeError:
            # Unhashable ids: keep the original linear membership test.
            held_out = test_nodes

    features_train = []
    labels_train = []
    features_test = []
    labels_test = []

    for i in range(len(features)):
        if (ids[i][0] in held_out) or (ids[i][1] in held_out):
            features_test.append(features[i])
            labels_test.append(labels[i])
        else:
            features_train.append(features[i])
            labels_train.append(labels[i])
    return features_train, labels_train, features_test, labels_test

In [7]:
# Node-based split: every pair touching one of `test_nodes` is held out.
(features_train,
 labels_train,
 features_test,
 labels_test) = divide_to_test_and_train(
    features,
    labels,
    test_nodes=test_nodes,
    ids=ids,
)

In [8]:
%%time
# Create the CRF trainer and feed it every training sequence.
trainer = pycrfsuite.Trainer(verbose=False)

for seq_index, (xseq, yseq) in enumerate(zip(features_train, labels_train)):
    try:
        trainer.append(xseq, yseq)
    except Exception as append_error:
        # Fail loudly and say which sequence was rejected, instead of dropping
        # into an interactive debugger (which hangs a Restart & Run All).
        raise ValueError(
            "Could not append training sequence %d with labels %r"
            % (seq_index, yseq)
        ) from append_error

CPU times: user 683 ms, sys: 14.8 ms, total: 697 ms
Wall time: 696 ms


In [9]:
# Training hyper-parameters, kept in a named dict so they are easy to inspect.
crf_training_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_training_params)

In [10]:
# List the names of the parameters this trainer accepts.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [11]:
%%time
# Fit the CRF on the appended sequences; the model is stored in the file
# 'cora.crfsuite', which the tagger cell opens later.
trainer.train('cora.crfsuite')

CPU times: user 4.12 s, sys: 2.68 ms, total: 4.13 s
Wall time: 4.12 s


In [12]:
# Shell out to show the size of the model file written by the training cell.
!ls -lh ./cora.crfsuite

-rw-rw-r-- 1 shayan shayan 1.2M Mar 19 20:40 ./cora.crfsuite


In [13]:
# Inspect the parsed log record of the final training iteration
# (iteration number, loss, feature norm, active feature count, ...).
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 17616.848722,
 'feature_norm': 142.824314,
 'error_norm': 79.007628,
 'active_features': 17480,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.076}

In [14]:
# Load the trained model from disk into a tagger used for inspection and
# prediction in the cells below.
tagger = pycrfsuite.Tagger()
# open() returns a contextlib.closing wrapper, which is what the cell displays.
tagger.open('cora.crfsuite')

<contextlib.closing at 0x7f8a6d171630>

In [15]:
from collections import Counter
# Model introspection object; `info.transitions` is read below and maps
# (label_from, label_to) pairs to learned weights.
info = tagger.info()

def print_transitions(trans_features):
    """Print one line per transition: source label, target label, weight.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)``
            items, e.g. the output of ``Counter.most_common``.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        line = "%-6s -> %-7s %0.6f" % (source_label, target_label, weight)
        print(line)

# Rank all learned transitions by weight once (highest first), then show both
# ends of the ranking.
ranked_transitions = Counter(info.transitions).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:15])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-15:])

Top likely transitions:
Artificial_Intelligence -> Artificial_Intelligence 4.302239
Information_Retrieval -> Information_Retrieval 2.993907
Data_Structures__Algorithms_and_Theory -> Data_Structures__Algorithms_and_Theory 2.818890
Programming -> Programming 2.779787
Databases -> Databases 1.999635
Hardware_and_Architecture -> Hardware_and_Architecture 1.903791
Networking -> Networking 1.693120
Information_Retrieval -> Artificial_Intelligence 1.672966
Encryption_and_Compression -> Encryption_and_Compression 1.661289
Human_Computer_Interaction -> Human_Computer_Interaction 1.594626
Artificial_Intelligence -> Information_Retrieval 1.570762
Data_Structures__Algorithms_and_Theory -> Artificial_Intelligence 1.490430
Artificial_Intelligence -> Data_Structures__Algorithms_and_Theory 1.364679
Artificial_Intelligence -> Programming 1.023581
Programming -> Artificial_Intelligence 0.990691

Top unlikely transitions:
Encryption_and_Compression -> Information_Retrieval -0.961631
Human_Computer_Intera

In [16]:
def full_classification_report(y_true, y_pred):
    """Build a per-class classification report over flattened label sequences.

    Both arguments are sequences of label sequences. They are flattened,
    binarized with a LabelBinarizer fitted on the true labels, and passed to
    sklearn's ``classification_report``. The class 'O' is left out of the
    report, and the remaining classes are ordered by the reversed result of
    splitting each name on its first '-'.

    Args:
        y_true: iterable of gold label sequences.
        y_pred: iterable of predicted label sequences.

    Returns:
        Whatever ``classification_report`` returns for these inputs.
    """
    binarizer = LabelBinarizer()

    flat_true = list(chain.from_iterable(y_true))
    y_true_combined = binarizer.fit_transform(flat_true)
    flat_pred = list(chain.from_iterable(y_pred))
    y_pred_combined = binarizer.transform(flat_pred)

    def _tag_sort_key(tag):
        # Same ordering key as before: pieces around the first '-', reversed.
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_tag_sort_key)

    # Column position of every class in the binarized matrices.
    index_of_class = {}
    for position, class_name in enumerate(binarizer.classes_):
        index_of_class[class_name] = position
    reported_indices = [index_of_class[tag] for tag in reported_tags]

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=reported_indices,
        target_names=reported_tags,
    )

In [17]:
%%time
# Predict a label sequence for every held-out feature sequence.
y_pred = list(map(tagger.tag, features_test))

CPU times: user 25.7 ms, sys: 3.49 ms, total: 29.2 ms
Wall time: 29 ms


In [19]:
# Compare the predictions against the gold labels of the test pairs.
report_text = full_classification_report(labels_test, y_pred)
print(report_text)

                                        precision    recall  f1-score   support

               Artificial_Intelligence       0.74      0.94      0.83       795
Data_Structures__Algorithms_and_Theory       0.74      0.56      0.64       374
                             Databases       0.85      0.84      0.85       241
            Encryption_and_Compression       0.44      0.50      0.47        32
             Hardware_and_Architecture       0.63      0.41      0.50        98
            Human_Computer_Interaction       0.72      0.50      0.59       114
                 Information_Retrieval       0.33      0.40      0.36        10
                            Networking       0.62      0.44      0.52       255
                     Operating_Systems       0.79      0.83      0.81       878
                           Programming       0.80      0.75      0.77       485

                             micro avg       0.76      0.76      0.76      3282
                             macro avg