In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import pickle
import pdb

In [2]:
data = pickle.load(open('./data/restructured_dataset.pkl', 'rb'))

In [3]:
def process_label(label):
    if label[0] == '/':
        return label.split('/')[1]
    else:
        return label
def divide_to_test_and_train(features, labels, test_nodes=None, ids=None):
    if test_nodes is None:
        last_train_index = 80000
        features_train = features[:last_train_index]
        labels_train = labels[:last_train_index]
        features_test = features[last_train_index:]
        labels_test = labels[last_train_index:]
    else:
        assert(ids is not None)
        features_train = []
        labels_train = []
        features_test = []
        labels_test = []
        
        for i in range(len(features)):
            if (ids[i][0] in test_nodes) or (ids[i][1] in test_nodes):
                features_test.append(features[i])
                labels_test.append(labels[i])
            else:
                features_train.append(features[i])
                labels_train.append(labels[i])
    return features_train, labels_train, features_test, labels_test

In [4]:
def print_for_this_number_of_test_nodes(minimum_number_of_test_nodes_wanted):

    # another prototype
    features = []
    labels = []
    ids = []

    test_nodes = []
    treat_as_test_node = False
    for citing_id in data['edges'].keys():
        cited_ids = data['edges'][citing_id]
        if len(test_nodes) < minimum_number_of_test_nodes_wanted:
            treat_as_test_node = True
        if citing_id in data['features_for_id'].keys() and citing_id in data['label_for_id'].keys():
            for i in range(len(cited_ids)):
                if cited_ids[i] in data['features_for_id'].keys() and cited_ids[i] in data['label_for_id'].keys():
                    ids.append([citing_id, cited_ids[i]])
                    the_elements = [
                        data['features_for_id'][citing_id], data['features_for_id'][cited_ids[i]]
                    ]
                    the_labels = [
                        process_label(data['label_for_id'][citing_id]), process_label(data['label_for_id'][cited_ids[i]])
                    ]
                    if not (the_labels[0] == "NOLABEL" or the_labels[1] == "NOLABEL"):
                        features.append(the_elements)
                        labels.append(the_labels)
                        if treat_as_test_node:
                            test_nodes += [citing_id, cited_ids[i]]
                            test_nodes = list(set(test_nodes))
        treat_as_test_node = False

    features_train, labels_train, features_test, labels_test = divide_to_test_and_train(features, labels, test_nodes, ids)

    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(features_train, labels_train):
        try:
            trainer.append(xseq, yseq)
        except:
            pdb.set_trace()

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })


    trainer.train('cora%d.crfsuite' % minimum_number_of_test_nodes_wanted)

    tagger = pycrfsuite.Tagger()
    tagger.open('cora%d.crfsuite' % minimum_number_of_test_nodes_wanted)

    from collections import Counter
    info = tagger.info()

    def print_transitions(trans_features):
        for (label_from, label_to), weight in trans_features:
            print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

    print("Top likely transitions:")
    print_transitions(Counter(info.transitions).most_common(15))

    print("\nTop unlikely transitions:")
    print_transitions(Counter(info.transitions).most_common()[-15:])

    def full_classification_report(y_true, y_pred):
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        return classification_report(
            y_true_combined,
            y_pred_combined,
            labels = [class_indices[cls] for cls in tagset],
            target_names = tagset,
        )


    y_pred = [tagger.tag(xseq) for xseq in features_test]

    print(full_classification_report(labels_test, y_pred))

#### Test size: 50

In [6]:
print_for_this_number_of_test_nodes(50)

Top likely transitions:
Artificial_Intelligence -> Artificial_Intelligence 5.044041
Programming -> Programming 2.169742
Encryption_and_Compression -> Encryption_and_Compression 2.058263
Databases -> Databases 1.982924
Human_Computer_Interaction -> Human_Computer_Interaction 1.954938
Networking -> Networking 1.948717
Hardware_and_Architecture -> Hardware_and_Architecture 1.905854
Information_Retrieval -> Information_Retrieval 1.649868
Data_Structures__Algorithms_and_Theory -> Data_Structures__Algorithms_and_Theory 1.554492
Information_Retrieval -> Artificial_Intelligence 1.344639
Artificial_Intelligence -> Information_Retrieval 1.297929
Operating_Systems -> Operating_Systems 1.279639
Data_Structures__Algorithms_and_Theory -> Artificial_Intelligence 1.243754
Human_Computer_Interaction -> Artificial_Intelligence 1.114837
Programming -> Artificial_Intelligence 1.101798

Top unlikely transitions:
Databases -> Networking -1.075442
Information_Retrieval -> Programming -1.198018
Encryption_and

#### Test size: 100

In [7]:
print_for_this_number_of_test_nodes(100)

Top likely transitions:
Artificial_Intelligence -> Artificial_Intelligence 4.801341
Programming -> Programming 2.313652
Hardware_and_Architecture -> Hardware_and_Architecture 2.099289
Encryption_and_Compression -> Encryption_and_Compression 2.080741
Networking -> Networking 2.076263
Data_Structures__Algorithms_and_Theory -> Data_Structures__Algorithms_and_Theory 1.884011
Information_Retrieval -> Information_Retrieval 1.868383
Human_Computer_Interaction -> Human_Computer_Interaction 1.846793
Databases -> Databases 1.758569
Information_Retrieval -> Artificial_Intelligence 1.340177
Data_Structures__Algorithms_and_Theory -> Artificial_Intelligence 1.300631
Artificial_Intelligence -> Information_Retrieval 1.240355
Artificial_Intelligence -> Data_Structures__Algorithms_and_Theory 1.141417
Artificial_Intelligence -> Programming 1.060729
Programming -> Artificial_Intelligence 1.057742

Top unlikely transitions:
Networking -> Databases -1.138574
Databases -> Networking -1.171651
Encryption_and_

#### Test size: 20000

See the f1 drop?

In [9]:
print_for_this_number_of_test_nodes(20000)

Top likely transitions:
Artificial_Intelligence -> Artificial_Intelligence 3.957683
Human_Computer_Interaction -> Human_Computer_Interaction 3.277456
Information_Retrieval -> Information_Retrieval 2.982195
Encryption_and_Compression -> Encryption_and_Compression 2.958465
Programming -> Programming 2.702600
Hardware_and_Architecture -> Hardware_and_Architecture 2.626303
Databases -> Databases 2.491668
Data_Structures__Algorithms_and_Theory -> Data_Structures__Algorithms_and_Theory 2.110655
Networking -> Networking 1.785344
Operating_Systems -> Operating_Systems 1.639267
Artificial_Intelligence -> Information_Retrieval 1.242835
Information_Retrieval -> Artificial_Intelligence 0.880461
Human_Computer_Interaction -> Artificial_Intelligence 0.665403
Operating_Systems -> Programming 0.603918
Artificial_Intelligence -> Programming 0.602625

Top unlikely transitions:
Databases -> Networking -0.994284
Encryption_and_Compression -> Hardware_and_Architecture -1.083578
Hardware_and_Architecture ->