In [1]:
import sys
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from utils.system import parse_params, check_version


In [2]:
def generate_features(stances, dataset, name):
    """Assemble the feature matrix and label list for a collection of stances.

    Args:
        stances: iterable of records indexable by 'Stance', 'Headline' and
            'Body ID'.
        dataset: object whose ``articles`` attribute is indexed by a record's
            'Body ID' to obtain the article body.
        name: string tag embedded in the ``features/<kind>.<name>.npy`` paths
            handed to ``gen_or_load_feats``.

    Returns:
        A pair ``(X, y)``: ``X`` is the column-wise concatenation (``np.c_``)
        of the hand, polarity, refuting and overlap feature blocks, in that
        order; ``y`` is a list holding each stance's index within ``LABELS``.
    """
    headlines = []
    bodies = []
    labels = []
    for record in stances:
        labels.append(LABELS.index(record['Stance']))
        headlines.append(record['Headline'])
        bodies.append(dataset.articles[record['Body ID']])

    # Extractors are run in this order (overlap, refuting, polarity, hand);
    # each one is paired with the path passed to gen_or_load_feats.
    # NOTE(review): presumably that path is an on-disk cache - confirm in
    # feature_engineering.
    extractor_table = (
        (word_overlap_features, "features/overlap." + name + ".npy"),
        (refuting_features, "features/refuting." + name + ".npy"),
        (polarity_features, "features/polarity." + name + ".npy"),
        (hand_features, "features/hand." + name + ".npy"),
    )
    overlap_block, refuting_block, polarity_block, hand_block = (
        gen_or_load_feats(extractor, headlines, bodies, cache_path)
        for extractor, cache_path in extractor_table
    )

    # Column order in the final matrix differs from the generation order.
    feature_matrix = np.c_[hand_block, polarity_block, refuting_block, overlap_block]
    return feature_matrix, labels

In [3]:
# Load the training dataset and generate folds.
# kfold_split is asked for 10 folds and also returns a separate hold-out split;
# get_stances_for_folds then returns the stances for the folds (indexed by fold
# id in later cells) and for the hold-out split.
# NOTE(review): how the split is drawn is defined in utils.generate_test_splits
# and is not visible in this notebook - confirm there.
d = DataSet()
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

Reading dataset
Total stances: 49972
Total bodies: 1683


In [4]:
# Sanity check: print the fold ids in the order fold_stances yields them.
# The recorded run printed them unsorted (6, 0, 7, ...), so code looping over
# fold_stances should not assume the folds arrive as 0, 1, 2, ...
for fold in fold_stances:
    print(fold)

6
0
7
5
2
8
9
3
1
4


In [5]:
# Load the competition dataset and build its feature matrix and label list.
# The "competition" tag is the name generate_features embeds in the
# features/*.npy paths it passes to gen_or_load_feats.
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")


Reading dataset
Total stances: 25413
Total bodies: 904


In [6]:
# Per-fold feature matrices (Xs) and label lists (ys), keyed by fold id.
Xs = dict()
ys = dict()

# Load/Precompute all features now so the training loop only has to stack them.
# The hold-out stances come from the training dataset `d` and are tagged "holdout";
# each fold is tagged with its own id converted to a string.
X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
for fold in fold_stances:
    # Progress indicator: one line per fold processed.
    print(fold)
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))

6
0
7
5
2
8
9
3
1
4


In [11]:
# Debug aid: show the training-fold ids left over from the most recent
# iteration of the per-fold training loop. `ids` is only created inside that
# loop, so guard the lookup: when the notebook is run top to bottom on a fresh
# kernel this cell executes before the loop and would otherwise raise NameError.
if "ids" in globals():
    print(ids)
else:
    print("`ids` is not defined yet; run the per-fold training cell first")


[0, 1, 2, 3, 5, 6, 7, 8, 9]


In [7]:
# Running best across the cross-validation folds: the highest normalised fold
# score seen so far and the classifier that achieved it.
# NOTE: despite its name, best_fold stores the fitted classifier, not a fold id.
best_score = 0
best_fold = None


In [8]:
# Classifier for each fold: train on all other folds, score on the held-out
# fold, and keep the classifier with the best normalised score.
#
# The running best is (re)initialised here so this cell is self-contained:
# re-running it (e.g. after changing hyperparameters) must not compare against
# a best_score left over from an earlier run, which could keep a stale model.
# NOTE: despite its name, best_fold stores the fitted classifier, not a fold id.
best_score = 0
best_fold = None

for fold in fold_stances:
    # Train on every fold except the one being evaluated (excluded by value).
    ids = [i for i in range(len(folds)) if i != fold]

    X_train = np.vstack([Xs[i] for i in ids])
    y_train = np.hstack([ys[i] for i in ids])

    X_test = Xs[fold]
    y_test = ys[fold]

    # Fixed random_state keeps the fitted models reproducible between runs.
    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(X_train, y_train)

    # Map integer class ids back to their label strings for scoring.
    predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]

    # Normalise by the score of a perfect submission (actual vs. actual) so
    # that folds are comparable with each other.
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    score = fold_score/max_fold_score

    print("Score for fold "+ str(fold) + " was - " + str(score))
    if score > best_score:
        best_score = score
        best_fold = clf

      Iter       Train Loss   Remaining Time 
         1       25162.6313            1.13m
         2       22847.0201            1.07m
         3       21134.1453            1.04m
         4       19790.3045            1.00m
         5       18746.5909           58.21s
         6       17851.8813           56.82s
         7       17131.5277           55.90s
         8       16518.7629           55.00s
         9       16038.1396           54.28s
        10       15574.3099           53.61s
        20       13390.3986           49.14s
        30       12636.8896           46.09s
        40       12235.7151           44.62s
        50       11995.7317           42.21s
        60       11803.6439           40.49s
        70       11661.5428           38.74s
        80       11542.9736           35.99s
        90       11438.9781           35.00s
       100       11349.8780           32.26s
       200       10659.7475            0.00s
Score for fold 6 was - 0.7700373455903476
      Iter  

         6       17902.0201           59.37s
         7       17168.4199           58.81s
         8       16557.2036           58.03s
         9       16064.7645           57.01s
        10       15611.9675           56.15s
        20       13424.9901           52.56s
        30       12672.2394           50.12s
        40       12263.1763           46.55s
        50       12007.7795           43.84s
        60       11823.6370           40.97s
        70       11680.8734           37.65s
        80       11561.5658           34.46s
        90       11459.9041           31.29s
       100       11373.6203           28.22s
       200       10712.2962            0.00s
Score for fold 1 was - 0.7927084788378265
      Iter       Train Loss   Remaining Time 
         1       25272.4807            1.09m
         2       22947.0423            1.11m
         3       21232.8755            1.26m
         4       19894.5762            1.21m
         5       18840.2178            1.26m
         6  

In [9]:
# Evaluate the classifier selected during cross-validation, first on the
# hold-out (dev) split and then on the competition test set.
# best_fold starts out as None, so fail with a clear message instead of an
# AttributeError on `None.predict` if no classifier was ever selected.
if best_fold is None:
    raise RuntimeError("No classifier was selected; run the per-fold training cell first")

#Run on Holdout set and report the final score on the holdout set
predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]
print("Scores on the dev set")
report_score(actual,predicted)
print("")
print("")

#Run on competition dataset
predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
actual = [LABELS[int(a)] for a in y_competition]

print("Scores on the test set")
# Kept as the cell's last expression so its return value is still displayed.
report_score(actual,predicted)

<class 'list'>
Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    115    |     8     |    557    |    82     |
-------------------------------------------------------------
| disagree  |    16     |     3     |    128    |    15     |
-------------------------------------------------------------
|  discuss  |    61     |     3     |   1529    |    207    |
-------------------------------------------------------------
| unrelated |     5     |     1     |    96     |   6796    |
-------------------------------------------------------------
Score: 3539.25 out of 4448.5	(79.56052602000675%)


Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    167

75.0885098165433