In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import html
import re
import string
import random
from collections import Counter
import math
from score import *
from CountFeatureGenerator import *
from TfidfFeatureGenerator import *
from SvdFeatureGenerator import *
from Word2VecFeatureGenerator import *
from SentimentFeatureGenerator import *
import xgboost as xgb
import lightgbm as lgb



In [2]:
import baseline.utils as base
from baseline.utils import dataset, generate_test_splits, score
from nltk.corpus import stopwords

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [17]:
params_xgb = {
    'max_depth': 6,
    'colsample_bytree':0.6,
    'subsample': 1.0,
    'eta' : 0.1,
    'silent': 1,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class' : 4
}

In [18]:
num_rounds = 1000

In [19]:
def build_data():
    
    # create target variable
    body = pd.read_csv("fnc-1/train_bodies.csv")
    stances = pd.read_csv("fnc-1/train_stances.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = list(map(lambda x: targets_dict[x], data['Stance']))
    
    data_y = data['target'].values

    # read features
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    features = [f for g in generators for f in g.read('train')]
    #print ((features))
    data_x = np.hstack(features)
    '''
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  #Word2VecFeatureGenerator(),
                  #SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('train'):
            data_x = np.hstack((data_x, f))
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  #SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  #SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('train'):
            data_x = np.hstack((data_x, f))
            
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  #SvdFeatureGenerator(),
                  #Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('train'):
            data_x = np.hstack((data_x, f))
            
  '''
   # print(data_x.shape)
    print(data_x[0,:])
    print('data_x.shape')
    print(data_x.shape)
    print('data_y.shape')
    print(data_y.shape)
    print('body_ids.shape')
    print(data['Body ID'].values.shape)
    
    #with open('data_new.pkl', 'wb') as outfile:
    #    cPickle.dump(data_x, outfile, -1)
    #    print 'data saved in data_new.pkl'

    return data_x, data_y, data['Body ID'].values


In [20]:
def perfect_score(truth_y):
    
    score = 0
    for i in range(truth_y.shape[0]):
        if truth_y[i] == 3: score += 0.25
        else: score += 1

    return score

In [21]:
def eval_metric(yhat, dtrain):
    y = dtrain.get_label()
    yhat = np.argmax(yhat, axis =1 )
    predicted = [LABELS[int(a)] for a in yhat]
    actual = [LABELS[int(a)] for a in y]
    s, _ = score_submission(actual, predicted)
    s_perf, _ = score_submission(actual, actual)
    score = float(s) / s_perf
    return 'score', score

In [22]:
def build_test_data():
    body = pd.read_csv('fnc-1/test_bodies.csv')
    stances = pd.read_csv('fnc-1/test_stances.csv')
    data = pd.merge(stances, body, how='left', on='Body ID')
    generators = [
                  CountFeatureGenerator(),
                  TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    features = [f for g in generators for f in g.read('test')]
    #print ((features))
    data_x = np.hstack(features)
    '''
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  SvdFeatureGenerator(),
                  #Word2VecFeatureGenerator(),
                  #SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('test'):
            data_x = np.hstack((data_x, f))
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  #SvdFeatureGenerator(),
                  Word2VecFeatureGenerator(),
                  #SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('test'):
            data_x = np.hstack((data_x, f))
            
    generators = [
                  #CountFeatureGenerator(),
                  #TfidfFeatureGenerator(),
                  #SvdFeatureGenerator(),
                  #Word2VecFeatureGenerator(),
                  SentimentFeatureGenerator()
                  #AlignmentFeatureGenerator()
                 ]
    for g in generators:
        for f in g.read('test'):
            data_x = np.hstack((data_x, f))
    '''
    print(data_x[0, :])
    print('test data_x.shape')
    print(data_x.shape)
    print('test body_ids.shape')
    print(data['Body ID'].values.shape)
    return data_x, data['Body ID'].values

In [12]:
    data_x, data_y, body_ids = build_data()
    test_x, body_ids_test = build_test_data()
    
    w = np.array([1 if y == 3 else 4 for y in data_y])
    print('w:')
    print(w)
    print(np.mean(w))
    
    n_iters = 1200
    print('perfect score: ', perfect_score(data_y))
    print(Counter(data_y))
    
    dtrain = xgb.DMatrix(data_x, label=data_y, weight=w)
    dtest = xgb.DMatrix(test_x)
    watchlist = [(dtrain, 'train')]
    
    bst = xgb.train(params_xgb, dtrain, n_iters, watchlist, feval = eval_metric, verbose_eval=10)
    
    pred_prob_y = bst.predict(dtest).reshape(test_x.shape[0], 4)
    pred_y = np.argmax(pred_prob_y, axis = 1)
    print('pred_y.shape:')
    print(pred_y.shape)
    
   

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

xHeadlineTfidf.shape:
(25413, 769582)
xBodyTfidf.shape:
(25413, 769582)
simTfidf.shape:
(25413, 1)
xHeadlineSvd.shape:
(25413, 50)
xBodySvd.shape:
(25413, 50)
simSvd.shape:
(25413, 1)
headlineVec.shape:
(25413, 300)
bodyVec.shape:
(25413, 300)
simVec.shape:
(25413, 1)
headlineSenti.shape:
(25413, 4)
bodySenti.shape:
(25413, 4)
[  1.30000000e+01   1.30000000e+01   1.00000000e+00   1.20000000e+01
   1.20000000e+01   1.00000000e+00   1.10000000e+01   1.10000000e+01
   1.00000000e+00   1.92000000e+02   1.32000000e+02   6.87500000e-01
   1.91000000e+02   1.77000000e+02   9.26701571e-01   1.90000000e+02
   1.85000000e+02   9.73684211e-01   1.00000000e+00   7.69230769e-02
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   1.00000000e+00   1.60000000e+01   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+0

[0]	train-mlogloss:1.24975	train-score:0.893134
[10]	train-mlogloss:0.602034	train-score:0.922795
[20]	train-mlogloss:0.379256	train-score:0.936246
[30]	train-mlogloss:0.275641	train-score:0.949775
[40]	train-mlogloss:0.221083	train-score:0.959924
[50]	train-mlogloss:0.186205	train-score:0.96943
[60]	train-mlogloss:0.156866	train-score:0.977053
[70]	train-mlogloss:0.136257	train-score:0.982582
[80]	train-mlogloss:0.119207	train-score:0.987136
[90]	train-mlogloss:0.104772	train-score:0.989995
[100]	train-mlogloss:0.094151	train-score:0.992299
[110]	train-mlogloss:0.084379	train-score:0.994227
[120]	train-mlogloss:0.075685	train-score:0.9959
[130]	train-mlogloss:0.068107	train-score:0.997019
[140]	train-mlogloss:0.062168	train-score:0.997806
[150]	train-mlogloss:0.05684	train-score:0.99826
[160]	train-mlogloss:0.051146	train-score:0.998892
[170]	train-mlogloss:0.046782	train-score:0.999036
[180]	train-mlogloss:0.042453	train-score:0.999335
[190]	train-mlogloss:0.038811	train-score:0.9995

In [13]:
predicted = [LABELS[int(a)] for a in pred_y]
    #print predicted

    # save (id, predicted and probabilities) to csv, for model averaging
stances = pd.read_csv("fnc-1/test_stances_processed.csv") # same row order as predicted
    
df_output = pd.DataFrame()
df_output['Headline'] = stances['Headline']
df_output['Body ID'] = stances['Body ID']

print(len(predicted))
print(len(df_output))
df_output['Stance'] = predicted
df_output['prob_0'] = pred_prob_y[:, 0]
df_output['prob_1'] = pred_prob_y[:, 1]
df_output['prob_2'] = pred_prob_y[:, 2]
df_output['prob_3'] = pred_prob_y[:, 3]
    #df_output.to_csv('submission.csv', index=False)
df_output.to_csv('tree_pred_prob_cor2.csv', index=False)
df_output[['Headline','Body ID','Stance']].to_csv('tree_pred_cor2.csv', index=False)

print(df_output)
print(Counter(df_output['Stance']))

25413
25413
                                                Headline  Body ID     Stance  \
0      Ferguson riots: Pregnant woman loses eye after...     2008  unrelated   
1      Crazy Conservatives Are Sure a Gitmo Detainee ...     1550  unrelated   
2      A Russian Guy Says His Justin Bieber Ringtone ...        2  unrelated   
3      Zombie Cat: Buried Kitty Believed Dead, Meows ...     1793  unrelated   
4      Argentina's President Adopts Boy to End Werewo...       37  unrelated   
5         Next-generation Apple iPhones' features leaked     2353  unrelated   
6      Saudi national airline may introduce gender se...      192  unrelated   
7      'Zombie Cat' Claws Way Out Of Grave And Into O...     2482  unrelated   
8         ISIS might be harvesting organs, Iraq tells UN      250  unrelated   
9      Woman has surgery to get third breast: The thr...       85  unrelated   
10     EXCLUSIVE: Apple To Unveil The Long-Awaited Re...     1964    discuss   
11     50 foot crab : Is thi

In [None]:
train()

In [31]:
data_x, data_y, body_ids = build_test_data()

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

ValueError: not enough values to unpack (expected 3, got 2)

In [6]:
test_stances = pd.read_csv('fnc-1/test_stances.csv')
pred_test_stances = pd.read_csv('tree_pred_cor2.csv')

In [11]:
pred_test_stances['Stance'].head()

0    unrelated
1    unrelated
2    unrelated
3    unrelated
4    unrelated
Name: Stance, dtype: object

In [13]:
report_score(test_stances['Stance'], pred_test_stances['Stance'])

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    956    |     0     |    862    |    85     |
-------------------------------------------------------------
| disagree  |    193    |     0     |    416    |    88     |
-------------------------------------------------------------
|  discuss  |    658    |     0     |   3617    |    189    |
-------------------------------------------------------------
| unrelated |    22     |     0     |    183    |   18144   |
-------------------------------------------------------------
Score: 9641.25 out of 11651.25	(82.74863212101705%)


82.74863212101705

In [23]:
data_x, data_y, body_ids = build_data()
test_x, body_ids_test = build_test_data()

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

In [24]:
np.save('final_features', data_x)

In [25]:
np.save('final_test_features', test_x)

In [79]:
params_lgb = {
    'max_depth': 6,
    'colsample_bytree':0.6,
    'subsample': 1.0,
    'eta' : 0.1,
    'objective': 'softmax',
    'eval_metric': 'multi_logloss',
    'num_class' : 4
    }

In [80]:
    data_x, data_y, body_ids = build_data()
    test_x, body_ids_test = build_test_data()
    
    w = np.array([1 if y == 3 else 4 for y in data_y])
    print('w:')
    print(w)
    print(np.mean(w))
    
    n_iters = 500
    print('perfect score: ', perfect_score(data_y))
    print(Counter(data_y))
    d_train = lgb.Dataset(data_x, label=data_y, weight=w)
    d_test = lgb.Dataset(test_x)
    #dtrain = xgb.DMatrix(data_x, label=data_y, weight=w)
    #dtest = xgb.DMatrix(test_x)
    watchlist = [(d_train, 'train')]
    
    bst = lgb.train(params_lgb, d_train, n_iters, feval = eval_metric, verbose_eval=10)

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

In [81]:
    pred_prob_y = bst.predict(test_x).reshape(test_x.shape[0], 4)
    pred_y = np.argmax(pred_prob_y, axis = 1)
    print('pred_y.shape:')
    print(pred_y.shape)

pred_y.shape:
(25413,)


In [82]:
predicted = [LABELS[int(a)] for a in pred_y]
    #print predicted

    # save (id, predicted and probabilities) to csv, for model averaging
stances = pd.read_csv("fnc-1/test_stances_processed.csv") # same row order as predicted
    
df_output = pd.DataFrame()
df_output['Headline'] = stances['Headline']
df_output['Body ID'] = stances['Body ID']

print(len(predicted))
print(len(df_output))
df_output['Stance'] = predicted
df_output['prob_0'] = pred_prob_y[:, 0]
df_output['prob_1'] = pred_prob_y[:, 1]
df_output['prob_2'] = pred_prob_y[:, 2]
df_output['prob_3'] = pred_prob_y[:, 3]
    #df_output.to_csv('submission.csv', index=False)
#df_output.to_csv('tree_pred_prob_cor2.csv', index=False)
df_output[['Headline','Body ID','Stance']].to_csv('tree_pred_cor2.csv', index=False)

print(df_output)
print(Counter(df_output['Stance']))

25413
25413
                                                Headline  Body ID     Stance  \
0      Ferguson riots: Pregnant woman loses eye after...     2008  unrelated   
1      Crazy Conservatives Are Sure a Gitmo Detainee ...     1550  unrelated   
2      A Russian Guy Says His Justin Bieber Ringtone ...        2  unrelated   
3      Zombie Cat: Buried Kitty Believed Dead, Meows ...     1793  unrelated   
4      Argentina's President Adopts Boy to End Werewo...       37  unrelated   
5         Next-generation Apple iPhones' features leaked     2353  unrelated   
6      Saudi national airline may introduce gender se...      192  unrelated   
7      'Zombie Cat' Claws Way Out Of Grave And Into O...     2482  unrelated   
8         ISIS might be harvesting organs, Iraq tells UN      250  unrelated   
9      Woman has surgery to get third breast: The thr...       85  unrelated   
10     EXCLUSIVE: Apple To Unveil The Long-Awaited Re...     1964    discuss   
11     50 foot crab : Is thi

Counter({'unrelated': 18490, 'discuss': 5020, 'agree': 1899, 'disagree': 4})


In [83]:
report_score(test_stances['Stance'], predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1030    |     0     |    789    |    84     |
-------------------------------------------------------------
| disagree  |    207    |     0     |    409    |    81     |
-------------------------------------------------------------
|  discuss  |    640    |     4     |   3636    |    184    |
-------------------------------------------------------------
| unrelated |    22     |     0     |    186    |   18141   |
-------------------------------------------------------------
Score: 9713.5 out of 11651.25	(83.36873725995065%)


83.36873725995065

In [46]:
def evaluate_answer_single(model, feats, truelabels,lenc):
    predicted = model.predict_proba(np.array(feats))
    predicted = np.argmax(predicted,axis =-1)
    ground = list()
    pred = list()
    t=np.argmax(truelabels,axis =-1)
    for i in predicted:
        pred.append(encodertest.inverse_transform(i))
    for i in t:
        ground.append(encodertest.inverse_transform(i))
    score.report_score(ground, pred)

In [48]:
body = pd.read_csv("fnc-1/train_bodies.csv")
stances = pd.read_csv("fnc-1/train_stances.csv")
data = pd.merge(stances, body, how='left', on='Body ID')

In [58]:
data_x, data_y, body_ids = build_data()

feature names: 
['count_of_Headline_unigram', 'count_of_unique_Headline_unigram', 'ratio_of_unique_Headline_unigram', 'count_of_Headline_bigram', 'count_of_unique_Headline_bigram', 'ratio_of_unique_Headline_bigram', 'count_of_Headline_trigram', 'count_of_unique_Headline_trigram', 'ratio_of_unique_Headline_trigram', 'count_of_articleBody_unigram', 'count_of_unique_articleBody_unigram', 'ratio_of_unique_articleBody_unigram', 'count_of_articleBody_bigram', 'count_of_unique_articleBody_bigram', 'ratio_of_unique_articleBody_bigram', 'count_of_articleBody_trigram', 'count_of_unique_articleBody_trigram', 'ratio_of_unique_articleBody_trigram', 'count_of_Headline_unigram_in_articleBody', 'ratio_of_Headline_unigram_in_articleBody', 'count_of_Headline_bigram_in_articleBody', 'ratio_of_Headline_bigram_in_articleBody', 'count_of_Headline_trigram_in_articleBody', 'ratio_of_Headline_trigram_in_articleBody', 'len_sent_Headline', 'len_sent_articleBody', 'fake_exist', 'fraud_exist', 'hoax_exist', 'false

In [65]:
agree_data_x = []
agree_data_y = []
for i in range(len(data_x)):
    if data_y[i] == 0 or data_y[i] == 1:
        agree_data_x.append(data_x[i])
        agree_data_y.append(data_y[i])

In [71]:
d_train = lgb.Dataset(agree_data_x, label=agree_data_y, weight=w2)

In [68]:
params_lgb = {
    'max_depth': 6,
    'colsample_bytree':0.6,
    'subsample': 1.0,
    'eta' : 0.1,
    'objective': 'binary',
    'eval_metric': 'binary_logloss',
    'num_class' : 2
    }

In [70]:
w2 = np.array([4 for y in agree_data_y])

In [72]:
    bst2 = lgb.train(params_lgb2, d_train, n_iters, feval = eval_metric, verbose_eval=10)

In [41]:
pred_prob_my = pd.read_csv('tree_pred_prob_cor2.csv')
pred_prob_sas = pd.read_csv('test_pred.csv', header=None, names = ['agree', 'disagree', 'discuss', 'unrelated'])

In [24]:
pred_prob_my.head()

Unnamed: 0,Headline,Body ID,Stance,prob_0,prob_1,prob_2,prob_3
0,Ferguson riots: Pregnant woman loses eye after...,2008,unrelated,2.380947e-06,9.390648e-08,5.488185e-07,0.999997
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,unrelated,1.741105e-07,4.764846e-09,3.5271e-06,0.999996
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,unrelated,1.57894e-07,9.579308e-09,1.011454e-06,0.999999
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,unrelated,2.187414e-08,1.879898e-08,1.518788e-07,1.0
4,Argentina's President Adopts Boy to End Werewo...,37,unrelated,9.459537e-07,1.632811e-07,2.303649e-07,0.999999


In [43]:
df_result1 = (pred_prob_my['prob_0'] + pred_prob_sas['agree']) / 2
df_result2 = (pred_prob_my['prob_1'] + pred_prob_sas['disagree']) / 2
df_result3 = (pred_prob_my['prob_2'] + pred_prob_sas['discuss']) / 2
df_result4 = (pred_prob_my['prob_3'] + pred_prob_sas['unrelated']) / 2

In [44]:
df_result_final = pd.DataFrame(df_result1, columns=['prob_0'])
df_result_final['disagree'] = df_result2
df_result_final['discuss'] = df_result3
df_result_final['unrelated'] = df_result4

In [36]:
pred_y2 = df_result_final.values

In [45]:
pred_ylabel = np.argmax(pred_y2, axis = 1)
predicted2 = [LABELS[int(a)] for a in pred_ylabel]
    #print predicted

    # save (id, predicted and probabilities) to csv, for model averaging
stances = pd.read_csv("fnc-1/test_stances_processed.csv")

In [46]:
report_score(stances['Stance'], predicted2)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1021    |     0     |    754    |    128    |
-------------------------------------------------------------
| disagree  |    208    |     1     |    370    |    118    |
-------------------------------------------------------------
|  discuss  |    433    |     0     |   3808    |    223    |
-------------------------------------------------------------
| unrelated |    17     |     0     |    197    |   18135   |
-------------------------------------------------------------
Score: 9805.0 out of 11651.25	(84.15406072309838%)


84.15406072309838