### SemEval2019 Hyperpartisan News Detection
#### Using GloVe as document representation

In [16]:
from lxml.etree import iterparse
import xml

import numpy as np
import random
import os
# Single seed shared by every source of randomness in this notebook.
random_seed = 42
seed_value = random_seed

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so assigning it
# here only affects child processes, not str/bytes hashing in this running kernel.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

import pandas as pd
import pickle

import sys
sys.path.append('/home/ruan/Documentos/git/tcc-ii-ir-features-text-mining/tool-testing/corpus_2_solution_2/')

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from sklearn import preprocessing

from utils import *

import nltk
# nltk.download('punkt')

In [17]:
import inspect
import datetime
import logging
import pickle
import time
root_path = "/home/ruan/Documentos/git/tcc-ii-ir-features-text-mining/tool-testing/"
sys.path.insert(0, root_path)
from indextoolmanager import IndexToolManager

# Name of the experiment logger; also embedded in the log file names.
variable_name = 'TRAIN_CLF'
# Timestamp that makes each run's log files unique.
datestr = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Root logger: write INFO and above to a per-run debug file.
# basicConfig() does nothing if the root logger already has handlers, so
# re-running this cell keeps the file chosen by the first run.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    filename=f'{root_path}logs/C2-S2-{datestr}-{variable_name}_debug.log',
                    filemode='w')
# define a Handler which writes INFO messages or higher to the sys.stderr
console = logging.StreamHandler()
console.setLevel(logging.INFO)
# set a format which is simpler for console use
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
# tell the handler to use this format
console.setFormatter(formatter)
# Attach the console handler to the *named* experiment logger (not the root
# logger), and only once: logging.getLogger() returns the same object on every
# call, so re-running this cell would otherwise stack handlers and print each
# record several times. FileHandler subclasses StreamHandler, hence the
# explicit exclusion.
_clf_logger = logging.getLogger(variable_name)
_has_console_handler = any(
    isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler)
    for h in _clf_logger.handlers
)
if not _has_console_handler:
    _clf_logger.addHandler(console)

In [18]:
# Experiment logger used by the training cells below.
mylogger = logging.getLogger(variable_name)
# Additional per-run log file that receives INFO and above from this logger.
# NOTE(review): every execution of this cell adds one more FileHandler to the
# same logger object; re-running the cell will duplicate records.
handler1 = logging.FileHandler(f'{root_path}logs/{datestr}-{variable_name}.log')
handler1.setLevel(logging.INFO)
formatter = logging.Formatter(
    '%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

handler1.setFormatter(formatter)
mylogger.addHandler(handler1)

# Experiment configuration (copied into exp_dict and read by gloveVectorize).
# Append the six BM25-based IR features to every document vector.
add_ir_variables = True
# Forwarded as `ignore_first_result` to the IR queries issued for training documents.
ignore_first_result = True
# Identifier shared by every result row logged for this run.
exp_id = str(datetime.datetime.now())
# IR backend selector; gloveVectorize handles 'arango', 'elastic' and 'zettair'.
tool = 'zettair'
# Passed to IndexToolManager as `top_k`.
ir_top_k = 100
hyperpartisan_db_name = 'hyperpartisan_split_42_bulk'
hyperpartisan_orig_db_name = 'hyperpartisan_bulk'

In [19]:
# Metadata describing this experiment run. It is merged into every record
# passed to testTool.log_result(); the classifier-specific fields start empty
# and are filled in by the training cells.
exp_dict = {
    'exp_id': exp_id,
    'tool': tool,
    'db': 'hyperpartisan',
    'db_name': hyperpartisan_db_name,
    'add_ir_variables': add_ir_variables,
    'solution_number': '2',
    'solution_name': '4_tom',
    'ir_top_k': ir_top_k,
    # 'train_input': '',
    # 'train_epochs': '',
    'ignore_first_result': ignore_first_result,
    'random_seed': random_seed,
    'classifier': '',
    'svc_C': '',
    'svc_max_iter': '',
}
# IR tool client used for the BM25 feature queries and for result logging.
# It is built once: the original constructed it twice with identical
# arguments and discarded the first instance.
testTool = IndexToolManager(
    indexName=str(hyperpartisan_db_name), top_k=exp_dict['ir_top_k'])

In [20]:
def readFiles(textFile, labelFile):
    '''
    Read the articles and their ground-truth labels from the two XML files.

    textFile:  path of the articles XML; every <article> element becomes one document.
    labelFile: path of the ground-truth XML, parsed with GroundTruthHandler.

    Returns (X, y) as numpy arrays. Each entry of X is the cleaned title, ". ",
    and the cleaned body truncated to its first 1000 whitespace-separated tokens.
    y is whatever GroundTruthHandler collected into the list it was given
    (presumably one label per article, in file order -- confirm in utils).
    '''
    # `import xml` alone does not load the `xml.sax` submodule; import it
    # explicitly so the parse call does not depend on another module having
    # imported it first.
    import xml.sax

    X, y = [], []
    # Separate name for the handle so the path parameter is not shadowed.
    with open(labelFile) as label_stream:
        xml.sax.parse(label_stream, GroundTruthHandler(y))
    for _event, elem in iterparse(textFile):
        if elem.tag == "article":
            title = cleanQuotations(elem.attrib['title'])
            text = cleanQuotations("".join(elem.itertext()))
            text = cleanText(fixup(text))
            # keep only the first 1000 tokens of the body
            text = ' '.join(text.split()[:1000])
            X.append(title + ". " + text)
            # drop the element's content once it has been consumed
            elem.clear()
    return np.asarray(X), np.asarray(y)

In [21]:
def read_glove(path, dim):
    '''
    Read the GloVe vectors of dimension dim from the file
    path + 'glove.6B.<dim>d.txt' (path must end with a path separator).

    Returns a dict mapping each token to its vector as a 1-D numpy array.
    '''
    df = pd.read_csv(
        path + 'glove.6B.' + str(dim) + 'd.txt',
        sep=" ",
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary token text
        header=None,
        index_col=0,
        # Without this, tokens such as "null", "nan" or "NA" are parsed as
        # missing values and end up as NaN keys instead of words.
        keep_default_na=False,
    )
    # Pair every token with its row directly; no transpose of the whole frame.
    return dict(zip(df.index, df.values))

In [22]:
# set path for data
# Directory holding the corpus XML files and directory holding the pretrained vectors.
dataPath = root_path + 'db_hyperpartisan/'
pretrained_wv_path = root_path + "corpus_2_solution_2/pretrained_wv/"

# Articles of the by-article training set.
textFile = dataPath + 'articles-training-byarticle.xml'

# Ground-truth labels for the same set.
labelFile = dataPath + "ground-truth-training-byarticle.xml"

# read in data and glove vectors
texts, labels = readFiles(textFile, labelFile)

# 300-dimensional vectors, i.e. the file glove.6B.300d.txt
glove = read_glove(pretrained_wv_path, 300)

# split the samples with the same seed to compare results with other methods
# id1, id2 = fixedTestSplit(labels)
# id1 selects the training rows and id2 the test rows (both are used to index
# `labels` and the feature matrix in the cells below).
id1, id2 = fixedTestSplit42(labels)

In [23]:
def gloveVectorize(glove, text):
    '''
    Build one feature row per article.

    The first `dim` columns are the average of the pretrained GloVe vectors of
    the article's tokens that are in the vocabulary (stop words removed by
    customTokenize). An article with no such token gets an all-zero vector
    instead of the NaN row a 0/0 division would produce.

    When exp_dict['add_ir_variables'] is truthy, six BM25-based values returned
    by the IR tool selected in exp_dict['tool'] are appended (AVG, COUNT and
    SUM for class 0, then for class 1). An unsupported tool name raises
    KeyError, because no IR values are available for it.

    glove: dict token -> vector; must contain "the" (used to read the dimension).
    text:  sequence of article strings.

    Reads the module globals exp_dict, testTool, id1 and ignore_first_result,
    and logs the mean IR query time through testTool.log_result.

    Returns a numpy array of shape (len(text), dim) or (len(text), dim + 6).
    '''
    ir_feature_keys = (
        'CLASS_0_BM25_AVG', 'CLASS_0_BM25_COUNT', 'CLASS_0_BM25_SUM',
        'CLASS_1_BM25_AVG', 'CLASS_1_BM25_COUNT', 'CLASS_1_BM25_SUM',
    )
    time_query_list = []
    dim = len(glove["the"])
    use_ir = bool(exp_dict['add_ir_variables'])
    tool_name = exp_dict['tool']
    n_ir = len(ir_feature_keys) if use_ir else 0
    X = np.zeros((len(text), dim + n_ir))
    # Build the membership set once instead of scanning id1 for every article.
    train_ids = set(np.asarray(id1).ravel().tolist()) if use_ir else set()

    for text_id, t in enumerate(text):
        # tokenize, remove stopwords and keep only in-vocabulary tokens
        words = [w for w in customTokenize(t, rm_stopwords=True) if w in glove]
        doc_vec = np.zeros(dim)
        for word in words:
            doc_vec += glove[word]
        if words:
            doc_vec /= len(words)
        # else: leave the zero vector; dividing by len(words) == 0 would give NaN

        ir_features = []
        if use_ir:
            ir_variables = {}
            started = time.time()
            # only documents of the training split may skip their first hit
            ign_first = ignore_first_result and (text_id in train_ids)
            if tool_name == 'arango':
                ir_variables = testTool.arango_get_IR_variables(
                    t, 'true', ignore_first_result=ign_first)
            elif tool_name == 'elastic':
                ir_variables = testTool.elastic_get_IR_variables(
                    t, 'true', ignore_first_result=ign_first)
            elif tool_name == 'zettair':
                ir_variables = testTool.zettair_get_IR_variables(
                    t, 'true', False, ignore_first_result=ign_first)
            time_query_list.append(float(time.time() - started))
            ir_features = [ir_variables[key] for key in ir_feature_keys]
        X[text_id, :] = np.concatenate((doc_vec, ir_features), axis=None)

    result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
    # No query was timed when IR features are disabled; report NaN without
    # triggering numpy's empty-mean warning.
    time_query = np.mean(time_query_list) if time_query_list else float('nan')
    testTool.log_result(result_id, {
        'variable': 'TIME_QUERY',
        ** exp_dict,
        ** testTool.get_parameters(),
        'execution_type': 'training_test',
        'number_queries': str(len(time_query_list)),
        'value': str(time_query),
    })

    return X

In [24]:
# Feature matrix for every article (GloVe average, plus IR features when enabled).
glove_texts = gloveVectorize(glove, texts)

# Select the training and test rows with the fixed split computed earlier.
train_x = glove_texts[id1]
test_x = glove_texts[id2]
print('Train shape: ', train_x.shape)
print('Test shape: ', test_x.shape)

Train shape:  (430, 306)
Test shape:  (215, 306)


In [25]:
# Show the last six columns (the appended IR features when add_ir_variables is on)
# of training rows 1 to 3.
train_x[1:4, -6:]

array([[  6.67843345,  11.        ,  73.462768  ,   6.54476044,
          9.        ,  58.902844  ],
       [  9.17227862,  13.        , 119.239622  ,   7.926285  ,
          7.        ,  55.483995  ],
       [ 10.40865945,  11.        , 114.495254  ,   9.69551867,
          9.        ,  87.259668  ]])

In [26]:
import warnings
# warnings.filterwarnings("ignore", category = ignoreConvergenceWarning)
# Hide warnings whose message starts with "Solver terminated early"
# (the SVC below is capped at max_iter=5000).
warnings.filterwarnings('ignore', 'Solver terminated early.*')
# Values of the SVC regularisation parameter C to try.
C = [0.1, 0.5, 0.6, 0.7, 0.9, 1, 1.1, 1.2, 1.5, 1.7, 1.8, 1.9, 2.2, 2.5, 3.5, 4.5, 5,10,20]
for c in C:
    # Unfitted pipeline handed to cross_val_score in the print call below.
    kernel_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter = 5000))
    ])
    # Identically configured pipeline, fitted on the training split and scored on the test split.
    svm2 = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(C=c, gamma="auto", max_iter = 5000))
    ])
    svm2.fit(train_x, labels[id1])
    trn_pred = svm2.predict(train_x)
    tst_pred = svm2.predict(test_x)
    cm = confusion_matrix(labels[id2], tst_pred)
    # Unpacking assumes a 2x2 matrix, i.e. exactly two classes among true and predicted labels.
    tn, fp, fn, tp = cm.ravel()
    # F1 of the positive class computed from the confusion-matrix counts.
    f1 = 2.0*tp/(2.0*tp + fp + fn)
    # "acc" is the mean 10-fold cross-validation score on the training split;
    # "acc_test" and "f1_test" come from the held-out test split.
    print("[KernelSVM] C=%f | acc=%f | acc_test=%f | f1_test=%f" %(c,
        np.mean(cross_val_score(kernel_svm, train_x, labels[id1], cv=10)),
        accuracy_score(labels[id2], tst_pred),
        f1))

[KernelSVM] C=0.100000 | acc=0.672093 | acc_test=0.683721 | f1_test=0.244444
[KernelSVM] C=0.500000 | acc=0.788372 | acc_test=0.813953 | f1_test=0.692308
[KernelSVM] C=0.600000 | acc=0.790698 | acc_test=0.813953 | f1_test=0.692308
[KernelSVM] C=0.700000 | acc=0.797674 | acc_test=0.818605 | f1_test=0.702290
[KernelSVM] C=0.900000 | acc=0.797674 | acc_test=0.827907 | f1_test=0.729927
[KernelSVM] C=1.000000 | acc=0.802326 | acc_test=0.823256 | f1_test=0.724638
[KernelSVM] C=1.100000 | acc=0.800000 | acc_test=0.827907 | f1_test=0.733813
[KernelSVM] C=1.200000 | acc=0.800000 | acc_test=0.832558 | f1_test=0.742857
[KernelSVM] C=1.500000 | acc=0.797674 | acc_test=0.823256 | f1_test=0.736111
[KernelSVM] C=1.700000 | acc=0.795349 | acc_test=0.818605 | f1_test=0.731034
[KernelSVM] C=1.800000 | acc=0.793023 | acc_test=0.818605 | f1_test=0.731034
[KernelSVM] C=1.900000 | acc=0.790698 | acc_test=0.809302 | f1_test=0.721088
[KernelSVM] C=2.200000 | acc=0.786047 | acc_test=0.809302 | f1_test=0.721088

In [27]:
# Values of the inverse regularisation strength C to try for logistic regression.
C = [0.00001, 0.001, 0.01, 0.1, 0.5, 0.8, 0.9, 1, 2, 3, 5]
for c in C:
    # Unfitted estimator handed to cross_val_score in the print call below.
    lr = LogisticRegression(solver = 'lbfgs', C = c, max_iter=10000)

    # Identically configured estimator, fitted on the training split and scored on the test split.
    # NOTE(review): unlike the SVC cells, the features are not standardised here.
    lr2 = LogisticRegression(solver = 'lbfgs', C = c, max_iter=10000)
    lr2.fit(train_x, labels[id1])
    trn_pred = lr2.predict(train_x)
    tst_pred = lr2.predict(test_x)
    cm = confusion_matrix(labels[id2], tst_pred)
    # Unpacking assumes a 2x2 matrix, i.e. exactly two classes among true and predicted labels.
    tn, fp, fn, tp = cm.ravel()
    # F1 of the positive class computed from the confusion-matrix counts.
    f1 = 2.0*tp/(2.0*tp + fp + fn)
    # "acc" is the mean 10-fold cross-validation score on the training split;
    # "acc_test" and "f1_test" come from the held-out test split.
    print("[LogisticR] C=%f | acc=%f | acc_test=%f | f1_test=%f" %(c,
        np.mean(cross_val_score(lr, train_x, labels[id1], cv=10)), 
        accuracy_score(labels[id2], tst_pred),
        f1))

[LogisticR] C=0.000010 | acc=0.681395 | acc_test=0.674419 | f1_test=0.375000
[LogisticR] C=0.001000 | acc=0.686047 | acc_test=0.693023 | f1_test=0.541667
[LogisticR] C=0.010000 | acc=0.695349 | acc_test=0.697674 | f1_test=0.532374
[LogisticR] C=0.100000 | acc=0.713953 | acc_test=0.716279 | f1_test=0.541353
[LogisticR] C=0.500000 | acc=0.725581 | acc_test=0.767442 | f1_test=0.652778
[LogisticR] C=0.800000 | acc=0.732558 | acc_test=0.753488 | f1_test=0.639456
[LogisticR] C=0.900000 | acc=0.732558 | acc_test=0.753488 | f1_test=0.639456
[LogisticR] C=1.000000 | acc=0.734884 | acc_test=0.758140 | f1_test=0.648649
[LogisticR] C=2.000000 | acc=0.730233 | acc_test=0.781395 | f1_test=0.684564
[LogisticR] C=3.000000 | acc=0.725581 | acc_test=0.772093 | f1_test=0.675497
[LogisticR] C=5.000000 | acc=0.737209 | acc_test=0.786047 | f1_test=0.701299


In [28]:
# Single configuration chosen for a detailed report: standardised features + SVC with C=0.9.
model = Pipeline([
            ("scaler", StandardScaler()),
            ("svc", SVC(C=0.9, gamma="auto", max_iter = 5000))
        ])
# model = LogisticRegression(solver = 'lbfgs', C = 1, max_iter=1000)
model.fit(train_x, labels[id1])
trn_pred = model.predict(train_x)
tst_pred = model.predict(test_x)
print('Train accuracy: ', accuracy_score(labels[id1], trn_pred))
print('Test accuracy: ', accuracy_score(labels[id2], tst_pred))
# Precision and recall are reported for the class labelled 'true'.
print('Test precision: ', precision_score(labels[id2], tst_pred, pos_label='true'))
print('Test recall: ', recall_score(labels[id2], tst_pred, pos_label='true'))

Train accuracy:  0.9209302325581395
Test accuracy:  0.827906976744186
Test precision:  0.847457627118644
Test recall:  0.6410256410256411


In [29]:
def log_resultt():
    """Log the current test accuracy and F1 stored in exp_dict.

    Two records are sent through testTool.log_result, 'CLF_ACC' first and
    'CLF_F1' second. Each record carries the tool parameters, every field of
    exp_dict, the execution type 'training_test' and the metric as 'value'.
    exp_dict must already contain the keys 'accuracy' and 'f1'.
    """
    shared_fields = dict(exp_dict, execution_type='training_test')
    for variable, metric_key in (('CLF_ACC', 'accuracy'), ('CLF_F1', 'f1')):
        # fresh timestamp-based id for every record
        stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
        record = {
            'variable': variable,
            **testTool.get_parameters(),
            **shared_fields,
            'value': shared_fields[metric_key],
        }
        testTool.log_result(stamp, record)

In [30]:
import warnings
# warnings.filterwarnings("ignore", category = ignoreConvergenceWarning)
# Hide the warning whose message starts with "F-score is" during the F1 cross-validation.
warnings.filterwarnings('ignore', 'F-score is.*')
# Binarised training labels, used for the cross-validation calls below
# (scoring='f1' needs a well-defined positive class).
lb = preprocessing.LabelBinarizer()
x_labels = lb.fit_transform(labels[id1]).ravel()
# A few very small values, a grid from 0.1 to 10 in steps of 0.1, and a few very large values.
C = [0.00001, 0.001, 0.002, 0.005, 0.01, 0.05] + np.around(np.linspace(0.1,10,100,endpoint=True),2).tolist() + [100.0, 10000.0, 99999999.0]
# C = [0.9]
# exp_dict is a module-level name and is only mutated here, so no `global`
# statement is needed. At module scope it has no effect, and it is a
# SyntaxError when the cells are compiled as one module, because exp_dict is
# assigned earlier in the file.
exp_dict['classifier'] = 'SVC'
exp_dict['svc_max_iter'] = 5000
for c in C:
    exp_dict['svc_C'] = c
    model = Pipeline([
                ("scaler", StandardScaler()),
                ("svc", SVC(C=c, gamma="auto", max_iter = exp_dict['svc_max_iter']))
            ])
    model.fit(train_x, labels[id1])
    trn_pred = model.predict(train_x)
    tst_pred = model.predict(test_x)
    exp_dict['acc_train'] = float(accuracy_score(labels[id1], trn_pred))
    cm = confusion_matrix(labels[id2], tst_pred)
    # Unpacking assumes a 2x2 matrix, i.e. exactly two classes among true and predicted labels.
    tn, fp, fn, tp = cm.ravel()
    exp_dict['TN'] = int(tn)
    exp_dict['FP'] = int(fp)
    exp_dict['FN'] = int(fn)
    exp_dict['TP'] = int(tp)
    exp_dict['accuracy'] = float(np.mean(tst_pred == labels[id2]))
    exp_dict['f1'] = float(2.0*tp/(2.0*tp + fp + fn))

    # cross_val_score clones the pipeline, so passing the fitted `model` is safe.
    exp_dict['acc_cv_10'] = float(np.mean(cross_val_score(model, train_x, x_labels, cv=10, scoring='accuracy')))
    exp_dict['f1_cv_10'] = float(np.mean(cross_val_score(model, train_x, x_labels, cv=10, scoring='f1')))
    mylogger.info(f'[SVC] C={c} | acc_train={exp_dict["acc_train"]} | acc_cv_10={exp_dict["acc_cv_10"]} | f1_cv_10={exp_dict["f1_cv_10"]} | acc_test={exp_dict["accuracy"]} | f1_test={exp_dict["f1"]}')
    # | tp={tp} | fn={fn} | fp={fp} | tn={tn}
    # Persist this configuration's accuracy and F1 through the IR tool's result log.
    log_resultt()

 | acc_cv_10=0.7837209302325581 | f1_cv_10=0.7011319009351137 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4.3 | acc_train=1.0 | acc_cv_10=0.7837209302325581 | f1_cv_10=0.7011319009351137 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4.4 | acc_train=1.0 | acc_cv_10=0.7837209302325581 | f1_cv_10=0.7011319009351137 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4.4 | acc_train=1.0 | acc_cv_10=0.7837209302325581 | f1_cv_10=0.7011319009351137 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4.5 | acc_train=1.0 | acc_cv_10=0.786046511627907 | f1_cv_10=0.7058117038907789 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4.5 | acc_train=1.0 | acc_cv_10=0.786046511627907 | f1_cv_10=0.7058117038907789 | acc_test=0.813953488372093 | f1_test=0.7297297297297297
TRAIN_CLF   : INFO     [SVC] C=4

In [16]:
    # mylogger.info(f'n_right_predictions: {tp+tn}')
    # mylogger.info(f'Accuracy: {accuracy}')
    # mylogger.info(f'TP: {tp} \tFN: {fn}')
    # mylogger.info(f'FP: {fp} \tTN: {tn}')
    # mylogger.info(f'F1: {f1}')

In [0]:
# result_info = {
#     'db': exp_dict['db'],
#     'tool': exp_dict['tool'],
#     'db_name': exp_dict['db_name'],
#     'add_ir_variables': 'true' if add_ir_variables else 'false',
#     'solution_number': exp_dict['solution_number'],
#     'solution_name': exp_dict['solution_name'],
#     'TP': str(tp),
#     'FP': str(fp),
#     'TN': str(tn),
#     'FN': str(fn),
#     'accuracy': str(accuracy),
#     'f1': str(f1),
# }
# result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
# testTool.log_result(result_id, {
#     'exp_id': exp_id,
#     'variable': 'CLF_ACC',
#     ** testTool.get_parameters(),
#     ** result_info,
#     'value': str(accuracy),
# })
# result_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S.%f")
# testTool.log_result(result_id, {
#     'exp_id': exp_id,
#     'variable': 'CLF_F1',
#     ** testTool.get_parameters(),
#     ** result_info,
#     'value': str(f1),
# })

In [0]:
# # fit the model to all samples
# model.fit(glove_texts, labels)
# # save the model
# pickle.dump(model, open(root_path + 'corpus_2_solution_2/trained_clsf/svm_glove_adapted.sav', 'wb'))

# # save the predictions
# np.save(root_path + "corpus_2_solution_2/predictions/glove_svm_adapted_pred", tst_pred)