This repository will contain preliminary research on the hyperpartisan news detection

https://www.aclweb.org/anthology/S19-2184/

Helper code: https://github.com/HLTCHKUST/hyperpartisan-news-detection/blob/master/utils/data_utils.py

In [2]:
import argparse
import pickle
from gensim.corpora import Dictionary
from data_utils import parse_xml, clean_txt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import string
from bs4 import BeautifulSoup as Soup
import numpy as np
import pandas as pd

parse_xml is good enough to parse data from the xml files. But we need a different parser for the ground truths

In [3]:
def parse_xml_GT(article):
    
    # get meta data
    id = article.get('id')
    label = article.get('hyperpartisan')
    by = article.get('labeled-by')
    url = article.get('url')
    
    result = [id, {'hyperpartisan':label,'labeled-by':by, 'url':url}]
    return result    

In [4]:
def build_dataframe(data_path, label_path):
    
    ## For data:
    
    data_xml_file = open(data_path).read()
    data_soup = Soup(data_xml_file)
    data_articles=data_soup.find_all('article')
    #print("Number of articles: ", len(articles))
    
    data_preprocessed_dict={}
    for a in tqdm(data_articles):
        id, article = parse_xml(a)
        data_preprocessed_dict[id] = article
    
    ##For labels or ground truths
    GT_xml_file = open(label_path).read()
    GT_soup = Soup(GT_xml_file)
    GT_articles=GT_soup.find_all('article')
    #print("Number of articles: ", len(articles))
    
    GT_preprocessed_dict={}
    for a in tqdm(GT_articles):
        id, article = parse_xml_GT(a)
        GT_preprocessed_dict[id] = article
        
    
        
    df_data, df_labels = (pd.DataFrame.from_dict(data_preprocessed_dict).T, pd.DataFrame.from_dict(GT_preprocessed_dict).T)
    
    if(all(df_data.index==df_labels.index)):
        return pd.concat([df_data,df_labels], 1)
    else:
        print("check the data and labels for possibel mismatch")
        return "data and label mismatch"


In [5]:
data_path = "/disk2/sadat/FakeNewsData/Hyperpartisan_news_2019_semeval/articles-training-byarticle-20181122.xml"
label_path = "/disk2/sadat/FakeNewsData/Hyperpartisan_news_2019_semeval/ground-truth-training-byarticle-20181122.xml"

In [6]:
df_Train_byarticle = build_dataframe(data_path, label_path)

100%|██████████| 645/645 [00:00<00:00, 740.83it/s]
100%|██████████| 645/645 [00:00<00:00, 466274.75it/s]


In [7]:
df_Train_byarticle.head()

Unnamed: 0,date,title,internal,external,article_text,hyperpartisan,labeled-by,url
0,2017-09-10,Kucinich: Reclaiming the money power,4,[https://farm8.static.flickr.com/7020/65515348...,from flickr.com: money {mid numberplaceholder}...,True,article,https://www.opednews.com/articles/Kucinich-Rec...
1,2017-10-12,Trump Just Woke Up & Viciously Attacked Puerto...,0,[http://www.cnn.com/2017/03/16/politics/trump-...,donald trump ran on many braggadocios and larg...,True,article,http://bipartisanreport.com/2017/10/12/trump-j...
2,2017-10-11,"Liberals wailing about gun control, but what a...",0,[],photo by justin images in response to joyce ne...,True,article,https://www.reviewjournal.com/opinion/letters/...
3,2017-09-24,Laremy Tunsil joins NFL players in kneeling du...,0,[https://twitter.com/UncleChaps/status/9119271...,after colin kaepernick rightly chose to kneel ...,True,article,https://www.redcuprebellion.com/2017/9/24/1635...
4,2017-10-12,It's 1968 All Over Again,0,[http://www.nationalreview.com/redirect/amazon...,"almost a half century ago, in numberplaceholde...",False,article,https://www.realclearpolitics.com/articles/201...


In [8]:
df_Train_byarticle.hyperpartisan.value_counts()

false    407
true     238
Name: hyperpartisan, dtype: int64

In [9]:
df_Train_byarticle['label'] = df_Train_byarticle['hyperpartisan'].apply(lambda x:1 if x=='true' else 0)

In [10]:
import transformers as ppb

In [11]:
# Loading the pretrained BERT model
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [11]:
def create_padding_and_masking(df, col_name, maxlen=200, overlap=50):
    maximum_word_length = max([len(L.split()) for L in list(df_Train_byarticle.title)])
    if maximum_word_length<maxlen:
        tokenized = df[col_name].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
            # Padding: The token list have difference in size. So, let's put them in the same length
                #tokenized is Dataframe --the dataset that has the tokenized values
        max_len = 0 #for padding
        for i in tokenized.values:
          if len(i) > max_len:
              max_len = len(i)

        padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    
        attention_mask = np.where(padded != 0, 1, 0) #np.where(condition, value that will be true if condition is True, value returned if false)
        
    return padded, attention_mask


        
    

In [15]:
def make_counts(LIST, maxlen, overlap):
    
    '''With a given list of numbers, the maximum length it can be, and what is the overlap,
    The code converts them in sliced lists.
    Example: Z = make_counts([1,2,3,4,5,6,7,8,9,10], maxlen=4, overlap=2)
    Z = [[1, 2, 3, 4], [3, 4, 5, 6], [5, 6, 7, 8], [7, 8, 9, 10]]'''
    
    
    start = 0
    finish = start+maxlen
    listlen = len(LIST)
    idx = []
    while(start<listlen):

      if finish<listlen:
        idx.append([start, finish])
        start = finish-overlap
        finish = start+maxlen
      else:
        finish = listlen
        idx.append([start, finish])
        break
        
    sliced_list = []
    for i in idx:
        sliced_list.append(LIST[i[0]:i[1]])
        
        
    return sliced_list

In [16]:
def create_padding_and_masking_v2(df, col_name, maxlen=200, overlap=50):
    
    '''If the length of the tokenized values are more than 200, https://arxiv.org/abs/1910.10781 this paper suggests
    to break it down to 200 lengthed tokens'''
    
    df1 = pd.DataFrame()
    df1["tokenized"] = df[col_name].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
        # Padding: The token list have difference in size. So, let's put them in the same length
            #tokenized is Dataframe --the dataset that has the tokenized values
    
    
    
    df1["splitted_tokens"] = df1["tokenized"].apply(lambda x:make_counts(x, maxlen, overlap))
    df1["count"] = df1["splitted_tokens"].apply(lambda x:len(x))
    df1["padded_tokens"] = df1["splitted_tokens"].apply(lambda x:[i + [0]*(maxlen-len(i)) for i in x])
    padded = np.array([item for sublist in list(df1["padded_tokens"]) for item in sublist])
    attention_mask = np.where(padded != 0, 1, 0)

    return df1, padded, attention_mask

## Let's work for the titles only

In [20]:
import time
import torch

In [17]:
df_title, padded_title, attention_mask_title = create_padding_and_masking_v2(df_Train_byarticle, "title")

In [19]:
padded_title.shape

(645, 200)

In [21]:
def find_last_hidden_state(padded, attention_mask):
    '''Given the padded value and the attention mask, the pre-trained BERT (distillBERT) model
    will give the required embedding '''
    
    start_time = time.time()
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    time_lapsed = time.time()-start_time
    print(time_lapsed)
    return last_hidden_states
    

In [22]:
last_hidden_states_title = find_last_hidden_state(padded_title, attention_mask_title)

35.6024956703186


In [23]:
features_title = last_hidden_states_title[0][:,0,:] # We are interested in the [cls] tokken only
features_title = features_title.numpy() # features_title is a tensor, we need to convert it to numpy
label = df_Train_byarticle.label.to_numpy() # And so is the labels

#### Run models for titles only

In [24]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import sklearn

In [49]:
kf = KFold(n_splits=10)

In [50]:
data = features_title

In [51]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
score_lr = []
score_SVM = []
for train_index, test_index in kf.split(data):
    train_X = data[train_index]
    train_y = label[train_index]
    test_X = data[test_index]
    test_y = label[test_index]
    lr_clf = LogisticRegression()
    svm_clf = sklearn.svm.SVC()
    lr_clf.fit(train_X, train_y)
    svm_clf.fit(train_X, train_y)
    prediction_lr = lr_clf.predict(test_X)
    prediction_svm = svm_clf.predict(test_X)
    score_lr.append(sklearn.metrics.accuracy_score(test_y, prediction_lr))
    score_SVM.append(sklearn.metrics.accuracy_score(test_y, prediction_svm))

print("Average accuracy percentage for Log reg is "+str(sum(score_lr)/len(score_lr)))
print("Average accuracy percentage for SVM is "+str(sum(score_SVM)/len(score_SVM)))

Average accuracy percentage for Log reg is 0.688533653846154
Average accuracy percentage for SVM is 0.6905048076923077


## Let's work for the articles only

In [35]:
df_article, padded_article, attention_mask_article = create_padding_and_masking_v2(df_Train_byarticle, "article_text")

Token indices sequence length is longer than the specified maximum sequence length for this model (1833 > 512). Running this sequence through the model will result in indexing errors


In [37]:
padded_article.shape

(3352, 200)

In [39]:
last_hidden_states_articles = find_last_hidden_state(padded_article, attention_mask_article)
features_article = last_hidden_states_articles[0][:,0,:]
features_article = features_article.numpy()

158.5051052570343


In [41]:
def Averaging_long_text(L, features, feat_shape=768):
    '''This function will be useful for the long texts, over 200 tokens only. We will average them'''
    idx = []
    start = 0
    end = 0

    for i in L:
      end = start + i
      idx.append([start, end])
      start = end 
    feat_avg = np.zeros((len(idx), feat_shape))
    J = 0
    for index in idx:
        feat_avg[J, 0:feat_shape] = np.mean(features[index[0]:index[1],:], 0)
        J += 1
    return feat_avg

In [44]:
feat_avg = Averaging_long_text(list(df_article["count"]), features_article)

In [45]:
feat_avg.shape

(645, 768)

In [54]:
data = feat_avg

### Run model for article text only

In [55]:
score_lr = []
score_SVM = []
for train_index, test_index in kf.split(data):
    train_X = data[train_index]
    train_y = label[train_index]
    test_X = data[test_index]
    test_y = label[test_index]
    lr_clf = LogisticRegression()
    svm_clf = sklearn.svm.SVC()
    lr_clf.fit(train_X, train_y)
    svm_clf.fit(train_X, train_y)
    prediction_lr = lr_clf.predict(test_X)
    prediction_svm = svm_clf.predict(test_X)
    score_lr.append(sklearn.metrics.accuracy_score(test_y, prediction_lr))
    score_SVM.append(sklearn.metrics.accuracy_score(test_y, prediction_svm))

print("Average accuracy percentage for Log reg is "+str(sum(score_lr)/len(score_lr)))
print("Average accuracy percentage for SVM is "+str(sum(score_SVM)/len(score_SVM)))

Average accuracy percentage for Log reg is 0.8001923076923078
Average accuracy percentage for SVM is 0.8081009615384616


## Can we combine them? 

Now, we will explore the idea of combining the titles and texts

**Combine 1**: A simple feature concatention

In [56]:
# combining features_title and feat_avg
combined1 = np.concatenate((features_title, feat_avg), axis=1)

In [57]:
combined1.shape

(645, 1536)

In [58]:
data = combined1

In [59]:
score_lr = []
score_SVM = []
for train_index, test_index in kf.split(data):
    train_X = data[train_index]
    train_y = label[train_index]
    test_X = data[test_index]
    test_y = label[test_index]
    lr_clf = LogisticRegression()
    svm_clf = sklearn.svm.SVC()
    lr_clf.fit(train_X, train_y)
    svm_clf.fit(train_X, train_y)
    prediction_lr = lr_clf.predict(test_X)
    prediction_svm = svm_clf.predict(test_X)
    score_lr.append(sklearn.metrics.accuracy_score(test_y, prediction_lr))
    score_SVM.append(sklearn.metrics.accuracy_score(test_y, prediction_svm))

print("Average accuracy percentage for Log reg is "+str(sum(score_lr)/len(score_lr)))
print("Average accuracy percentage for SVM is "+str(sum(score_SVM)/len(score_SVM)))

Average accuracy percentage for Log reg is 0.8034375
Average accuracy percentage for SVM is 0.8080528846153847


**So, not much improvement**

**Combine2:** Let's average the output probability score and produce output based on that:

In [62]:
train_X.shape

(581, 1536)

In [63]:
train_X[train_index,0:768].shape

(581, 768)

In [74]:
score_lr = []
score_SVM = []
for train_index, test_index in kf.split(data):
    
    train_X_title = data[train_index, 0:768]
    train_y = label[train_index]
    test_X_title = data[test_index, 0:768]
    test_y = label[test_index]
    
    train_X_article = data[train_index, 768:1536]
    test_X_article = data[test_index, 768:1536]
    
    lr_clf_title = LogisticRegression()
    lr_clf_title.fit(train_X_title, train_y)
    prediction_prob_lr_title = lr_clf_title.predict_proba(test_X_title)

    lr_clf_article = LogisticRegression()
    lr_clf_article.fit(train_X_article, train_y)
    prediction_prob_lr_article = lr_clf_article.predict_proba(test_X_article)    
    
    sum_pred_prob = prediction_prob_lr_title + prediction_prob_lr_article
    prediction_lr = np.where(sum_pred_prob[:,1]>1, 1, 0)
    score_lr.append(sklearn.metrics.accuracy_score(test_y, prediction_lr))

print("Average accuracy percentage for Log reg is "+str(sum(score_lr)/len(score_lr)))
#print("Average accuracy percentage for SVM is "+str(sum(score_SVM)/len(score_SVM)))

Average accuracy percentage for Log reg is 0.7957451923076924


# No, the scores didnot improve