In [2]:
#-*- coding:utf-8 -*-
from warcio.archiveiterator import ArchiveIterator
from urllib.error import URLError, HTTPError #For python2.7 uses urllib2, and for python3 uses urllib.error 
from csv import writer,QUOTE_ALL
from time import strftime, sleep
from operator import itemgetter
from bs4 import BeautifulSoup
from newspaper import Article
from feedparser import parse
from re import search, sub
from requests import get


In [3]:
''' Get info of comment's feed through RSS'''

def get_news_feed(dict_rss):
    ''' Pick the feed URL that carries the highest score.

    dict_rss maps a feed URL to a numeric score (get_webpage_rss stores the
    length of the URL). Returns the URL with the largest score; on a tie the
    entry that comes first in the dict wins. Returns None for an empty dict.
    '''
    candidates = list(dict_rss.items())
    if not candidates:
        return None
    # max() keeps the first maximal item, like a stable descending sort.
    return max(candidates, key=itemgetter(1))[0]

def get_webpage_rss(html_content):
    ''' Find the page's news RSS feed in <head> and print how many entries it has.

    Every <link type="application/rss+xml"> in the first <head> element is a
    candidate when its href does not contain 'comment' and contains 'feed'
    after the first character. Each candidate href is printed, the best one is
    chosen by get_news_feed (longest href) and fetched with feedparser's
    parse(). The number of feed entries is printed; 0 is printed when the page
    advertises no usable feed. Nothing is returned.
    '''
    dict_rss = {}
    soup_original = BeautifulSoup(html_content, 'html.parser')
    head = soup_original.find('head')

    if head is not None:
        for link in head.find_all('link'):
            href = link.get('href')
            # A <link> without href cannot point at a feed; calling .find on
            # the missing value used to raise AttributeError.
            if not href or link.get('type') != 'application/rss+xml':
                continue
            if href.find('comment') < 0 and href.find('feed') > 0:
                print(href)
                dict_rss[href] = len(href)

    url_feed = get_news_feed(dict_rss)
    if url_feed is None:
        # No feed found: do not hand None to parse().
        print(0)
        return

    url_info = parse(url_feed)
    print(len(url_info.entries))

def verify_url_availability(url):
    ''' Verify if the URL is still online

    Issues a GET request (10 second timeout) and returns True when a
    response with a non-error status arrives. Returns False, after printing
    the reason, when the server answers with a 4xx/5xx status or when the
    request itself fails (DNS error, refused connection, timeout, ...).
    '''
    # requests raises its own exception hierarchy, not urllib's, so the
    # urllib HTTPError/URLError handlers used before could never fire.
    # Imported locally (aliased so the file-level urllib names stay untouched).
    from requests.exceptions import HTTPError as ResponseStatusError
    from requests.exceptions import RequestException

    try:
        request_result = get(url, timeout=10)
        # Turn 4xx/5xx answers into an exception; get() alone does not raise.
        request_result.raise_for_status()
    except ResponseStatusError as e:
        code = e.response.status_code if e.response is not None else 'unknown'
        print('HTTPError: {}'.format(code))
        return False
    except RequestException as e:
        print('URLError: {}'.format(e))
        return False

    return True

In [17]:

####### identifying ads ####


def check_attributes(tag_list):
    ''' Count the tags whose id or class looks like an advertisement slot.

    A tag counts (once) when its id, or its class names joined by spaces,
    contains one of the keywords below immediately followed by '-'. The match
    is a plain substring search, so e.g. "head-line" matches through "ad-".
    tag_list items only need a dict-like .get(); a class value is expected to
    be an iterable of strings.
    '''
    popular_attr = ['ad', 'ads', 'adv', 'advert', 'advertisement', 'banner']
    patterns = [r'\s*' + keyword + r'[-]' for keyword in popular_attr]

    def looks_like_ad(tag):
        tag_id = tag.get('id')
        tag_class = tag.get('class')
        for pattern in patterns:
            id_hit = search(pattern, r'' + tag_id) if tag_id is not None else None
            class_hit = (search(pattern, r' '.join(tag_class))
                         if tag_class is not None else None)
            if id_hit or class_hit:
                return True
        return False

    return sum(1 for tag in tag_list if looks_like_ad(tag))

def check_dimensions(tag_list):
    ''' Count the tags whose width/height pair is one of the listed ad sizes.

    The comparison is done on the attribute values as they are, i.e. both
    width and height must be present and equal to the strings listed below.
    '''
    # (width, height)
    ad_sizes = [('300', '250'), ('728', '90'), ('160', '600'), ('250', '250'),
                ('240', '400'), ('336', '280'), ('180', '150'), ('468', '60'),
                ('234', '60'), ('88', '31'), ('120', '90'), ('120', '60'),
                ('125', '125'), ('120', '240'), ('120', '600'), ('300', '600')]

    matches = 0
    for tag in tag_list:
        size = (tag.get('width'), tag.get('height'))
        # A missing attribute gives None, which never equals a listed size.
        if size in ad_sizes:
            matches += 1
    return matches

def check_descendants_tree(tag_list):
    ''' Count the descendants of the given tags whose name starts with 'script'.

    Every element of tag_list must expose .descendants; descendants that are
    None or whose .name is None are skipped.
    '''
    def is_script(node):
        return (node is not None
                and node.name is not None
                and node.name.startswith('script'))

    return sum(
        1
        for tag in tag_list
        for node in tag.descendants
        if is_script(node)
    )

def count_advertisements(soup):
    ''' Estimate the number of advertisements from the page structure.

    Adds up the hits of three heuristics over the <iframe>, <div> and <aside>
    tags found in soup:
      - ad-like id/class attributes on iframes, divs and asides;
      - iframes with a listed ad width/height pair;
      - script descendants of asides.
    (Adblock filter rules, "method 1", are still to be done.)
    '''
    iframes = soup.find_all('iframe')
    divs = soup.find_all('div')
    asides = soup.find_all('aside')

    # Method 2: popular class/id attribute values
    attribute_hits = sum(check_attributes(tags) for tags in (iframes, divs, asides))

    # Method 3: popular ad dimensions
    dimension_hits = check_dimensions(iframes)

    # Method 4: scripts nested inside <aside>
    script_hits = check_descendants_tree(asides)

    return attribute_hits + dimension_hits + script_hits

def count_ads(soup):
    ''' Count the advertisements found in soup.

    The result is the number of <script> tags whose src contains 'google'
    plus the structural estimate returned by count_advertisements(soup).
    '''
    # TODO: add other ad companies

    count = 0
    for script in soup.find_all('script'):
        src = script.get('src')
        if isinstance(src, (list, tuple)):
            src = ' '.join(src)
        # Look at the whole URL. The previous src[0] test only inspected the
        # first character, so it never matched and failed on an empty src.
        if src and 'google' in src:
            count += 1

    # Manual method to identify ads (checking the html file structure)
    return count + count_advertisements(soup)

In [18]:
def clean_author_name(name):
    ''' Clean the html tag content to get the author name

    Digits, the punctuation marks , . -, month names, the words
    posted/written/published and the short words am/pm/on/in/at are each
    replaced by a space, in that order; the result is stripped.
    '''
    noise_patterns = (
        r'\d+',
        r'[,.-]',
        r'\b[jJ]anuary\b|\b[fF]ebruary\b|\b[mM]arch\b|\b[aA]pril\b|\b[mM]ay\b|\b[jJ]une\b|\b[jJ]uly\b|\b[aA]ugust\b|\b[sS]eptember\b|\b[oO]ctober\b|\b[nN]ovember\b|\b[dD]ecember\b',
        r'[pP]osted|[wW]ritten|[pP]ublished',
        r'\bam\b|\bpm\b|\b[oO]n\b|\b[iI]n\b|\b[aA]t\b',
    )

    author_name = name
    for pattern in noise_patterns:
        author_name = sub(pattern, ' ', author_name)

    return author_name.strip()

def verify_usual_tags(tag_name, soup):
    ''' Verify common html tags which used to have author's name

    Looks at every <tag_name> element of soup. An element matches when one of
    its name/rel/itemprop/class/id attributes contains 'author', 'byline' or
    'dc.creator' and does not contain 'comment'. Returns the text of the last
    matching element with markup, the characters < > | : / and the words
    by/from removed (surrounding whitespace is kept), or None when nothing
    matches.
    '''
    attributes_list = ['name', 'rel', 'itemprop', 'class', 'id']
    values_list = ['author', 'byline', 'dc.creator']
    author_name = None

    for t in soup.find_all(tag_name):
        for attr in attributes_list:
            raw_value = t.get(attr)
            if raw_value is None:
                continue
            # Attribute values may arrive as a list (e.g. class names) or as a
            # plain string. Compare against the whole value: indexing [0]
            # yielded only the first class or the first character, and failed
            # on an empty value.
            if isinstance(raw_value, (list, tuple)):
                attr_value = ' '.join(raw_value)
            else:
                attr_value = str(raw_value)

            if 'comment' in attr_value:
                continue
            if any(vals in attr_value for vals in values_list):
                author_name = str(t)
                author_name = sub('<[^<]+?>', ' ', author_name)
                author_name = sub('[<>|:/]|[bB][yY]|[fF]rom','',author_name)
                # Further attributes of this tag would give the same text.
                break
    return author_name

def verify_usual_words(html_content):
    ''' Verify common words which used to have author's name (ex: By/From)

    Searches the raw html for the first "by" or "from" (first letter in
    either case for "from", both letters in either case for "by") that is
    followed by ':' or whitespace. Returns the matched word plus the next 100
    characters with newlines, markup, the characters < > | : / and the words
    by/from removed, or None when there is no match.
    '''
    author_name = None
    # Raw strings: '\:' inside a normal string literal is an invalid escape
    # sequence. The regular expression itself is unchanged.
    pattern = r'[bB][yY][\:\s]|[fF]rom[\:\s]'
    match = search(pattern, html_content)
    if match:
        pos_ini, pos_fim = match.start(), match.end()
        line = html_content[pos_ini:pos_fim+100].replace('\n','')
        # Wrap the snippet in <...> so a tag cut at either end is removed too.
        search_str = sub(r'<[^<]+?>', ' ', "<"+line+">")
        search_str = sub(r'[<>|:/]|[bB][yY]|[fF]rom','',search_str)
        author_name = search_str
    return author_name

def get_authors(html_content):
    ''' Tell whether an author name can be found in the news page.

    Returns 1 when one of the two heuristics below yields a non-empty name
    after clean_author_name, otherwise 0. The name itself is not returned.
    '''
    soup = BeautifulSoup(html_content, 'html.parser')

    # Method 1: Popular authors tags
    popular_authors_tags = ['span','a','p']
    for tag in popular_authors_tags:
        result = verify_usual_tags(tag, soup)
        if result and clean_author_name(result) != '':
            return 1

    # Method 2: Search for by/from in tags content
    result = verify_usual_words(html_content)
    if result and clean_author_name(result) != '':
        return 1

    # Nothing found. The flag used to come out as 1 here as well, because the
    # author placeholder started as 0 and was then compared with ''.
    return 0

def get_url_domain(url):
    ''' Return the host part of url (port and user info included, if any).

    The scheme ("http://", "https://", ...) is dropped when present, and the
    result ends right before the first '/', '?' or '#'. URLs without a path
    ("http://example.com") or without a scheme ("example.com/page") are
    handled as well.
    '''
    head, separator, tail = url.partition('://')
    # '://' only marks a scheme when it comes before any path/query/fragment.
    has_scheme = bool(separator) and not any(ch in head for ch in '/?#')
    remainder = tail if has_scheme else url

    cut_positions = [pos for pos in (remainder.find(ch) for ch in '/?#') if pos >= 0]
    if cut_positions:
        return remainder[:min(cut_positions)]
    return remainder

def set_news_attr(attr):
    ''' Return attr unchanged, or an empty string when attr is None. '''
    return '' if attr is None else attr

def count_tag_occurr(soup, list_tags):
    ''' Count how many elements of soup use any of the tag names in list_tags.

    Returns the total as a float (0.0 for an empty list). The value is a raw
    count; it is not divided by the total number of tags.
    '''
    total = 0.0
    for tag in list_tags:
        # One lookup per tag name instead of three.
        found = soup.find_all(tag)
        if found is not None:
            total += len(found)
    return total


def counting_html_tags(html_content):
    ''' Compute the web-markup features of a page.

    Only the first <body> element is analysed. Returns [] when the page has
    no <body>; otherwise a list with one count_tag_occurr total per tag
    category, in the order of tag_groups below, followed by the count_ads
    estimate (13 values).
    '''
    body_tags = BeautifulSoup(html_content, 'html.parser').find_all('body')
    if not body_tags:
        return []

    soup = BeautifulSoup(str(body_tags[0]), 'html.parser')

    tag_groups = (
        # basic
        ['title','h1','h2','h3','h4','h5','h6','p','br','hr'],
        # formatting
        ['acronym','abbr', 'address','b','bdi','bdo','big','blockquote','center',
         'cite','code','del','dfn','em','font','i','ins','kbd','mark','meter','pre',
         'progress','q','rp','rt','ruby','s','samp','small','strike','strong',
         'sub','sup','template','time','tt','u','var','wbr'],
        # forms and inputs
        ['form','input','textarea','button','select','optgroup','option',
         'label','fieldset','legend','datalist','output'],
        # frames
        ['frame','frameset','noframes','iframe'],
        # images
        ['img','map','area','canvas','figcaption','figure','picture','svg'],
        # audio / video
        ['audio','source','track','video'],
        # links
        ['a','link','nav'],
        # lists
        ['ul','ol','li','dir','dl','dt','dd','menu','menuitem'],
        # tables
        ['table','caption','th','tr','td','thead','tbody','tfoot','col','colgroup'],
        # styles and semantics
        ['style','div','span','header','footer','main','section','article','aside',
         'details','dialog','summary','data'],
        # meta info
        ['head','meta','base','basefont'],
        # programming
        ['script','noscript','applet','embed','object','param'],
    )

    features = [count_tag_occurr(soup, group) for group in tag_groups]
    features.append(count_ads(soup))
    return features

def get_url_info(html, url):
    ''' Get news attributes from an already downloaded page.

    html is the page source and url the address it came from. Returns
    [url, headline, content, publish_date, author] followed by the values of
    counting_html_tags, where publish_date is formatted "%Y-%m-%d %H:%M:%S"
    ('' when unknown) and author is the 0/1 flag from get_authors.
    Returns [] when the page has no <body>, and None (after printing the
    error) when the page cannot be processed.
    '''
    try:
        article = Article('')
        article.set_html(u""+html)
        article.parse()

        headline = set_news_attr(article.title)
        author = get_authors(article.html)
        content = set_news_attr(article.text)
        publish_date = article.publish_date
        publish_date = publish_date.strftime("%Y-%m-%d %H:%M:%S") if publish_date else ''

        html_count_info = counting_html_tags(article.html)
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C still stops a long
        # run, and the real cause is reported (it is not always an empty page).
        print('Could not extract features from {}: {!r}'.format(url, err))
        return None

    if not html_count_info:
        return []
    return [url, headline, content, publish_date, author] + html_count_info

def store_info_urls(urls_info_list, name_csv_file):
    ''' Write one CSV row per item of urls_info_list to name_csv_file.

    The file is overwritten. Every field is quoted and fields are separated
    by commas.
    '''
    # newline='' lets the csv writer control line endings (no blank rows on
    # Windows, embedded newlines preserved). utf-8 keeps non-ASCII headlines
    # and article text writable whatever the platform default encoding is.
    # The with-block closes the file even when a row fails to serialise.
    with open(name_csv_file, "w", newline='', encoding='utf-8') as arq_out:
        writer_out = writer(arq_out, delimiter=',', quoting=QUOTE_ALL)
        for url_info in urls_info_list:
            writer_out.writerow(tuple(url_info))
 
    
def get_info_from_url(url):
    ''' Download a page and get its news attributes.

    Prints the url, downloads and parses it with newspaper's Article, and
    returns [url, headline, content, publish_date, author] followed by the
    values of counting_html_tags, where publish_date is formatted
    "%Y-%m-%d %H:%M:%S" ('' when unknown) and author is the 0/1 flag from
    get_authors. Returns [] when the page has no <body>, and None (after
    printing the error) when the page cannot be downloaded or processed.
    '''
    try:
        print(url)
        article = Article(url)
        article.download()
        article.parse()
        print ('article--')

        headline = set_news_attr(article.title)
        author = get_authors(article.html)
        content = set_news_attr(article.text)
        publish_date = article.publish_date
        publish_date = publish_date.strftime("%Y-%m-%d %H:%M:%S") if publish_date else ''

        html_count_info = counting_html_tags(article.html)
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C still stops the run,
        # and the real cause is reported (e.g. a failed download).
        print('Could not extract features from {}: {!r}'.format(url, err))
        return None

    if not html_count_info:
        return []
    return [url, headline, content, publish_date, author] + html_count_info


### BASIC AND WEB-MARKUP FEATURES

#### Basic features:
- url = t[0]
- headline = t[1] 
- content = t[2] 
- publish_date = t[3] 
- author = t[4] (1 if an author name was detected on the page, 0 otherwise)

#### Web-markup features:
- freq_basic = t[5] 
- freq_formatting = t[6] 
- freq_forms_inputs = t[7] 
- freq_frames = t[8] 
- freq_images = t[9] 
- freq_audio_video = t[10] 
- freq_links = t[11] 
- freq_lists = t[12] 
- freq_tables = t[13] 
- freq_styles_semantics = t[14] 
- freq_meta_info = t[15] 
- freq_programming_info = t[16] 
- freq_ads = t[17] 

### Extracting features (basics and web-markup features) from a WARC file
 <b>Function: </b> get_WebFeatures_from_WARCfile(file_name, output_file_name) <br />
 <b>Parameters:</b> <br />
 <b>file_name:</b> WARC file, <br />
 <b>output_file_name:</b> CSV file where the features will be saved


In [6]:
def get_WebFeatures_from_WARCfile(file_name, output_file_name):
    ''' Extract the features of every page stored in a WARC file.

    file_name is the WARC file and output_file_name the CSV file to write.
    Each record with a positive length is decoded as UTF-8 (undecodable bytes
    are dropped) and passed to get_url_info together with its
    WARC-Target-URI; records that yield no features are skipped.
    Returns the list of feature rows that was written to the CSV file.
    '''
    urls_info_list = []

    with open(file_name, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # Guard the conversion: a record without a usable length would
            # otherwise raise here and abort the whole run.
            try:
                record_length = int(record.length)
            except (TypeError, ValueError):
                record_length = 0
            print(record_length)
            if record_length <= 0:
                continue

            html_content = record.content_stream().read().decode('utf-8','ignore')
            url_target = record.rec_headers['WARC-Target-URI']
            print(url_target)

            try:
                info_news = get_url_info(html_content,url_target)
            except Exception:
                # Exception, not a bare except, so Ctrl-C still stops the run.
                print('Oppa', len(urls_info_list) + 1)
                continue

            if info_news:
                urls_info_list.append(info_news)
                # Progress line, printed once per 500 stored pages.
                if len(urls_info_list) % 500 == 0:
                    print(len(urls_info_list))

    print("Writing...",len(urls_info_list))
    store_info_urls(urls_info_list, output_file_name)
    return urls_info_list
    
    #store_info_urls(urls_info_list, output_file_name)

In [None]:
# Example run on one WARC file. The paths below are machine-specific
# placeholders: point them at your own data directory before running.
path_warc_data = '/Users/.../Fakenews/paper_fn/Experiments/data/discovery_paper/'
output_file_name = '/Users/.../Fakenews/paper_fn/Experiments/data/discovery_paper/discoveryCrawlData_Test2019.csv'
# The extracted features are written to output_file_name.
urls_crawled = get_WebFeatures_from_WARCfile(path_warc_data+'crawl_data-20190223091156448-00240.warc.gz', output_file_name)

### Extracting features (basics and web-markup features) from a URL
 <b>Function: </b> get_WebFeatures_from_url(url_target, output_filename) <br />
 <b>Parameters:</b> <br />
     <b>url_target:</b> url, <br />
     <b>output_filename:</b> CSV file where the features will be saved

In [10]:
# Download the HTML of a single URL and store its features in a CSV file
def get_WebFeatures_from_url(url_target, output_filename):
    ''' Extract the features of one online page and save them as CSV.

    url_target is downloaded and analysed by get_info_from_url. The resulting
    row is written to output_filename; the file is written without rows when
    no features could be extracted. Nothing is returned.
    '''
    urls_info_list = []

    try:
        info_news = get_info_from_url(url_target)
    except Exception as err:
        # Exception, not a bare except, so Ctrl-C still stops the run.
        print('Oppa', err)
        info_news = None

    if info_news:
        urls_info_list.append(info_news)

    print("Writing...",len(urls_info_list))
    store_info_urls(urls_info_list, output_filename)

In [11]:
# Example run on a single article. The output path is a machine-specific
# placeholder: change it before running.
url = 'https://www.celebdirtylaundry.com/2017/brad-pitt-texts-jennifer-aniston-nonstop-seeks-intense-emotional-support-after-angelina-jolie-divorce/'
output_file_name = '/Users/.../Fakenews/paper_fn/.../test_fake_celebrity.csv'
get_WebFeatures_from_url(url, output_file_name)


https://www.celebdirtylaundry.com/2017/brad-pitt-texts-jennifer-aniston-nonstop-seeks-intense-emotional-support-after-angelina-jolie-divorce/
article--
Writing... 1


### Extracting features (basics and web-markup features) from HTML files. A JSON file MUST be included inside the directory that contains the HTML files.

 <b>Function: </b> get_WebFeatures_from_directory_json(dir_data, output_filename) <br />
 <b>Parameters:</b> <br />
 <b>dir_data</b>: Path of the directory that contains the HTML files as well as the JSON file. The JSON file holds a "data" array of objects, each with an "html" attribute (the HTML file name, relative to the directory that holds the JSON file) and a "url" attribute. JSON file example:  
 {"data": [ {"html": "1.html", "url": "www.notice.com/note1"}, ... , {"html": "2.html", "url": "www.notice2.com/note2"}] }
 <br />
 <b>output_filename:</b> CSV file where the features will be saved

In [7]:
from os import chdir, listdir, environ, makedirs, rename, chmod, walk, remove, path
import codecs
import json


# Build the feature rows (url, headline, content, ..., tag features) for every page listed in the JSON files of a directory (each entry has "url" and "html" attributes)
def get_WebFeatures_from_directory_json(dir_data, output_filename):
    ''' Extract the features of the HTML files listed in JSON index files.

    dir_data is walked recursively and every file name is printed. Each file
    ending in '.json' must contain {"data": [{"url": ..., "html": ...}, ...]},
    where "html" is a file name relative to the directory of the JSON file.
    Every listed page is passed to get_url_info; pages that yield no features
    are skipped. All rows are written to output_filename. Nothing is returned.
    '''
    urls_info_list, id_url = [], 1

    for (dirpath, dirnames, filenames) in walk(dir_data):
        for file_name in filenames:
            print (file_name)
            if not file_name.endswith('.json'):
                continue

            # A separate name for the stream: reusing the loop variable for
            # the open file made the code hard to follow.
            with open(path.join(dirpath, file_name), encoding='utf-8') as json_stream:
                data = json.load(json_stream)

            for p in data['data']:
                url, html_path = p['url'], p['html']
                # Same decoding as the WARC reader (UTF-8, bad bytes dropped),
                # so stray bytes or a non-UTF-8 platform default do not abort
                # the run.
                with open(path.join(dirpath, html_path),
                          encoding='utf-8', errors='ignore') as fin:
                    html_content = fin.read()

                info_news = get_url_info(html_content,url)
                if info_news:
                    urls_info_list.append(info_news)
                    id_url += 1
                    print (str(id_url) + ", ")

    store_info_urls(urls_info_list, output_filename)
    print ("done")
       

In [None]:
# Example runs on the celebrity data set (fake and real news). The paths are
# machine-specific placeholders: change them before running.
# Each CSV is written inside the directory that is being scanned.
dir_data = '/Users/.../Fakenews/paper_fn/OfficialVersionExperiment/data/celebrity_html_files_v2019/fake_news/'
output_filename = dir_data + 'Celebrity_fake_updated2019.csv'
get_WebFeatures_from_directory_json(dir_data, output_filename)

dir_data = '/Users/.../Fakenews/paper_fn/OfficialVersionExperiment/data/celebrity_html_files_v2019/real_news/'
output_filename = dir_data + 'Celebrity_real_updated2019.csv'
get_WebFeatures_from_directory_json(dir_data, output_filename)